SmartCane/r_app/parameters_project.R
2026-01-29 21:10:24 +01:00

844 lines
32 KiB
R
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# filepath: c:\Users\timon\Resilience BV\4020 SCane ESA DEMO - Documenten\General\4020 SCDEMO Team\4020 TechnicalData\WP3\smartcane\r_app\parameters_project.R
#
# PARAMETERS_PROJECT.R
# ====================
# This script defines project parameters, directory structures, and loads field boundaries.
# It establishes all the necessary paths and creates required directories for the SmartCane project.
# 1. Load required libraries
# -------------------------
suppressPackageStartupMessages({
library(here)
library(readxl)
library(sf)
library(dplyr)
library(tidyr)
library(jsonlite) # For reading tiling_config.json
})
# 2. Client type mapping (for conditional script execution)
# ---------------------------------------------------------
# Lookup table: project name -> client type, used to decide which pipeline
# scripts run for a project.
# Client types:
# - "cane_supply": Runs Scripts 20,21,30,31,80,91 (full pipeline with Excel output)
# - "agronomic_support": Runs Scripts 80,90 only (KPI calculation + Word report)
# - "extension_service": (Future - not yet implemented)
#
# NOTE: destined to migrate to Laravel environment variables / database.
# Until then, add an entry here whenever a new project is onboarded.
CLIENT_TYPE_MAP <- list(
  angata   = "cane_supply",
  aura     = "agronomic_support",
  chemba   = "cane_supply",
  xinavane = "cane_supply",
  esa      = "cane_supply"
)
#' Resolve the client type for a project.
#'
#' Unknown projects fall back to "cane_supply" with a warning so the
#' pipeline can still run for projects not yet registered in
#' CLIENT_TYPE_MAP.
#' @param project_name Character, project folder name (e.g. "aura").
#' @return Character scalar: the client type.
get_client_type <- function(project_name) {
  mapped <- CLIENT_TYPE_MAP[[project_name]]
  if (!is.null(mapped)) {
    return(mapped)
  }
  warning(sprintf("Project '%s' not in CLIENT_TYPE_MAP - defaulting to 'cane_supply'", project_name))
  "cane_supply"
}
# 2b. Client-specific KPI configurations
# ----------------------------------------
# Defines which KPIs and outputs are required for each client type.
# Script 80 reads this to conditionally calculate only relevant metrics.
#
# Structure of each entry:
# - client_type: Redundant copy of the list key, kept for convenience
# - description: Human-readable purpose of this configuration
# - kpi_calculations: Vector of KPI types to calculate for this client
# - outputs: Vector of output formats to generate (determines RDS/Excel naming)
# - requires_harvest_data: Boolean - whether Script 31 harvest predictions are needed
# - script_90_compatible: Boolean - whether output should match Script 90 expectations
# - script_91_compatible: Boolean - whether output should match Script 91 expectations
#
CLIENT_TYPE_CONFIGS <- list(
# Aura (agronomic_support): Farm-level KPI summaries for weekly reports to agronomists
"agronomic_support" = list(
client_type = "agronomic_support",
description = "Farm-level KPI summaries for agronomic decision support",
kpi_calculations = c(
"field_uniformity",
"area_change",
"tch_forecasted",
"growth_decline",
"weed_presence",
"gap_filling"
),
outputs = c(
"kpi_summary_tables", # Summary statistics for Script 90 report front page
"field_details" # Detailed field table for Script 90 report end section
),
requires_harvest_data = FALSE, # Script 31 predictions not used
script_90_compatible = TRUE, # Output format matches Script 90 expectations
script_91_compatible = FALSE
),
# Cane Supply (cane_supply): Per-field analysis with harvest timing prediction
"cane_supply" = list(
client_type = "cane_supply",
description = "Per-field analysis with harvest prediction and phase assignment",
kpi_calculations = c(
"per_field_analysis", # Use 80_weekly_stats_utils.R for field-level statistics
"phase_assignment", # Assign growth phases (Germination, Tillering, Grand Growth, Maturation)
"harvest_prediction", # Include Script 31 harvest age predictions if available
"status_triggers" # Calculate field status (Normal, Monitor, Alert, Urgent)
),
outputs = c(
"field_analysis_excel", # Excel file with per-field metrics
"field_analysis_summary" # Summary RDS for Script 91 report
),
requires_harvest_data = TRUE, # harvest.xlsx is required for phase assignment
script_90_compatible = FALSE,
script_91_compatible = TRUE
)
)
#' Get KPI configuration for a specific client type
#'
#' Unknown client types fall back to the "cane_supply" configuration
#' (with a warning) so downstream scripts always receive a valid config.
#' @param client_type Character string of client type (e.g., "agronomic_support", "cane_supply")
#' @return List containing configuration for that client type
get_client_kpi_config <- function(client_type) {
  cfg <- CLIENT_TYPE_CONFIGS[[client_type]]
  if (!is.null(cfg)) {
    return(cfg)
  }
  warning(sprintf("Client type '%s' not in CLIENT_TYPE_CONFIGS - defaulting to 'cane_supply'", client_type))
  CLIENT_TYPE_CONFIGS[["cane_supply"]]
}
# 3. Smart detection for tile-based vs single-file mosaic approach
# ----------------------------------------------------------------
# Detect whether a project uses tile-based mosaics.
#
# Detection priority:
#   1. tiling_config.json written by script 10 (authoritative, since
#      script 10 explicitly records its tiling decision there)
#   2. Grid-size subdirectories (e.g. "5x5") inside merged_final_tif/
#   3. Tile-named files (*_NN.tif, NN = two digits) in merged_final_tif/
#
# @param merged_final_tif_dir Path to the merged_final_tif directory.
# @param daily_tiles_split_dir Optional path searched recursively for
#   tiling_config.json.
# @return List with: has_tiles (logical), detected_tiles (character,
#   sample of tile filenames), total_files (count of .tif files found),
#   source (which detection strategy fired), and - when applicable -
#   grid_size / grid_path.
detect_tile_structure_from_merged_final <- function(merged_final_tif_dir, daily_tiles_split_dir = NULL) {
  # PRIORITY 1: metadata file from script 10 (most reliable source)
  if (!is.null(daily_tiles_split_dir) && dir.exists(daily_tiles_split_dir)) {
    config_files <- list.files(daily_tiles_split_dir,
                               pattern = "tiling_config\\.json$",
                               recursive = TRUE,
                               full.names = TRUE)
    if (length(config_files) > 0) {
      # Several grid-size subfolders may each carry a config - use the
      # most recently modified one
      config_file <- config_files[which.max(file.info(config_files)$mtime)]
      tryCatch({
        config_json <- jsonlite::read_json(config_file)
        # NOTE(review): %||% requires base R >= 4.4 or rlang attached
        # elsewhere - confirm the deployment R version
        return(list(
          has_tiles = config_json$has_tiles %||% TRUE,
          detected_tiles = character(),
          total_files = 0,
          source = "tiling_config.json",
          grid_size = config_json$grid_size %||% "unknown"
        ))
      }, error = function(e) {
        warning("Error reading tiling_config.json: ", e$message)
        # Fall through to file-based detection
      })
    }
  }
  # PRIORITY 2: File-based detection (fallback if metadata not found)
  if (!dir.exists(merged_final_tif_dir)) {
    return(list(
      has_tiles = FALSE,
      detected_tiles = character(),
      total_files = 0,
      source = "directory_not_found"
    ))
  }
  # Grid-size subdirectories (5x5, 10x10, ...) indicate tiles organized as:
  # merged_final_tif/{grid_size}/{DATE}/{DATE}_XX.tif
  grid_subfolders <- list.dirs(merged_final_tif_dir, full.names = FALSE, recursive = FALSE)
  grid_patterns <- grep("^\\d+x\\d+$", grid_subfolders, value = TRUE)
  if (length(grid_patterns) > 0) {
    grid_size <- grid_patterns[1]
    grid_dir <- file.path(merged_final_tif_dir, grid_size)
    # FIX: the original used `[1:3]`, which pads with NA when fewer than
    # 3 tiles exist, and reported total_files as the sample size (<= 3)
    # instead of the actual tile count. head() + a full count fix both.
    all_tiles <- list.files(grid_dir, pattern = "\\.tif$", recursive = TRUE)
    return(list(
      has_tiles = TRUE,
      detected_tiles = head(all_tiles, 3),
      total_files = length(all_tiles),
      source = "grid_subdirectory_detection",
      grid_size = grid_size,
      grid_path = grid_dir
    ))
  }
  # Fall back to checking for tile-named files directly in merged_final_tif
  tif_files <- list.files(merged_final_tif_dir, pattern = "\\.tif$", full.names = FALSE)
  if (length(tif_files) == 0) {
    return(list(
      has_tiles = FALSE,
      detected_tiles = character(),
      total_files = 0,
      source = "no_files_found"
    ))
  }
  # Tile naming pattern: *_XX.tif where XX is exactly 2 digits
  # Examples: 2025-11-27_00.tif, 2025-11-27_01.tif, week_50_2024_00.tif
  tile_pattern <- "_(\\d{2})\\.tif$"
  tile_files <- tif_files[grepl(tile_pattern, tif_files)]
  list(
    has_tiles = length(tile_files) > 0,
    detected_tiles = tile_files,
    total_files = length(tif_files),
    source = "file_pattern_detection"
  )
}
# 4. Define project directory structure
# -----------------------------------
# Build - and create on disk - every directory the pipeline writes to for
# a given project, detect whether the project uses tile-based mosaics,
# and return a flat list of named paths consumed by downstream scripts.
#
# @param project_dir Project folder name under laravel_app/storage/app/.
# @param data_source TIF source folder: "merged_tif_8b" (default, 8-band
#   with UDM cloud mask) or "merged_tif" (4-band legacy data).
# @return Named list of resolved paths plus tile-detection metadata
#   (use_tile_mosaic flag and tile_detection_info list).
setup_project_directories <- function(project_dir, data_source = "merged_tif_8b") {
# Base storage root for this project inside the Laravel app
laravel_storage_dir <- here("laravel_app", "storage", "app", project_dir)
# Determine which TIF source folder to use based on data_source parameter
# Default is merged_tif_8b for newer data with cloud masking (8-band + UDM)
# Alternative: merged_tif for 4-band legacy data
merged_tif_folder <- here(laravel_storage_dir, data_source)
# Detect tile mode based on metadata from script 10 or file patterns
merged_final_dir <- here(laravel_storage_dir, "merged_final_tif")
daily_tiles_split_dir <- here(laravel_storage_dir, "daily_tiles_split")
tile_detection <- detect_tile_structure_from_merged_final(
merged_final_tif_dir = merged_final_dir,
daily_tiles_split_dir = daily_tiles_split_dir
)
use_tile_mosaic <- tile_detection$has_tiles
# Main subdirectories (nested list; flattened below when creating them)
dirs <- list(
reports = here(laravel_storage_dir, "reports"),
logs = here(laravel_storage_dir, "logs"),
data = here(laravel_storage_dir, "Data"),
tif = list(
merged = merged_tif_folder, # Use data_source parameter to select folder
final = merged_final_dir
),
# New per-field directory structure (Script 10 output)
field_tiles = here(laravel_storage_dir, "field_tiles"),
field_tiles_ci = here(laravel_storage_dir, "field_tiles_CI"),
weekly_mosaic = here(laravel_storage_dir, "weekly_mosaic"),
weekly_tile_max = here(laravel_storage_dir, "weekly_tile_max"),
extracted_ci = list(
base = here(laravel_storage_dir, "Data", "extracted_ci"),
daily = here(laravel_storage_dir, "Data", "extracted_ci", "daily_vals"),
cumulative = here(laravel_storage_dir, "Data", "extracted_ci", "cumulative_vals"),
# New per-field daily RDS structure (Script 20 output)
# NOTE(review): same path as `daily` above - intentional alias? confirm
daily_per_field = here(laravel_storage_dir, "Data", "extracted_ci", "daily_vals")
),
vrt = here(laravel_storage_dir, "Data", "vrt"),
harvest = here(laravel_storage_dir, "Data", "HarvestData")
)
# Create all directories (side effect; idempotent via showWarnings = FALSE)
for (dir_path in unlist(dirs)) {
dir.create(dir_path, showWarnings = FALSE, recursive = TRUE)
}
# Return a flat directory structure for use in other functions
return(list(
laravel_storage_dir = laravel_storage_dir,
reports_dir = dirs$reports,
log_dir = dirs$logs,
data_dir = dirs$data,
planet_tif_folder = dirs$tif$merged,
merged_final = dirs$tif$final,
daily_CI_vals_dir = dirs$extracted_ci$daily,
cumulative_CI_vals_dir = dirs$extracted_ci$cumulative,
# New per-field directory paths (Script 10 & 20 outputs)
field_tiles_dir = dirs$field_tiles,
field_tiles_ci_dir = dirs$field_tiles_ci,
daily_vals_per_field_dir = dirs$extracted_ci$daily_per_field,
# Field boundaries path for all scripts
field_boundaries_path = here(laravel_storage_dir, "Data", "pivot.geojson"),
weekly_CI_mosaic = if (use_tile_mosaic) dirs$weekly_tile_max else dirs$weekly_mosaic, # SMART: Route based on tile detection
daily_vrt = dirs$vrt, # Point to Data/vrt folder where R creates VRT files from CI extraction
weekly_tile_max = dirs$weekly_tile_max, # Per-tile weekly MAX mosaics (Script 04 output)
use_tile_mosaic = use_tile_mosaic, # Flag indicating if tiles are used for this project
tile_detection_info = list(
has_tiles = tile_detection$has_tiles,
detected_source = tile_detection$source,
detected_count = tile_detection$total_files,
grid_size = tile_detection$grid_size %||% "unknown",
sample_tiles = head(tile_detection$detected_tiles, 3)
),
harvest_dir = dirs$harvest,
extracted_CI_dir = dirs$extracted_ci$base
))
}
# NOTE: no setwd() here - all paths are resolved via here()/file.path().
# 5. Load field boundaries
# ----------------------
# Read field boundary polygons (GeoJSON) from data_dir, normalize columns
# and CRS, and return both an sf object and (when conversion succeeds) a
# terra SpatVector.
#
# @param data_dir Directory containing pivot.geojson / pivot_2.geojson.
# @return List with field_boundaries_sf (sf) and field_boundaries
#   (terra SpatVector, or the sf object if terra conversion fails).
# Side effects: reads `project_dir` / `ci_extraction_script` from the
# calling/global environment to pick the boundary file.
load_field_boundaries <- function(data_dir) {
# Choose field boundaries file based on project and script type
# ESA project uses pivot_2.geojson ONLY for scripts 02-03 (CI extraction & growth model)
# All other scripts (including 04-mosaic, 09-KPIs, 10-reports) use pivot.geojson
# NOTE(review): exists() searches enclosing environments, so this depends
# on globals set by the sourcing script - fragile, consider parameters
use_pivot_2 <- exists("project_dir") && project_dir == "esa" &&
exists("ci_extraction_script") # ci_extraction_script flag set by scripts 02-03
if (use_pivot_2) {
field_boundaries_path <- here(data_dir, "pivot_2.geojson")
} else {
field_boundaries_path <- here(data_dir, "pivot.geojson")
}
if (!file.exists(field_boundaries_path)) {
stop(paste("Field boundaries file not found at path:", field_boundaries_path))
}
tryCatch({
# Read GeoJSON with explicit CRS handling
field_boundaries_sf <- st_read(field_boundaries_path, quiet = TRUE)
# Remove OBJECTID column immediately if it exists (ArcGIS artifact)
if ("OBJECTID" %in% names(field_boundaries_sf)) {
field_boundaries_sf <- field_boundaries_sf %>% select(-OBJECTID)
}
# Validate and fix CRS if needed - DO NOT call is.na on CRS objects as it can cause errors
# Just ensure CRS is set; terra will handle projection if needed
tryCatch({
# Simply assign WGS84 if not already set (safe approach)
# NOTE(review): if the CRS is valid but has no EPSG code, $epsg may not
# be a length-1 value, sending control to the error handler below which
# would overwrite a valid CRS with 4326 - confirm against real inputs
if (is.na(sf::st_crs(field_boundaries_sf)$epsg)) {
st_crs(field_boundaries_sf) <- 4326
warning("CRS was missing, assigned WGS84 (EPSG:4326)")
}
}, error = function(e) {
# If any CRS operation fails, just try to set it
# (<<- reaches field_boundaries_sf in the enclosing function scope)
tryCatch({
st_crs(field_boundaries_sf) <<- 4326
}, error = function(e2) {
# Silently continue - terra might handle it
warning(paste("Could not set CRS:", e2$message))
})
})
# Handle column names - accommodate optional sub_area column
# IMPORTANT: Must preserve geometry column properly when renaming sf object
if ("sub_area" %in% names(field_boundaries_sf)) {
# Reorder columns but keep geometry last
field_boundaries_sf <- field_boundaries_sf %>%
dplyr::select(field, sub_field, sub_area) %>%
sf::st_set_geometry("geometry")
} else {
# Reorder columns but keep geometry last
field_boundaries_sf <- field_boundaries_sf %>%
dplyr::select(field, sub_field) %>%
sf::st_set_geometry("geometry")
}
# Convert to terra vector if possible, otherwise use sf
# Some GeoJSON files (like aura with complex MultiPolygons) may have GDAL/terra compatibility issues
field_boundaries <- tryCatch({
field_boundaries_terra <- terra::vect(field_boundaries_sf)
# Ensure terra object has valid CRS with safer checks
crs_value <- tryCatch(terra::crs(field_boundaries_terra), error = function(e) NULL)
crs_str <- if (!is.null(crs_value)) as.character(crs_value) else ""
if (is.null(crs_value) || length(crs_value) == 0 || nchar(crs_str) == 0) {
terra::crs(field_boundaries_terra) <- "EPSG:4326"
warning("Terra object CRS was empty, assigned WGS84 (EPSG:4326)")
}
field_boundaries_terra
}, error = function(e) {
warning(paste("Terra conversion failed, using sf object instead:", e$message))
# Return sf object as fallback - functions will handle both types
field_boundaries_sf
})
return(list(
field_boundaries_sf = field_boundaries_sf,
field_boundaries = field_boundaries
))
}, error = function(e) {
# Surface full debug context before re-raising
cat("[DEBUG] Error in load_field_boundaries:\n")
cat(" Message:", e$message, "\n")
cat(" Call:", deparse(e$call), "\n")
stop(paste("Error loading field boundaries:", e$message))
})
}
# 6. Load harvesting data
# ---------------------
# Read harvest.xlsx from data_dir and normalize its columns: dates are
# parsed from several possible formats, open/future seasons are capped at
# today, and crop age is recomputed in whole weeks.
#
# @param data_dir Directory containing harvest.xlsx.
# @return Normalized data.frame, or NULL (with a warning) when the file
#   is missing or unreadable.
load_harvesting_data <- function(data_dir) {
  harvest_file <- here(data_dir, "harvest.xlsx")
  if (!file.exists(harvest_file)) {
    warning(paste("Harvest data file not found at path:", harvest_file))
    return(NULL)
  }
  # Parse one date cell that may be a Date, POSIXct, Excel serial number,
  # or a character string in one of several common formats.
  # FIX: is.null()/length checks now run BEFORE is.na() - the original
  # evaluated is.na(NULL) first, which yields logical(0) and makes `||`
  # error on zero-length input.
  parse_flexible_date <- function(x) {
    if (is.null(x) || length(x) == 0) return(NA_real_)
    if (is.na(x)) return(NA_real_)
    if (inherits(x, "Date")) return(x)
    if (inherits(x, "POSIXct")) return(as.Date(x))
    # Excel stores dates as serial days since 1899-12-30 (Windows epoch)
    if (is.numeric(x)) {
      return(as.Date(x, origin = "1899-12-30"))
    }
    x_char <- as.character(x)
    # Try common formats: ISO, day-first, month-first, ISO datetime
    formats <- c("%Y-%m-%d", "%d/%m/%Y", "%m/%d/%Y", "%Y-%m-%d %H:%M:%S")
    for (fmt in formats) {
      parsed <- suppressWarnings(as.Date(x_char, format = fmt))
      if (!is.na(parsed)) return(parsed)
    }
    # FIX: consistent NA_real_ (original mixed plain NA and NA_real_)
    NA_real_
  }
  tryCatch({
    harvesting_data <- read_excel(harvest_file) %>%
      dplyr::select(
        c(
          "field",
          "sub_field",
          "year",
          "season_start",
          "season_end",
          "age",
          "sub_area",
          "tonnage_ha"
        )
      ) %>%
      mutate(
        field = as.character(field),
        sub_field = as.character(sub_field),
        year = as.numeric(year),
        # sapply() strips the Date class to numeric day counts, so the
        # columns are re-cast via the standard 1970-01-01 epoch below
        season_start = sapply(season_start, parse_flexible_date),
        season_end = sapply(season_end, parse_flexible_date),
        season_start = as.Date(season_start, origin = "1970-01-01"),
        season_end = as.Date(season_end, origin = "1970-01-01"),
        age = as.numeric(age),
        sub_area = as.character(sub_area),
        tonnage_ha = as.numeric(tonnage_ha)
      ) %>%
      mutate(
        # Cap missing/future season ends at today, then recompute crop
        # age as whole weeks between season_start and season_end
        season_end = case_when(
          season_end > Sys.Date() ~ Sys.Date(),
          is.na(season_end) ~ Sys.Date(),
          TRUE ~ season_end
        ),
        age = round(as.numeric(season_end - season_start) / 7, 0)
      )
    return(harvesting_data)
  }, error = function(e) {
    warning(paste("Error loading harvesting data:", e$message))
    return(NULL)
  })
}
# 7. Define logging functions globally first
# ---------------------------------------
# Create a simple default log function in case setup_logging hasn't been called yet
# Minimal console logger used until setup_logging() installs the
# file-backed replacement. Output format: "[LEVEL] timestamp - message".
log_message <- function(message, level = "INFO") {
  stamp <- format(Sys.time(), "%Y-%m-%d %H:%M:%S")
  cat(sprintf("[%s] %s - %s", level, stamp, message), "\n")
}
# Log a structural summary (str) of the first elements of an object.
# NOTE: the parameter name `list` shadows base::list but is kept so
# existing named-argument callers are unaffected.
log_head <- function(list, level = "INFO") {
  summary_lines <- capture.output(str(head(list)))
  log_message(paste(summary_lines, collapse = "\n"), level)
}
# 8. Set up full logging system with file output
# -------------------------------------------
# Build file-backed log_message()/log_head() closures that append to a
# daily log file under log_dir, install them into the global environment
# (replacing the console-only defaults), and return them with the path.
# ERROR and WARNING entries are additionally echoed to the console.
setup_logging <- function(log_dir) {
  log_file <- here(log_dir, paste0(format(Sys.Date(), "%Y%m%d"), ".log"))
  log_message <- function(message, level = "INFO") {
    stamp <- format(Sys.time(), "%Y-%m-%d %H:%M:%S")
    entry <- paste0("[", level, "] ", stamp, " - ", message)
    cat(entry, "\n", file = log_file, append = TRUE)
    # Echo serious entries to the console for interactive debugging
    if (level %in% c("ERROR", "WARNING")) {
      cat(entry, "\n")
    }
  }
  log_head <- function(list, level = "INFO") {
    log_message(paste(capture.output(str(head(list))), collapse = "\n"), level)
  }
  # Replace the console-only defaults defined at the top level
  assign("log_message", log_message, envir = .GlobalEnv)
  assign("log_head", log_head, envir = .GlobalEnv)
  list(
    log_file = log_file,
    log_message = log_message,
    log_head = log_head
  )
}
# 9. HELPER FUNCTIONS FOR COMMON CALCULATIONS
# -----------------------------------------------
# Centralized functions to reduce duplication across scripts
# Get ISO week and year from a date
# ISO-8601 week number (1-53) of a date, as numeric. Vectorized over
# `date` (anything format() accepts, typically Date).
get_iso_week <- function(date) {
  as.numeric(format(date, "%V"))
}
# ISO-8601 week-based year of a date, as numeric (may differ from the
# calendar year around January 1st). Vectorized over `date`.
get_iso_year <- function(date) {
  as.numeric(format(date, "%G"))
}
# Return both ISO-8601 fields for a date as list(week = ..., year = ...).
# Vectorized over `date`; each element of the list is a numeric vector.
get_iso_week_year <- function(date) {
  week_num <- as.numeric(format(date, "%V"))
  year_num <- as.numeric(format(date, "%G"))
  list(week = week_num, year = year_num)
}
# Format a date's ISO week/year into a label like "week05_2026".
# @param date Date (or anything format() accepts).
# @param separator String placed between the week and year parts.
format_week_label <- function(date, separator = "_") {
  iso_week <- as.numeric(format(date, "%V"))
  iso_year <- as.numeric(format(date, "%G"))
  sprintf("week%02d%s%d", iso_week, separator, iso_year)
}
# Auto-detect mosaic mode (tiled vs single-file).
# Consistency fix: path construction now goes through
# get_project_storage_path() instead of duplicating the storage layout,
# and the grid-subfolder check is delegated to detect_grid_size()
# (which performs the identical weekly_tile_max/{NxN} test).
# Returns: "tiled", "single-file", or "unknown"
detect_mosaic_mode <- function(project_dir) {
  # Tile-based approach: weekly_tile_max/{grid_size}/ subfolder present
  if (detect_grid_size(project_dir) != "unknown") {
    return("tiled")
  }
  # Single-file approach: weekly_mosaic/week_*.tif files present
  weekly_mosaic <- get_project_storage_path(project_dir, "weekly_mosaic")
  if (dir.exists(weekly_mosaic)) {
    files <- list.files(weekly_mosaic, pattern = "^week_.*\\.tif$")
    if (length(files) > 0) {
      return("single-file")
    }
  }
  "unknown"
}
# Auto-detect grid size from tile directory structure by looking for an
# NxN subfolder (e.g. "5x5", "10x10") under weekly_tile_max/.
# Consistency fix: the storage path is built via get_project_storage_path()
# instead of duplicating the laravel_app/storage/app layout inline.
# Returns: e.g., "5x5", "10x10", or "unknown"
detect_grid_size <- function(project_dir) {
  weekly_tile_max <- get_project_storage_path(project_dir, "weekly_tile_max")
  if (dir.exists(weekly_tile_max)) {
    subfolders <- list.dirs(weekly_tile_max, full.names = FALSE, recursive = FALSE)
    grid_patterns <- grep("^\\d+x\\d+$", subfolders, value = TRUE)
    if (length(grid_patterns) > 0) {
      return(grid_patterns[1]) # Return first match (usually only one)
    }
  }
  "unknown"
}
# Build a storage path under laravel_app/storage/app/{project_dir},
# optionally extended with a subdirectory. All scripts should use this
# helper so the storage layout is defined in exactly one place.
get_project_storage_path <- function(project_dir, subdir = NULL) {
  root <- file.path("laravel_app", "storage", "app", project_dir)
  if (is.null(subdir)) {
    root
  } else {
    file.path(root, subdir)
  }
}
# Resolve the weekly-mosaic directory for a project.
# mosaic_mode "auto" (default) delegates to detect_mosaic_mode(); "tiled"
# resolves weekly_tile_max/{grid} (falling back to 5x5 when the grid
# cannot be detected), anything else resolves weekly_mosaic.
get_mosaic_dir <- function(project_dir, mosaic_mode = "auto") {
  mode <- if (mosaic_mode == "auto") detect_mosaic_mode(project_dir) else mosaic_mode
  if (mode != "tiled") {
    return(get_project_storage_path(project_dir, "weekly_mosaic"))
  }
  grid_size <- detect_grid_size(project_dir)
  if (grid_size == "unknown") {
    grid_size <- "5x5" # Fallback default grid
  }
  get_project_storage_path(project_dir, file.path("weekly_tile_max", grid_size))
}
# Resolve the KPI output directory for a project/client-type pair:
#   agronomic_support -> reports/kpis/field_level
#   anything else     -> reports/kpis/field_analysis
get_kpi_dir <- function(project_dir, client_type) {
  leaf <- if (client_type == "agronomic_support") "field_level" else "field_analysis"
  get_project_storage_path(project_dir, file.path("reports", "kpis", leaf))
}
# Logging functions for clean output
# Console logger printing "[LEVEL] message"; pass verbose = FALSE to
# silence an individual call site.
# FIX: removed an unused `timestamp` local that the original computed on
# every call but never included in the output.
smartcane_log <- function(message, level = "INFO", verbose = TRUE) {
  if (!verbose) return(invisible(NULL))
  cat(sprintf("[%s] %s\n", level, message))
}
# Debug-level logging: silent unless verbose = TRUE or the environment
# variable SMARTCANE_DEBUG is set to "TRUE".
smartcane_debug <- function(message, verbose = FALSE) {
  debug_enabled <- verbose || Sys.getenv("SMARTCANE_DEBUG") == "TRUE"
  if (!debug_enabled) {
    return(invisible(NULL))
  }
  smartcane_log(message, level = "DEBUG", verbose = TRUE)
}
# Warning-level convenience wrapper around smartcane_log(); always prints.
smartcane_warn <- function(message) {
  smartcane_log(message, level = "WARN", verbose = TRUE)
}
# ============================================================================
# PHASE 3 & 4: OPTIMIZATION & DOCUMENTATION
# ============================================================================
# System Constants
# ----------------
# Define once, use everywhere
# Path to the Rscript binary used by run_full_pipeline.R when spawning
# child R processes via system().
# Generalized: can now be overridden with the RSCRIPT_PATH environment
# variable; the previous hard-coded Windows path remains the default, so
# existing installations are unaffected.
RSCRIPT_PATH <- Sys.getenv(
  "RSCRIPT_PATH",
  unset = "C:\\Program Files\\R\\R-4.4.3\\bin\\x64\\Rscript.exe"
)
# Data Source Documentation
# ---------------------------
# Explains the two satellite data formats and when to use each
#
# SmartCane uses PlanetScope imagery from Planet Labs API in two formats:
#
# 1. merged_tif (4-band):
# - Standard format: Red, Green, Blue, Near-Infrared
# - Size: ~150-200 MB per date
# - Use case: Agronomic support, general crop health monitoring
# - Projects: aura, xinavane
# - Cloud handling: Basic cloud masking from Planet metadata
#
# 2. merged_tif_8b (8-band with cloud confidence):
# - Enhanced format: 4-band imagery + 4-band UDM2 cloud mask
# - UDM2 bands: Clear, Snow, Shadow, Light Haze
# - Size: ~250-350 MB per date
# - Use case: Harvest prediction, supply chain optimization
# - Projects: angata, chemba, esa (cane_supply clients)
# - Cloud handling: Per-pixel cloud confidence from Planet UDM2
# - Why: Cane supply chains need precise confidence to predict harvest dates
# (don't want to predict based on cloudy data)
#
# The system auto-detects which is available via detect_data_source()
# Mosaic Mode Documentation
# --------------------------
# SmartCane supports two ways to store and process weekly mosaics:
#
# 1. Single-file mosaic ("single-file"):
# - One GeoTIFF per week: weekly_mosaic/week_02_2026.tif
# - 5 bands per file: R, G, B, NIR, CI (Canopy Index)
# - Size: ~300-500 MB per week
# - Pros: Simpler file management, easier full-field visualization
# - Cons: Slower for field-specific queries, requires loading full raster
# - Best for: Agronomic support (aura) with <100 fields
# - Script 04 output: 5-band single-file mosaic
#
# 2. Tiled mosaic ("tiled"):
# - Grid of tiles per week: weekly_tile_max/5x5/week_02_2026_{TT}.tif
# - Example: 25 files (5×5 grid) × 5 bands = 125 individual tiffs
# - Size: ~15-20 MB per tile, organized in folders
# - Pros: Parallel processing, fast field lookups, scales to 1000+ fields
# - Cons: More file I/O, requires tile-to-field mapping metadata
# - Best for: Cane supply (angata, chemba) with 500+ fields
# - Script 04 output: Per-tile tiff files in weekly_tile_max/{grid}/
# - Tile assignment: Field boundaries mapped to grid coordinates
#
# The system auto-detects which is available via detect_mosaic_mode()
# Client Type Documentation
# --------------------------
# SmartCane runs different analysis pipelines based on client_type:
#
# CLIENT_TYPE: cane_supply
# Purpose: Optimize sugar mill supply chain (harvest scheduling)
# Scripts run: 20 (CI), 21 (RDS to CSV), 30 (Growth), 31 (Harvest pred), 40 (Mosaic), 80 (KPI), 91 (Excel)
# Outputs:
# - Per-field analysis: field status, growth phase, harvest readiness
# - Excel reports (Script 91): Detailed metrics for logistics planning
# - KPI directory: reports/kpis/field_analysis/ (one RDS per week)
# Harvest data: Required (harvest.xlsx - planting dates for phase assignment)
# Data source: merged_tif_8b (UDM2 cloud confidence improves harvest-date reliability)
# Mosaic mode: tiled (scales to 500+ fields)
# Projects: angata, chemba, xinavane, esa
#
# CLIENT_TYPE: agronomic_support
# Purpose: Provide weekly crop health insights to agronomists
# Scripts run: 80 (KPI), 90 (Word report)
# Outputs:
# - Farm-level KPI summaries (no per-field breakdown)
# - Word reports (Script 90): Charts and trends for agronomist decision support
# - KPI directory: reports/kpis/field_level/ (one RDS per week)
# Harvest data: Not used
# Data source: merged_tif (simpler, smaller)
# Mosaic mode: single-file (100-200 fields)
# Projects: aura
#
# Detect which satellite data source folder holds imagery for a project.
# Checks merged_tif (4-band) first, then merged_tif_8b (8-band + UDM2),
# and returns the first folder that exists and contains at least one .tif.
# Defaults to "merged_tif_8b" (with a warning) when neither has data.
detect_data_source <- function(project_dir) {
  storage_dir <- get_project_storage_path(project_dir)
  for (candidate in c("merged_tif", "merged_tif_8b")) {
    candidate_dir <- file.path(storage_dir, candidate)
    if (!dir.exists(candidate_dir)) next
    tif_count <- length(list.files(candidate_dir, pattern = "\\.tif$"))
    if (tif_count > 0) {
      smartcane_log(sprintf("Detected data source: %s (%d TIF files)", candidate, tif_count))
      return(candidate)
    }
  }
  smartcane_warn(sprintf("No data source found for %s - defaulting to merged_tif_8b", project_dir))
  "merged_tif_8b"
}
# Check KPI completeness for a reporting period.
# Walks backwards from end_date one ISO week at a time and records whether
# KPI files matching "weekWW_YYYY" exist in the client's KPI directory.
# This replaces duplicate KPI checking logic in run_full_pipeline.R.
#
# FIX: rows are collected in a preallocated list and bound once at the end
# instead of growing a data.frame with rbind() inside the loop (O(n^2)),
# and seq_len() replaces 0:(n-1), which mis-iterated for n = 0.
#
# @param project_dir Project folder name.
# @param client_type Client type (selects the KPI subdirectory).
# @param end_date Date of the most recent reporting week.
# @param reporting_weeks_needed Number of weeks (including end_date's) to check.
# @return List with kpis_df (one row per week), kpi_dir, missing_count,
#   missing_weeks (rows lacking KPIs), and all_complete (boolean).
check_kpi_completeness <- function(project_dir, client_type, end_date, reporting_weeks_needed) {
  kpi_dir <- get_kpi_dir(project_dir, client_type)
  n_weeks <- max(reporting_weeks_needed, 0)
  rows <- vector("list", n_weeks)
  for (i in seq_len(n_weeks)) {
    weeks_back <- i - 1
    check_date <- end_date - (weeks_back * 7)
    wwy <- get_iso_week_year(check_date)
    # Build week pattern and check if matching KPI files exist
    week_pattern <- sprintf("week%02d_%d", wwy$week, wwy$year)
    files_this_week <- list.files(kpi_dir, pattern = week_pattern)
    has_kpis <- length(files_this_week) > 0
    rows[[i]] <- data.frame(
      week = wwy$week,
      year = wwy$year,
      date = check_date,
      has_kpis = has_kpis,
      pattern = week_pattern,
      file_count = length(files_this_week)
    )
    # Debug logging
    smartcane_debug(sprintf(
      "Week %02d/%d (%s): %s (%d files)",
      wwy$week, wwy$year, format(check_date, "%Y-%m-%d"),
      if (has_kpis) "✓ FOUND" else "✗ MISSING",
      length(files_this_week)
    ))
  }
  kpis_needed <- do.call(rbind, rows)
  if (is.null(kpis_needed)) {
    # No weeks requested: return a typed, empty frame instead of the
    # original's bogus two extra iterations for n = 0
    kpis_needed <- data.frame(
      week = numeric(0), year = numeric(0), date = end_date[0],
      has_kpis = logical(0), pattern = character(0), file_count = integer(0)
    )
  }
  # Summary statistics
  missing_count <- sum(!kpis_needed$has_kpis)
  list(
    kpis_df = kpis_needed,
    kpi_dir = kpi_dir,
    missing_count = missing_count,
    missing_weeks = kpis_needed[!kpis_needed$has_kpis, ],
    all_complete = missing_count == 0
  )
}
# 10. Initialize the project
# ----------------------
# Export project directories and settings
# Wire together directory setup, logging, field boundaries, and harvest
# data into one flat configuration list (suitable for list2env() export).
# @param project_dir Project folder name under laravel_app/storage/app/.
# @param data_source TIF source folder passed through to directory setup.
# @return Flat named list: all setup_project_directories() paths plus
#   logging, field_boundaries, field_boundaries_sf, harvesting_data.
initialize_project <- function(project_dir, data_source = "merged_tif_8b") {
  dirs <- setup_project_directories(project_dir, data_source = data_source)
  logging <- setup_logging(dirs$log_dir)
  boundaries <- load_field_boundaries(dirs$data_dir)
  harvesting_data <- load_harvesting_data(dirs$data_dir)
  c(
    dirs,
    list(
      logging = logging,
      field_boundaries = boundaries$field_boundaries,
      field_boundaries_sf = boundaries$field_boundaries_sf,
      harvesting_data = harvesting_data
    )
  )
}
# When script is sourced, initialize with the global project_dir variable if it exists.
# Side effects: runs the whole initialization and exports every resulting
# entry (paths, logging closures, boundaries, harvest data) into the
# global environment via list2env().
if (exists("project_dir")) {
# Now we can safely log before initialization (console log_message is defined above)
log_message(paste("Initializing project with directory:", project_dir))
# Use data_source if it exists (passed from 02_ci_extraction.R), otherwise use default
data_src <- if (exists("data_source")) data_source else "merged_tif_8b"
log_message(paste("Using data source directory:", data_src))
project_config <- initialize_project(project_dir, data_source = data_src)
# Expose all variables to the global environment
list2env(project_config, envir = .GlobalEnv)
# Log project initialization completion with tile mode info
log_message(paste("Project initialized with directory:", project_dir))
if (exists("use_tile_mosaic")) {
mosaic_mode <- if (use_tile_mosaic) "TILE-BASED" else "SINGLE-FILE"
log_message(paste("Mosaic mode detected:", mosaic_mode))
if (exists("tile_detection_info") && !is.null(tile_detection_info)) {
log_message(paste(" - Detection source:", tile_detection_info$detected_source))
log_message(paste(" - Grid size:", tile_detection_info$grid_size))
log_message(paste(" - Detected files in storage:", tile_detection_info$detected_count))
if (length(tile_detection_info$sample_tiles) > 0) {
log_message(paste(" - Sample tile files:", paste(tile_detection_info$sample_tiles, collapse = ", ")))
}
}
}
} else {
# Sourcing without project_dir set is a misuse; warn rather than stop so
# helper functions can still be sourced standalone
warning("project_dir variable not found. Please set project_dir before sourcing parameters_project.R")
}