SmartCane/r_app/parameters_project_OLD.R

1241 lines
47 KiB
R
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# filepath: c:\Users\timon\Resilience BV\4020 SCane ESA DEMO - Documenten\General\4020 SCDEMO Team\4020 TechnicalData\WP3\smartcane\r_app\parameters_project.R
#
# PARAMETERS_PROJECT.R
# ====================
# This script defines project parameters, directory structures, and loads field boundaries.
# It establishes all the necessary paths and creates required directories for the SmartCane project.
# 1. Load required libraries
# -------------------------
suppressPackageStartupMessages({
library(here)
library(readxl)
library(sf)
library(dplyr)
library(tidyr)
library(jsonlite) # For reading tiling_config.json
})
# 2. Client type mapping (for conditional script execution)
# ---------------------------------------------------------
# Maps project names to client types for pipeline control
# Client types:
# - "cane_supply": Runs Scripts 20,21,30,31,80,91 (full pipeline with Excel output)
# - "agronomic_support": Runs Scripts 80,90 only (KPI calculation + Word report)
# - "extension_service": (Future - not yet implemented)
#
# NOTE: This will eventually migrate to Laravel environment variables/database
# For now, maintain this mapping and update as projects are added
# Lookup table mapping project identifiers to client types.
# Values select which pipeline variant runs (see CLIENT_TYPE_CONFIGS).
CLIENT_TYPE_MAP <- list(
  angata   = "cane_supply",
  aura     = "agronomic_support",
  chemba   = "cane_supply",
  xinavane = "cane_supply",
  esa      = "cane_supply"
)
#' Resolve the client type for a project.
#'
#' @param project_name Character scalar project identifier (e.g. "aura").
#' @return Character scalar client type; falls back to "cane_supply" (with a
#'   warning) when the project is not present in CLIENT_TYPE_MAP.
get_client_type <- function(project_name) {
  mapped <- CLIENT_TYPE_MAP[[project_name]]
  if (!is.null(mapped)) {
    return(mapped)
  }
  # Unknown project: warn and use the full-pipeline default.
  warning(sprintf("Project '%s' not in CLIENT_TYPE_MAP - defaulting to 'cane_supply'", project_name))
  "cane_supply"
}
# 2b. Client-specific KPI configurations
# ----------------------------------------
# Defines which KPIs and outputs are required for each client type
# This enables Script 80 to conditionally calculate only relevant metrics
#
# Structure:
# - kpi_calculations: Vector of KPI types to calculate for this client
# - outputs: Vector of output formats to generate (determines RDS/Excel naming)
# - requires_harvest_data: Boolean - whether Script 31 harvest predictions are needed
# - script_90_compatible: Boolean - whether output should match Script 90 expectations
# - script_91_compatible: Boolean - whether output should match Script 91 expectations
#
# Per-client-type KPI configuration.
# Each entry declares which KPI calculations Script 80 runs, which outputs it
# produces, whether harvest data (Script 31) is required, and which report
# script (90 or 91) the output format must satisfy.
CLIENT_TYPE_CONFIGS <- list(
  # agronomic_support (e.g. Aura): weekly farm-level summaries for agronomists.
  "agronomic_support" = list(
    client_type = "agronomic_support",
    description = "Farm-level KPI summaries for agronomic decision support",
    kpi_calculations = c(
      "field_uniformity",
      "area_change",
      "tch_forecasted",
      "growth_decline",
      "weed_presence",
      "gap_filling"
    ),
    outputs = c(
      "kpi_summary_tables", # front-page summary stats for the Script 90 report
      "field_details"       # detailed per-field table for the Script 90 report
    ),
    requires_harvest_data = FALSE, # Script 31 predictions not used
    script_90_compatible = TRUE,
    script_91_compatible = FALSE
  ),
  # cane_supply (e.g. Angata, Chemba): per-field harvest-timing analysis.
  "cane_supply" = list(
    client_type = "cane_supply",
    description = "Per-field analysis with harvest prediction and phase assignment",
    kpi_calculations = c(
      "per_field_analysis", # field-level stats via 80_weekly_stats_utils.R
      "phase_assignment",   # Germination / Tillering / Grand Growth / Maturation
      "harvest_prediction", # Script 31 harvest age predictions when available
      "status_triggers"     # field status: Normal / Monitor / Alert / Urgent
    ),
    outputs = c(
      "field_analysis_excel",  # per-field metrics workbook
      "field_analysis_summary" # summary RDS consumed by Script 91
    ),
    requires_harvest_data = TRUE, # harvest.xlsx needed for phase assignment
    script_90_compatible = FALSE,
    script_91_compatible = TRUE
  )
)
#' Get KPI configuration for a specific client type
#'
#' @param client_type Character string of client type
#'   (e.g., "agronomic_support", "cane_supply")
#' @return List containing the configuration for that client type; unknown
#'   types fall back to the "cane_supply" configuration with a warning.
get_client_kpi_config <- function(client_type) {
  found <- CLIENT_TYPE_CONFIGS[[client_type]]
  if (!is.null(found)) {
    return(found)
  }
  warning(sprintf("Client type '%s' not in CLIENT_TYPE_CONFIGS - defaulting to 'cane_supply'", client_type))
  CLIENT_TYPE_CONFIGS[["cane_supply"]]
}
# 3. Smart detection for tile-based vs single-file mosaic approach
# ----------------------------------------------------------------
#' Detect whether daily imagery is stored as tiles or as single mosaics.
#'
#' @param merged_final_tif_dir Directory holding merged final GeoTIFFs.
#' @param daily_tiles_split_dir Optional directory that may contain the
#'   tiling_config.json metadata written by Script 10.
#' @return List with at least: has_tiles (logical), detected_tiles (character),
#'   total_files (integer), source (character detection method); grid-based
#'   detections additionally carry grid_size and grid_path.
detect_tile_structure_from_merged_final <- function(merged_final_tif_dir, daily_tiles_split_dir = NULL) {
  # PRIORITY 1: tiling_config.json from Script 10 — the most reliable source,
  # since Script 10 explicitly records its tiling decision there.
  if (!is.null(daily_tiles_split_dir) && dir.exists(daily_tiles_split_dir)) {
    config_files <- list.files(daily_tiles_split_dir,
                               pattern = "tiling_config\\.json$",
                               recursive = TRUE,
                               full.names = TRUE)
    if (length(config_files) > 0) {
      # Several configs may exist (one per grid size); take the newest.
      config_file <- config_files[which.max(file.info(config_files)$mtime)]
      tryCatch({
        config_json <- jsonlite::read_json(config_file)
        return(list(
          has_tiles = config_json$has_tiles %||% TRUE,
          detected_tiles = character(),
          total_files = 0,
          source = "tiling_config.json",
          grid_size = config_json$grid_size %||% "unknown"
        ))
      }, error = function(e) {
        warning("Error reading tiling_config.json: ", e$message)
        # Fall through to file-based detection below.
      })
    }
  }
  # PRIORITY 2: file-based detection (fallback if metadata not found).
  if (!dir.exists(merged_final_tif_dir)) {
    return(list(
      has_tiles = FALSE,
      detected_tiles = character(),
      total_files = 0,
      source = "directory_not_found"
    ))
  }
  # Grid-size subdirectories (e.g. "5x5", "10x10") imply tiled organization:
  # merged_final_tif/{grid_size}/{DATE}/{DATE}_XX.tif
  grid_subfolders <- list.dirs(merged_final_tif_dir, full.names = FALSE, recursive = FALSE)
  grid_patterns <- grep("^\\d+x\\d+$", grid_subfolders, value = TRUE)
  if (length(grid_patterns) > 0) {
    grid_size <- grid_patterns[1]
    grid_dir <- file.path(merged_final_tif_dir, grid_size)
    # FIX: the original used `[1:3]`, which produced NA entries whenever fewer
    # than three tiles existed, and reported total_files as the sample length
    # (at most 3) instead of the actual tile count.
    grid_tifs <- list.files(grid_dir, pattern = "\\.tif$", recursive = TRUE)
    return(list(
      has_tiles = TRUE,
      detected_tiles = head(grid_tifs, 3),
      total_files = length(grid_tifs),
      source = "grid_subdirectory_detection",
      grid_size = grid_size,
      grid_path = grid_dir
    ))
  }
  # Otherwise look for tile-named files directly in merged_final_tif.
  tif_files <- list.files(merged_final_tif_dir, pattern = "\\.tif$", full.names = FALSE)
  if (length(tif_files) == 0) {
    return(list(
      has_tiles = FALSE,
      detected_tiles = character(),
      total_files = 0,
      source = "no_files_found"
    ))
  }
  # Tile naming: *_XX.tif where XX is a 2-digit tile index,
  # e.g. 2025-11-27_00.tif, week_50_2024_00.tif
  tile_pattern <- "_(\\d{2})\\.tif$"
  tile_files <- tif_files[grepl(tile_pattern, tif_files)]
  list(
    has_tiles = length(tile_files) > 0,
    detected_tiles = tile_files,
    total_files = length(tif_files),
    source = "file_pattern_detection"
  )
}
# 4. Define project directory structure
# -----------------------------------
# ==============================================================================
# CENTRALIZED PATH MANAGEMENT - setup_project_directories()
# ==============================================================================
# This function is the single source of truth for ALL file paths used across the pipeline.
# All scripts should call this function once at startup and use returned paths.
# This eliminates ~88 hardcoded file.path() calls scattered across 8 scripts.
#
# USAGE:
# paths <- setup_project_directories(project_dir)
# merged_tif_dir <- paths$merged_tif_folder
# daily_ci_dir <- paths$daily_ci_vals_dir
# kpi_output_dir <- paths$kpi_reports_dir
#
# TIERS (8-layer directory structure):
# Tier 1: Raw data (merged_tif)
# Tier 2: Per-field TIFFs (field_tiles, field_tiles_CI)
# Tier 3: CI Extraction (daily_ci_vals, cumulative_ci_vals)
# Tier 4: Growth Model (growth_model_interpolated)
# Tier 5: Mosaics (weekly_mosaic, weekly_tile_max)
# Tier 6: KPI & Reporting (kpi_reports_dir, kpi_field_stats_dir)
# Tier 7: Support (data, vrt, harvest, logs)
# Tier 8: Config & Metadata (field_boundaries_path, tiling_config_path)
#
# BENEFITS:
# ✓ Single source of truth (eliminates ~88 hardcoded file.path() calls)
# ✓ Auto-creates all directories (no scattered dir.create() calls)
# ✓ Easy to update storage structure globally
# ✓ Consistent naming across all 8 scripts
# ==============================================================================
#' Centralized path management for the SmartCane pipeline.
#'
#' Single source of truth for every directory and file path used by the
#' scripts. Creates all directories (recursively) and returns a named list.
#'
#' @param project_dir Character. Project name (e.g. "angata").
#' @param data_source Character. Raw-imagery folder name: "merged_tif"
#'   (4-band, default) or "merged_tif_8b" (8-band with UDM2 cloud mask).
#' @return Named list of all pipeline paths (tiers 1-8 below).
setup_project_directories <- function(project_dir, data_source = "merged_tif") {
  # ===========================================================================
  # BASE DIRECTORY (foundation for all paths)
  # ===========================================================================
  # here() joins its arguments onto the project root. The base path is built
  # with here(); every derived path below uses file.path(), because
  # here(<already-absolute base>, ...) would re-prefix the project root and
  # produce an invalid doubled path.
  laravel_storage_dir <- here("laravel_app", "storage", "app", project_dir)
  # ===========================================================================
  # TIER 1: RAW DATA & INPUT PATHS (Script 00 - Python download output)
  # ===========================================================================
  # FIX: honor data_source (previously accepted but ignored) so 8-band
  # projects resolve to merged_tif_8b; the default preserves old behavior.
  merged_tif_folder <- file.path(laravel_storage_dir, data_source)
  # ===========================================================================
  # TIER 2: TILING PATHS (Script 10 - per-field tiff creation)
  # ===========================================================================
  field_tiles_dir <- file.path(laravel_storage_dir, "field_tiles")       # field_tiles/{FIELD}/{YYYY-MM-DD}.tif
  field_tiles_ci_dir <- file.path(laravel_storage_dir, "field_tiles_CI") # pre-computed CI per field (Script 40)
  daily_tiles_split_dir <- file.path(laravel_storage_dir, "daily_tiles_split") # legacy grid tiling
  # ===========================================================================
  # TIER 3: CI EXTRACTION PATHS (Script 20 - Canopy Index calculation)
  # ===========================================================================
  extracted_ci_base_dir <- file.path(laravel_storage_dir, "Data", "extracted_ci")
  daily_ci_vals_dir <- file.path(extracted_ci_base_dir, "daily_vals")           # combined_CI_data.rds
  cumulative_ci_vals_dir <- file.path(extracted_ci_base_dir, "cumulative_vals") # cumulative CI across time
  ci_for_python_dir <- file.path(extracted_ci_base_dir, "ci_data_for_python")   # CSV input for Script 21
  # ===========================================================================
  # TIER 4: GROWTH MODEL PATHS (Script 30 - interpolation & smoothing)
  # ===========================================================================
  growth_model_interpolated_dir <- file.path(laravel_storage_dir, "growth_model_interpolated")
  # ===========================================================================
  # TIER 5: MOSAIC PATHS (Script 40 - weekly mosaics)
  # ===========================================================================
  weekly_mosaic_dir <- file.path(laravel_storage_dir, "weekly_mosaic")     # per-field weekly mosaics
  weekly_tile_max_dir <- file.path(laravel_storage_dir, "weekly_tile_max") # tile-based weekly max (legacy)
  # ===========================================================================
  # TIER 6: KPI & REPORTING PATHS (Scripts 80, 90, 91)
  # ===========================================================================
  reports_dir <- file.path(laravel_storage_dir, "reports")
  kpi_reports_dir <- file.path(reports_dir, "kpis")                       # Script 80 KPI CSV/RDS output
  kpi_field_stats_dir <- file.path(kpi_reports_dir, "field_stats")        # per-field KPI details
  kpi_field_analysis_dir <- file.path(kpi_reports_dir, "field_analysis")  # field-level analysis for Script 91
  # ===========================================================================
  # TIER 7: SUPPORT PATHS (data, VRT, harvest, logs)
  # ===========================================================================
  data_dir <- file.path(laravel_storage_dir, "Data")
  vrt_dir <- file.path(data_dir, "vrt")              # virtual rasters from CI extraction
  harvest_dir <- file.path(data_dir, "HarvestData")  # harvest schedule data
  log_dir <- file.path(laravel_storage_dir, "logs")
  # ===========================================================================
  # TIER 8: CONFIG & METADATA PATHS
  # ===========================================================================
  field_boundaries_path <- file.path(data_dir, "pivot.geojson")
  tiling_config_path <- file.path(daily_tiles_split_dir, "tiling_config.json")
  # ===========================================================================
  # CREATE ALL DIRECTORIES (once per pipeline run)
  # ===========================================================================
  all_dirs <- c(
    merged_tif_folder,
    field_tiles_dir, field_tiles_ci_dir, daily_tiles_split_dir,
    extracted_ci_base_dir, daily_ci_vals_dir, cumulative_ci_vals_dir, ci_for_python_dir,
    growth_model_interpolated_dir,
    weekly_mosaic_dir, weekly_tile_max_dir,
    reports_dir, kpi_reports_dir, kpi_field_stats_dir, kpi_field_analysis_dir,
    data_dir, vrt_dir, harvest_dir, log_dir
  )
  for (dir_path in all_dirs) {
    dir.create(dir_path, showWarnings = FALSE, recursive = TRUE)
  }
  # ===========================================================================
  # RETURN COMPREHENSIVE PATH LIST (keys unchanged for all callers)
  # ===========================================================================
  return(list(
    laravel_storage_dir = laravel_storage_dir,
    # TIER 1
    merged_tif_folder = merged_tif_folder,
    # TIER 2
    field_tiles_dir = field_tiles_dir,
    field_tiles_ci_dir = field_tiles_ci_dir,
    daily_tiles_split_dir = daily_tiles_split_dir,
    # TIER 3
    extracted_ci_base_dir = extracted_ci_base_dir,
    daily_ci_vals_dir = daily_ci_vals_dir,
    cumulative_ci_vals_dir = cumulative_ci_vals_dir,
    ci_for_python_dir = ci_for_python_dir,
    # TIER 4
    growth_model_interpolated_dir = growth_model_interpolated_dir,
    # TIER 5
    weekly_mosaic_dir = weekly_mosaic_dir,
    weekly_tile_max_dir = weekly_tile_max_dir,
    # TIER 6
    reports_dir = reports_dir,
    kpi_reports_dir = kpi_reports_dir,
    kpi_field_stats_dir = kpi_field_stats_dir,
    kpi_field_analysis_dir = kpi_field_analysis_dir,
    # TIER 7
    data_dir = data_dir,
    vrt_dir = vrt_dir,
    harvest_dir = harvest_dir,
    log_dir = log_dir,
    # TIER 8
    field_boundaries_path = field_boundaries_path,
    tiling_config_path = tiling_config_path
  ))
}
# ==============================================================================
# TIER-BY-TIER PATH REFERENCE (for setup_project_directories output)
# ==============================================================================
#
# TIER 1: RAW DATA (Script 00 - Python download)
# paths$merged_tif_folder
# └─ {YYYY-MM-DD}.tif (4-band uint16 GeoTIFFs from Planet API)
#
# TIER 2: PER-FIELD TIFFS (Script 10)
# paths$field_tiles_dir/{FIELD_NAME}/{YYYY-MM-DD}.tif
# paths$field_tiles_ci_dir/{FIELD_NAME}/{YYYY-MM-DD}.tif
# paths$daily_tiles_split_dir/{grid_size}/{YYYY-MM-DD}/{YYYY-MM-DD}_XX.tif (legacy)
#
# TIER 3: CI EXTRACTION (Script 20)
# paths$daily_ci_vals_dir/combined_CI_data.rds
# paths$cumulative_ci_vals_dir/All_pivots_Cumulative_CI_quadrant_year_v2.rds
# paths$ci_for_python_dir/ci_data_for_python.csv (Script 21 output)
#
# TIER 4: GROWTH MODEL (Script 30)
# paths$growth_model_interpolated_dir/ (RDS files with interpolated CI)
#
# TIER 5: MOSAICS (Script 40)
# paths$weekly_mosaic_dir/{FIELD_NAME}/week_XX_YYYY.tif
# paths$weekly_tile_max_dir/{grid_size}/week_XX_YYYY_00.tif (legacy)
#
# TIER 6: KPI & REPORTING (Scripts 80, 90, 91)
# paths$kpi_reports_dir/ (KPI outputs from Script 80)
# paths$kpi_field_stats_dir/ (Per-field KPI RDS)
# paths$kpi_field_analysis_dir/ (Analysis RDS for Script 91)
# paths$reports_dir/ (Word/HTML reports)
#
# TIER 7: SUPPORT (Various scripts)
# paths$data_dir/pivot.geojson (Field boundaries)
# paths$data_dir/harvest.xlsx (Harvest schedule)
# paths$vrt_dir/ (Virtual raster files)
# paths$harvest_dir/ (Harvest predictions from Python)
# paths$log_dir/ (Pipeline logs)
#
# TIER 8: CONFIG & METADATA
# paths$field_boundaries_path (Full path to pivot.geojson)
# paths$tiling_config_path (Metadata from Script 10)
#
# ==============================================================================
# 5. Utility Functions
# ----------------------
# NOTE: load_field_boundaries() is defined in 00_common_utils.R (sourced after
# parameters_project.R). load_harvesting_data() is also (re)defined below —
# NOTE(review): reconcile with the copy in 00_common_utils.R to avoid drift.
# 6. Load harvesting data
# ---------------------
#' Load and normalize the harvest schedule from harvest.xlsx.
#'
#' @param data_dir Directory expected to contain harvest.xlsx.
#' @return Data frame with field, sub_field, year, season_start, season_end,
#'   age (weeks), sub_area and tonnage_ha columns; NULL (with a warning) when
#'   the file is missing or unreadable.
load_harvesting_data <- function(data_dir) {
  # FIX: file.path() instead of here() — data_dir arrives as a full path and
  # here() would re-prefix the project root onto it.
  harvest_file <- file.path(data_dir, "harvest.xlsx")
  if (!file.exists(harvest_file)) {
    warning(paste("Harvest data file not found at path:", harvest_file))
    return(NULL)
  }
  # Parse one cell that may be a Date, POSIXct, Excel serial number, or a
  # character date in one of several common formats. Returns a numeric day
  # count (Date underlying value) or NA_real_.
  parse_flexible_date <- function(x) {
    # FIX: test is.null() (and zero length) BEFORE is.na(); is.na(NULL) is
    # logical(0), which makes `||` error on R >= 4.3.
    if (is.null(x) || length(x) == 0 || is.na(x)) return(NA_real_)
    if (inherits(x, "Date")) return(x)
    if (inherits(x, "POSIXct")) return(as.Date(x))
    if (is.numeric(x)) {
      # Excel stores dates as serial days since 1899-12-30.
      return(as.Date(x, origin = "1899-12-30"))
    }
    x_char <- as.character(x)
    # Try common formats: YYYY-MM-DD, DD/MM/YYYY, MM/DD/YYYY, with time.
    formats <- c("%Y-%m-%d", "%d/%m/%Y", "%m/%d/%Y", "%Y-%m-%d %H:%M:%S")
    for (fmt in formats) {
      result <- suppressWarnings(as.Date(x_char, format = fmt))
      if (!is.na(result)) return(result)
    }
    # FIX: NA_real_ (not logical NA) keeps the vapply() result type stable.
    NA_real_
  }
  tryCatch({
    harvesting_data <- read_excel(harvest_file) %>%
      dplyr::select(
        c(
          "field",
          "sub_field",
          "year",
          "season_start",
          "season_end",
          "age",
          "sub_area",
          "tonnage_ha"
        )
      ) %>%
      mutate(
        field = as.character(field),
        sub_field = as.character(sub_field),
        year = as.numeric(year),
        # vapply (not sapply) guarantees a numeric vector even for empty or
        # unusual inputs; convert back to Date afterwards.
        season_start = vapply(season_start, parse_flexible_date, numeric(1)),
        season_end = vapply(season_end, parse_flexible_date, numeric(1)),
        season_start = as.Date(season_start, origin = "1970-01-01"),
        season_end = as.Date(season_end, origin = "1970-01-01"),
        age = as.numeric(age),
        sub_area = as.character(sub_area),
        tonnage_ha = as.numeric(tonnage_ha)
      ) %>%
      mutate(
        # Seasons cannot end in the future; cap open/future seasons at today.
        season_end = case_when(
          season_end > Sys.Date() ~ Sys.Date(),
          is.na(season_end) ~ Sys.Date(),
          TRUE ~ season_end
        ),
        # Crop age in whole weeks.
        age = round(as.numeric(season_end - season_start) / 7, 0)
      )
    return(harvesting_data)
  }, error = function(e) {
    warning(paste("Error loading harvesting data:", e$message))
    return(NULL)
  })
}
# 7. Define logging functions globally first
# ---------------------------------------
# Create a simple default log function in case setup_logging hasn't been called yet
# Minimal console logger used until setup_logging() installs the file-backed
# version. Emits "[LEVEL] timestamp - message" to stdout.
log_message <- function(message, level = "INFO") {
  stamp <- format(Sys.time(), "%Y-%m-%d %H:%M:%S")
  line <- paste0("[", level, "] ", stamp, " - ", message)
  cat(line, "\n")
}
# Log a str() preview of the first elements of `list` through log_message().
log_head <- function(list, level = "INFO") {
  preview <- capture.output(str(head(list)))
  log_message(paste(preview, collapse = "\n"), level)
}
# 8. Set up full logging system with file output
# -------------------------------------------
#' Install file-backed logging functions.
#'
#' Builds log functions that append to <log_dir>/<YYYYMMDD>.log, echo
#' ERROR/WARNING lines to the console, and then replace the plain console
#' loggers defined at file scope via assign() into the global environment.
#'
#' @param log_dir Directory where the daily log file is written.
#' @return List with log_file (path) and the log_message/log_head closures.
setup_logging <- function(log_dir) {
  # FIX: file.path() instead of here() — log_dir is already a full path and
  # here() would re-prefix the project root onto it.
  log_file <- file.path(log_dir, paste0(format(Sys.Date(), "%Y%m%d"), ".log"))
  # File-backed logger; mirrors the console logger's message format.
  log_message <- function(message, level = "INFO") {
    timestamp <- format(Sys.time(), "%Y-%m-%d %H:%M:%S")
    formatted_message <- paste0("[", level, "] ", timestamp, " - ", message)
    cat(formatted_message, "\n", file = log_file, append = TRUE)
    # Echo only problems to the console to keep stdout quiet.
    if (level %in% c("ERROR", "WARNING")) {
      cat(formatted_message, "\n")
    }
  }
  log_head <- function(list, level = "INFO") {
    log_message(paste(capture.output(str(head(list))), collapse = "\n"), level)
  }
  # Replace the plain console loggers defined earlier in this file.
  assign("log_message", log_message, envir = .GlobalEnv)
  assign("log_head", log_head, envir = .GlobalEnv)
  return(list(
    log_file = log_file,
    log_message = log_message,
    log_head = log_head
  ))
}
# 8. HELPER FUNCTIONS FOR COMMON CALCULATIONS
# -----------------------------------------------
# Centralized functions to reduce duplication across scripts
# Get ISO week and year from a date
# ISO-8601 week number (1-53) of a Date, via strptime code %V.
get_iso_week <- function(date) {
  week_str <- format(date, "%V")
  as.numeric(week_str)
}
# ISO-8601 week-based year of a Date, via strptime code %G (can differ from
# the calendar year around New Year).
get_iso_year <- function(date) {
  year_str <- format(date, "%G")
  as.numeric(year_str)
}
# ISO week number and ISO week-based year of a Date, returned together.
get_iso_week_year <- function(date) {
  week_num <- as.numeric(format(date, "%V"))
  iso_year <- as.numeric(format(date, "%G"))
  list(week = week_num, year = iso_year)
}
# Readable week label from a Date, e.g. "week02_2026" or "week02-2026".
format_week_label <- function(date, separator = "_") {
  wk <- as.numeric(format(date, "%V"))
  yr <- as.numeric(format(date, "%G"))
  sprintf("week%02d%s%d", wk, separator, yr)
}
# Determine mosaic storage mode for a project.
# The per-field architecture stores one mosaic per field per week under
# weekly_mosaic/, so the mode is "single-file" whenever that folder exists,
# and "unknown" otherwise.
detect_mosaic_mode <- function(project_dir) {
  mosaic_root <- file.path("laravel_app", "storage", "app", project_dir, "weekly_mosaic")
  if (!dir.exists(mosaic_root)) {
    return("unknown")
  }
  "single-file"
}
# Grid-size detection stub.
# Grid-based tile organization is legacy; the per-field architecture has no
# grid layout, so this always reports "unknown". project_dir is kept for
# interface compatibility with existing callers.
detect_grid_size <- function(project_dir) {
  "unknown"
}
# Laravel storage path for a project, optionally extended with a subdirectory.
# Used by all scripts so the storage layout is defined in exactly one place.
get_project_storage_path <- function(project_dir, subdir = NULL) {
  base <- file.path("laravel_app", "storage", "app", project_dir)
  if (is.null(subdir)) {
    return(base)
  }
  file.path(base, subdir)
}
# Weekly mosaic directory for a project.
# mosaic_mode is accepted for interface compatibility but unused: the
# per-field architecture always stores mosaics under weekly_mosaic/.
get_mosaic_dir <- function(project_dir, mosaic_mode = "auto") {
  get_project_storage_path(project_dir, "weekly_mosaic")
}
# KPI output directory for a project.
# agronomic_support clients write field_level summaries; every other client
# type writes field_analysis outputs.
get_kpi_dir <- function(project_dir, client_type) {
  leaf <- if (client_type == "agronomic_support") "field_level" else "field_analysis"
  get_project_storage_path(project_dir, file.path("reports", "kpis", leaf))
}
# Logging functions moved to 00_common_utils.R
# - smartcane_log() — Main logging function with level prefix
# - smartcane_debug() — Conditional debug logging
# - smartcane_warn() — Warning wrapper
# Import with: source("r_app/00_common_utils.R")
# ============================================================================
# PHASE 3 & 4: OPTIMIZATION & DOCUMENTATION
# ============================================================================
# System Constants
# ----------------
# Define once, use everywhere
# Path to the Rscript executable, used by run_full_pipeline.R when calling
# R scripts via system().
# Generalized: the SMARTCANE_RSCRIPT_PATH environment variable overrides the
# hard-coded Windows default, so non-default installs need no code edit; when
# the variable is unset the original path is used unchanged.
RSCRIPT_PATH <- Sys.getenv(
  "SMARTCANE_RSCRIPT_PATH",
  unset = "C:\\Program Files\\R\\R-4.4.3\\bin\\x64\\Rscript.exe"
)
# Data Source Documentation
# ---------------------------
# Explains the two satellite data formats and when to use each
#
# SmartCane uses PlanetScope imagery from Planet Labs API in two formats:
#
# 1. merged_tif (4-band):
# - Standard format: Red, Green, Blue, Near-Infrared
# - Size: ~150-200 MB per date
# - Use case: Agronomic support, general crop health monitoring
# - Projects: aura, xinavane
# - Cloud handling: Basic cloud masking from Planet metadata
#
# 2. merged_tif_8b (8-band with cloud confidence):
# - Enhanced format: 4-band imagery + 4-band UDM2 cloud mask
# - UDM2 bands: Clear, Snow, Shadow, Light Haze
# - Size: ~250-350 MB per date
# - Use case: Harvest prediction, supply chain optimization
# - Projects: angata, chemba, esa (cane_supply clients)
# - Cloud handling: Per-pixel cloud confidence from Planet UDM2
# - Why: Cane supply chains need precise confidence to predict harvest dates
# (don't want to predict based on cloudy data)
#
# The system auto-detects which is available via detect_data_source()
# Mosaic Mode Documentation
# --------------------------
# SmartCane supports two ways to store and process weekly mosaics:
#
# 1. Single-file mosaic ("single-file"):
# - One GeoTIFF per week: weekly_mosaic/week_02_2026.tif
# - 5 bands per file: R, G, B, NIR, CI (Canopy Index)
# - Size: ~300-500 MB per week
# - Pros: Simpler file management, easier full-field visualization
# - Cons: Slower for field-specific queries, requires loading full raster
# - Best for: Agronomic support (aura) with <100 fields
# - Script 40 output: 5-band single-file mosaic
#
# 2. Tiled mosaic ("tiled"):
# - Grid of tiles per week: weekly_tile_max/5x5/week_02_2026_{TT}.tif
# - Example: 25 files (5×5 grid) × 5 bands = 125 individual tiffs
# - Size: ~15-20 MB per tile, organized in folders
# - Pros: Parallel processing, fast field lookups, scales to 1000+ fields
# - Cons: More file I/O, requires tile-to-field mapping metadata
# - Best for: Cane supply (angata, chemba) with 500+ fields
# - Script 40 output: Per-tile tiff files in weekly_tile_max/{grid}/
# - Tile assignment: Field boundaries mapped to grid coordinates
#
# The system auto-detects which is available via detect_mosaic_mode()
# Client Type Documentation
# --------------------------
# SmartCane runs different analysis pipelines based on client_type:
#
# CLIENT_TYPE: cane_supply
# Purpose: Optimize sugar mill supply chain (harvest scheduling)
# Scripts run: 20 (CI), 21 (RDS to CSV), 30 (Growth), 31 (Harvest pred), 40 (Mosaic), 80 (KPI), 91 (Excel)
# Outputs:
# - Per-field analysis: field status, growth phase, harvest readiness
# - Excel reports (Script 91): Detailed metrics for logistics planning
# - KPI directory: reports/kpis/field_analysis/ (one RDS per week)
# Harvest data: Required (harvest.xlsx - planting dates for phase assignment)
# Data source: merged_tif_8b (uses per-pixel UDM2 cloud confidence for reliable predictions)
# Mosaic mode: tiled (scales to 500+ fields)
# Projects: angata, chemba, xinavane, esa
#
# CLIENT_TYPE: agronomic_support
# Purpose: Provide weekly crop health insights to agronomists
# Scripts run: 80 (KPI), 90 (Word report)
# Outputs:
# - Farm-level KPI summaries (no per-field breakdown)
# - Word reports (Script 90): Charts and trends for agronomist decision support
# - KPI directory: reports/kpis/field_level/ (one RDS per week)
# Harvest data: Not used
# Data source: merged_tif (simpler, smaller)
# Mosaic mode: single-file (100-200 fields)
# Projects: aura
#
# Data source selection for a project.
# project_dir is accepted for interface compatibility; the pipeline currently
# standardizes on the 4-band merged_tif format for every project.
detect_data_source <- function(project_dir) {
  "merged_tif"
}
#' Check KPI completeness for a reporting period.
#'
#' Looks backwards from end_date one week at a time and records whether KPI
#' output files exist for each ISO week. Replaces the duplicate KPI-checking
#' logic previously embedded in run_full_pipeline.R.
#'
#' @param project_dir Character. Project name.
#' @param client_type Character. Client type (selects the KPI directory).
#' @param end_date Date. Last day of the reporting period.
#' @param reporting_weeks_needed Integer. Number of weeks to verify (>= 0).
#' @return List with kpis_df (one row per week), kpi_dir, missing_count,
#'   missing_weeks (subset of kpis_df), and all_complete (logical).
check_kpi_completeness <- function(project_dir, client_type, end_date, reporting_weeks_needed) {
  kpi_dir <- get_kpi_dir(project_dir, client_type)
  # FIX: collect one row per week in a preallocated list instead of growing a
  # data.frame with rbind() inside the loop (quadratic copying). Also fixes
  # the n = 0 case: the original `0:(n - 1)` evaluated to c(0, -1) and looped
  # twice, whereas seq_len(0) - 1 loops zero times.
  rows <- vector("list", reporting_weeks_needed)
  for (weeks_back in seq_len(reporting_weeks_needed) - 1L) {
    check_date <- end_date - (weeks_back * 7)
    wwy <- get_iso_week_year(check_date)
    # KPI files for a week all carry a weekWW_YYYY token in their names.
    week_pattern <- sprintf("week%02d_%d", wwy$week, wwy$year)
    files_this_week <- list.files(kpi_dir, pattern = week_pattern)
    has_kpis <- length(files_this_week) > 0
    rows[[weeks_back + 1L]] <- data.frame(
      week = wwy$week,
      year = wwy$year,
      date = check_date,
      has_kpis = has_kpis,
      pattern = week_pattern,
      file_count = length(files_this_week)
    )
    # Debug logging
    smartcane_debug(sprintf(
      "Week %02d/%d (%s): %s (%d files)",
      wwy$week, wwy$year, format(check_date, "%Y-%m-%d"),
      if (has_kpis) "✓ FOUND" else "✗ MISSING",
      length(files_this_week)
    ))
  }
  kpis_needed <- do.call(rbind, rows)
  if (is.null(kpis_needed)) {
    # Zero weeks requested: return an empty frame with the expected columns.
    kpis_needed <- data.frame(
      week = numeric(0), year = numeric(0), date = end_date[0],
      has_kpis = logical(0), pattern = character(0), file_count = integer(0)
    )
  }
  missing_count <- sum(!kpis_needed$has_kpis)
  return(list(
    kpis_df = kpis_needed,
    kpi_dir = kpi_dir,
    missing_count = missing_count,
    missing_weeks = kpis_needed[!kpis_needed$has_kpis, ],
    all_complete = missing_count == 0
  ))
}
# ==============================================================================
# HELPER FUNCTIONS FOR run_full_pipeline.R PATH VERIFICATION (SC-116)
# ==============================================================================
# These functions replace hardcoded file.path() calls in run_full_pipeline.R
# with centralized, testable helper functions. Each function verifies a specific
# output directory for a pipeline stage.
#' Get verification path for Script 31 harvest output
#'
#' @param project_dir Character. Project name (e.g., "angata", "aura")
#' @param week_num Integer. ISO week number (01-53)
#' @param year_num Integer. Year (e.g., 2026)
#' @return Character. Full path to expected harvest imminent CSV file
#' @details
#' Builds: {project}_harvest_imminent_week_{WW}_{YYYY}.csv under
#' laravel_app/storage/app/{project}/reports/kpis/field_stats/.
#' NOTE(review): the original header described a doubled
#' "{project}_{project}_" filename prefix, but the code emits a single
#' prefix — confirm against Script 31's actual output naming.
get_harvest_output_path <- function(project_dir, week_num, year_num) {
  csv_name <- sprintf("%s_harvest_imminent_week_%02d_%d.csv", project_dir, week_num, year_num)
  stats_dir <- file.path(
    "laravel_app", "storage", "app", project_dir, "reports", "kpis", "field_stats"
  )
  file.path(stats_dir, csv_name)
}
#' Check if harvest output file exists for a given week
#'
#' @param project_dir Character. Project name
#' @param week_num Integer. ISO week number
#' @param year_num Integer. Year
#' @return Logical. TRUE if the expected harvest imminent CSV exists.
check_harvest_output_exists <- function(project_dir, week_num, year_num) {
  expected <- get_harvest_output_path(project_dir, week_num, year_num)
  file.exists(expected)
}
#' Get expected output directory for mosaic verification based on mode
#'
#' @param project_dir Character. Project name
#' @param mosaic_mode Character. Either "tiled" or "single-file"
#' @return Character. weekly_tile_max/ for "tiled"; weekly_mosaic/ for any
#'   other mode (single-file is the per-field architecture standard).
get_mosaic_verification_dir <- function(project_dir, mosaic_mode) {
  root <- file.path("laravel_app", "storage", "app", project_dir)
  subfolder <- if (mosaic_mode == "tiled") "weekly_tile_max" else "weekly_mosaic"
  file.path(root, subfolder)
}
#' Check if mosaic files exist for a specific week
#'
#' @param project_dir Character. Project name
#' @param week_num Integer. ISO week number
#' @param year_num Integer. Year
#' @param mosaic_mode Character. "tiled" or "single-file"
#' @return List with created (logical), file_count (int), and
#'   sample_files (up to 3 matching file names).
check_mosaic_exists <- function(project_dir, week_num, year_num, mosaic_mode) {
  target_dir <- get_mosaic_verification_dir(project_dir, mosaic_mode)
  if (!dir.exists(target_dir)) {
    return(list(created = FALSE, file_count = 0, sample_files = character()))
  }
  pattern <- sprintf("week_%02d_%d\\.tif", week_num, year_num)
  # Recursive search so per-field subfolders are covered too.
  found <- list.files(target_dir, pattern = pattern, recursive = TRUE, full.names = FALSE)
  list(
    created = length(found) > 0,
    file_count = length(found),
    sample_files = head(found, 3)
  )
}
# 9. Initialize the project
# ----------------------
# Export project directories and settings
#' Initialize the project: directories, logging, boundaries, harvesting data
#'
#' @param project_dir Character. Project name (storage sub-directory)
#' @param data_source Character. Selects the TIF source folder (default "merged_tif")
#' @return Named list: all directory paths plus logging, field_boundaries,
#'   field_boundaries_sf and harvesting_data
initialize_project <- function(project_dir, data_source = "merged_tif") {
  # Directory structure comes first: every other component resolves
  # its inputs relative to these paths.
  dirs <- setup_project_directories(project_dir, data_source = data_source)
  logging <- setup_logging(dirs$log_dir)
  boundaries <- load_field_boundaries(dirs$data_dir)
  harvesting_data <- load_harvesting_data(dirs$data_dir)
  # Flatten directory paths and loaded components into one named list.
  c(
    dirs,
    list(
      logging = logging,
      field_boundaries = boundaries$field_boundaries,
      field_boundaries_sf = boundaries$field_boundaries_sf,
      harvesting_data = harvesting_data
    )
  )
}
# When script is sourced, initialize with the global project_dir variable if it exists.
# SIDE EFFECTS: on success this populates .GlobalEnv with every element returned
# by initialize_project() (directory paths, logging, boundaries, harvesting data).
if (exists("project_dir")) {
  # Now we can safely log before initialization
  # NOTE(review): log_message() is not defined in this excerpt - presumably
  # declared earlier in the file; confirm before refactoring this section.
  log_message(paste("Initializing project with directory:", project_dir))
  # Use data_source if it exists (passed from 02_ci_extraction.R), otherwise use default
  data_src <- if (exists("data_source")) data_source else "merged_tif"
  log_message(paste("Using data source directory:", data_src))
  project_config <- initialize_project(project_dir, data_source = data_src)
  # Expose all variables to the global environment so downstream scripts can
  # reference them directly (e.g. field_boundaries_sf, harvesting_data).
  list2env(project_config, envir = .GlobalEnv)
  # Log project initialization completion with tile mode info
  log_message(paste("Project initialized with directory:", project_dir))
  # use_tile_mosaic / tile_detection_info are optional globals set elsewhere
  # (tiling detection); log their diagnostics only when present.
  if (exists("use_tile_mosaic")) {
    mosaic_mode <- if (use_tile_mosaic) "TILE-BASED" else "SINGLE-FILE"
    log_message(paste("Mosaic mode detected:", mosaic_mode))
    if (exists("tile_detection_info") && !is.null(tile_detection_info)) {
      log_message(paste(" - Detection source:", tile_detection_info$detected_source))
      log_message(paste(" - Grid size:", tile_detection_info$grid_size))
      log_message(paste(" - Detected files in storage:", tile_detection_info$detected_count))
      if (length(tile_detection_info$sample_tiles) > 0) {
        log_message(paste(" - Sample tile files:", paste(tile_detection_info$sample_tiles, collapse = ", ")))
      }
    }
  }
} else {
  # Sourcing without project_dir is a configuration error; warn rather than stop
  # so interactive sessions can still source the helper functions above.
  warning("project_dir variable not found. Please set project_dir before sourcing parameters_project.R")
}
#' Safe Logging Function
#'
#' Writes a single log line to stdout in the SmartCane standard
#' "[LEVEL] message" format. Works standalone without any framework.
#'
#' @param message The message to log
#' @param level The log level (default: "INFO"). Options: "INFO", "WARNING", "ERROR", "DEBUG"
#' @return NULL (invisible, used for side effects)
#'
#' @examples
#' safe_log("Processing started", "INFO")
#' safe_log("Check input file", "WARNING")
#' safe_log("Failed to load data", "ERROR")
#'
safe_log <- function(message, level = "INFO") {
  # Single sprintf builds "[LEVEL] message" and cat appends the newline.
  cat(sprintf("[%s] %s\n", level, message))
}
#' SmartCane Debug Logging (Conditional)
#'
#' Logs DEBUG-level messages only if verbose=TRUE or the SMARTCANE_DEBUG
#' environment variable equals "TRUE". Keeps normal output uncluttered
#' during development/troubleshooting.
#'
#' @param message The message to log
#' @param verbose Whether to output regardless of SMARTCANE_DEBUG (default: FALSE)
#' @return NULL (invisible, used for side effects)
#'
#' @examples
#' smartcane_debug("Processing field 1", verbose = FALSE) # Only if SMARTCANE_DEBUG=TRUE
#' smartcane_debug("Detailed state info", verbose = TRUE) # Always outputs
#'
smartcane_debug <- function(message, verbose = FALSE) {
  # Emit when explicitly verbose, or when the debug env var is switched on
  # (|| short-circuits, so the env var is only read when verbose is FALSE).
  enabled <- verbose || Sys.getenv("SMARTCANE_DEBUG") == "TRUE"
  if (!enabled) {
    return(invisible(NULL))
  }
  safe_log(message, level = "DEBUG")
}
#' SmartCane Warning Logging
#'
#' Convenience wrapper around safe_log() that emits WARN-level messages.
#' Note: the level string is "WARN", not the "WARNING" used elsewhere.
#'
#' @param message The message to log
#' @return NULL (invisible, used for side effects)
#'
#' @examples
#' smartcane_warn("Check data format before proceeding")
#'
smartcane_warn <- function(message) {
  safe_log(message, level = "WARN")
}
#' Load Field Boundaries from GeoJSON
#'
#' Loads field polygon geometries from a GeoJSON file (pivot.geojson or
#' pivot_2.geojson), repairs invalid geometries, validates/assigns the CRS,
#' and standardizes the attribute columns.
#'
#' @param data_dir Directory containing the GeoJSON file
#' @return List with elements:
#'   - field_boundaries_sf: sf (Simple Features) object
#'   - field_boundaries: terra SpatVector (if conversion successful, else the sf object as fallback)
#'
#' @details
#' pivot_2.geojson is used only when the global `project_dir` equals "esa" AND
#' the global flag `ci_extraction_script` exists (set by scripts 02-03 for CI
#' extraction / growth model); every other caller gets pivot.geojson.
#' Kept attribute columns are field, sub_field and (when present) sub_area.
#'
#' @examples
#' boundaries <- load_field_boundaries("laravel_app/storage/app/angata")
#' head(boundaries$field_boundaries_sf)
#'
load_field_boundaries <- function(data_dir) {
  # Choose field boundaries file based on project and script type
  # ESA project uses pivot_2.geojson ONLY for scripts 02-03 (CI extraction & growth model)
  # All other scripts (including 04-mosaic, 09-KPIs, 10-reports) use pivot.geojson
  use_pivot_2 <- exists("project_dir") && project_dir == "esa" &&
    exists("ci_extraction_script") # ci_extraction_script flag set by scripts 02-03
  if (use_pivot_2) {
    field_boundaries_path <- file.path(data_dir, "pivot_2.geojson")
  } else {
    field_boundaries_path <- file.path(data_dir, "pivot.geojson")
  }
  if (!file.exists(field_boundaries_path)) {
    stop(paste("Field boundaries file not found at path:", field_boundaries_path))
  }
  tryCatch({
    # Read GeoJSON with explicit CRS handling
    field_boundaries_sf <- st_read(field_boundaries_path, quiet = TRUE)
    # Remove OBJECTID column immediately if it exists
    # (presumably a GIS export artifact - not needed downstream)
    if ("OBJECTID" %in% names(field_boundaries_sf)) {
      field_boundaries_sf <- field_boundaries_sf %>% select(-OBJECTID)
    }
    # **CRITICAL**: Repair invalid geometries (degenerate vertices, self-intersections, etc.)
    # This must happen BEFORE any spatial operations (CRS transform, intersect, crop, etc.)
    # to prevent S2 geometry validation errors during downstream processing
    field_boundaries_sf <- repair_geojson_geometries(field_boundaries_sf)
    # Validate and fix CRS if needed
    tryCatch({
      # Simply assign WGS84 if not already set (safe approach)
      if (is.na(sf::st_crs(field_boundaries_sf)$epsg)) {
        st_crs(field_boundaries_sf) <- 4326
        warning("CRS was missing, assigned WGS84 (EPSG:4326)")
      }
    }, error = function(e) {
      # Fallback if the EPSG lookup itself errored: force-assign WGS84.
      tryCatch({
        # `<<-` is required here: we are inside the error handler's own
        # function scope and must modify load_field_boundaries' variable.
        st_crs(field_boundaries_sf) <<- 4326
      }, error = function(e2) {
        warning(paste("Could not set CRS:", e2$message))
      })
    })
    # Handle column names - accommodate optional sub_area column
    if ("sub_area" %in% names(field_boundaries_sf)) {
      field_boundaries_sf <- field_boundaries_sf %>%
        dplyr::select(field, sub_field, sub_area) %>%
        sf::st_set_geometry("geometry")
    } else {
      field_boundaries_sf <- field_boundaries_sf %>%
        dplyr::select(field, sub_field) %>%
        sf::st_set_geometry("geometry")
    }
    # Convert to terra vector if possible, otherwise use sf
    field_boundaries <- tryCatch({
      field_boundaries_terra <- terra::vect(field_boundaries_sf)
      # Guard: if terra reports a NULL/empty CRS after conversion,
      # restore WGS84 so downstream spatial ops have a valid CRS.
      crs_value <- tryCatch(terra::crs(field_boundaries_terra), error = function(e) NULL)
      crs_str <- if (!is.null(crs_value)) as.character(crs_value) else ""
      if (is.null(crs_value) || length(crs_value) == 0 || nchar(crs_str) == 0) {
        terra::crs(field_boundaries_terra) <- "EPSG:4326"
        warning("Terra object CRS was empty, assigned WGS84 (EPSG:4326)")
      }
      field_boundaries_terra
    }, error = function(e) {
      # Graceful degradation: callers receive the sf object in both slots.
      warning(paste("Terra conversion failed, using sf object instead:", e$message))
      field_boundaries_sf
    })
    return(list(
      field_boundaries_sf = field_boundaries_sf,
      field_boundaries = field_boundaries
    ))
  }, error = function(e) {
    # Print diagnostics before re-raising with a prefixed message.
    cat("[DEBUG] Error in load_field_boundaries:\n")
    cat(" Message:", e$message, "\n")
    cat(" Call:", deparse(e$call), "\n")
    stop(paste("Error loading field boundaries:", e$message))
  })
}
#' Generate a Sequence of Dates for Processing
#'
#' Creates a date range from start_date to end_date and extracts week/year info.
#' Used by Scripts 20, 30, 40 to determine data processing windows.
#'
#' @param end_date The end date for the sequence (Date object or "YYYY-MM-DD" string)
#' @param offset Number of days in the window, counted back from and INCLUDING
#'   end_date (e.g. 7 yields a 7-day window ending on end_date)
#' @return A list containing:
#'   - week: ISO week number of end_date (the reporting period)
#'   - year: ISO year of end_date
#'   - days_filter: Vector of dates in "YYYY-MM-DD" format
#'   - start_date: Start date as Date object
#'   - end_date: End date as Date object
#'
#' @details
#' Uses `lubridate::isoweek()` and `lubridate::isoyear()`, so week and year
#' follow the ISO-8601 convention (weeks start on Monday; week 1 contains the
#' first Thursday of the year). Week/year are derived from the END date, since
#' that is the reporting period.
#'
#' @examples
#' dates <- date_list(as.Date("2025-01-15"), offset = 7)
#' # Returns: week=3, year=2025, days_filter = c("2025-01-09", ..., "2025-01-15")
#'
#' dates <- date_list("2025-12-31", offset = 14)
#' # Handles string input and returns 14 days of data
#'
date_list <- function(end_date, offset) {
  # Input validation: accept a Date or anything as.Date() can parse.
  # (Unparseable strings make as.Date() itself error before the NA check.)
  if (!lubridate::is.Date(end_date)) {
    end_date <- as.Date(end_date)
    if (is.na(end_date)) {
      stop("Invalid end_date provided. Expected a Date object or a string convertible to Date.")
    }
  }
  offset <- as.numeric(offset)
  if (is.na(offset) || offset < 1) {
    stop("Invalid offset provided. Expected a positive number.")
  }
  # Calculate date range
  offset <- offset - 1 # Adjust offset so the window includes end_date itself
  start_date <- end_date - lubridate::days(offset)
  # Extract ISO week and year information (from END date for reporting period)
  week <- lubridate::isoweek(end_date)
  year <- lubridate::isoyear(end_date)
  # Generate sequence of dates
  days_filter <- seq(from = start_date, to = end_date, by = "day")
  days_filter <- format(days_filter, "%Y-%m-%d") # Format for consistent filtering
  # Log the date range
  safe_log(paste("Date range generated from", start_date, "to", end_date))
  return(list(
    "week" = week,
    "year" = year,
    "days_filter" = days_filter,
    "start_date" = start_date,
    "end_date" = end_date
  ))
}
# ==============================================================================
#' Repair Invalid GeoJSON Geometries
#'
#' Fixes common geometry issues in GeoJSON/sf objects:
#' - Degenerate vertices (duplicate points)
#' - Self-intersecting polygons
#' - Invalid ring orientation
#' - Empty or NULL geometries
#'
#' Uses sf::st_make_valid() with a row-wise repair fallback.
#'
#' @param sf_object sf object (GeoDataFrame) with potentially invalid geometries
#' @return sf object with repaired geometries (or the original object unchanged
#'   when every repair attempt fails)
#'
#' @details
#' **Why this matters:**
#' Pivot GeoJSON files sometimes contain degenerate vertices or self-intersecting
#' rings from manual editing or GIS data sources. These cause errors when using
#' S2 geometry (strict validation) during cropping operations.
#'
#' **Repair strategy (priority order):**
#' 1. st_make_valid() on the whole object - GEOS-based repair (most reliable)
#' 2. Fallback: row-by-row st_make_valid() if the feature count changed
#' 3. Last resort: keep the original object (logged as WARNING) if repair fails
#'
#' @examples
#' \dontrun{
#' fields <- st_read("pivot.geojson")
#' fields_fixed <- repair_geojson_geometries(fields)
#' cat(paste("Fixed geometries: before=",
#'           nrow(fields[!st_is_valid(fields), ]),
#'           ", after=",
#'           nrow(fields_fixed[!st_is_valid(fields_fixed), ])))
#' }
#'
repair_geojson_geometries <- function(sf_object) {
  if (!inherits(sf_object, "sf")) {
    stop("Input must be an sf (Simple Features) object")
  }
  # Count invalid geometries BEFORE repair (na.rm guards against NA validity results)
  invalid_before <- sum(!sf::st_is_valid(sf_object), na.rm = TRUE)
  if (invalid_before == 0) {
    safe_log("All geometries already valid - no repair needed", "INFO")
    return(sf_object)
  }
  safe_log(paste("Found", invalid_before, "invalid geometries - attempting repair"), "WARNING")
  # STRATEGY: Apply st_make_valid() to entire sf object (most reliable for GEOS)
  # This handles degenerate vertices, self-intersections, invalid rings while preserving all features
  repaired <- tryCatch({
    # st_make_valid() on entire sf object preserves all features and attributes
    repaired_geom <- sf::st_make_valid(sf_object)
    # Verify we still have the same number of rows
    # (st_make_valid can split/drop features in rare cases)
    if (nrow(repaired_geom) != nrow(sf_object)) {
      warning("st_make_valid() changed number of features - attempting row-wise repair")
      # Fallback: Repair row-by-row to maintain original structure
      repaired_geom <- sf_object
      for (i in seq_len(nrow(sf_object))) {
        tryCatch({
          if (!sf::st_is_valid(sf_object[i, ])) {
            repaired_geom[i, ] <- sf::st_make_valid(sf_object[i, ])
          }
        }, error = function(e) {
          # Unrepairable rows keep their original (invalid) geometry.
          safe_log(paste("Could not repair row", i, "-", e$message), "WARNING")
        })
      }
    }
    safe_log("✓ st_make_valid() successfully repaired geometries", "INFO")
    repaired_geom
  }, error = function(e) {
    # NULL signals total failure; handled just below by returning the original.
    safe_log(paste("st_make_valid() failed:", e$message), "WARNING")
    NULL
  })
  # If repair failed, keep original
  if (is.null(repaired)) {
    safe_log(paste("Could not repair", invalid_before, "invalid geometries - keeping original"),
             "WARNING")
    return(sf_object)
  }
  # Count invalid geometries AFTER repair (may still be > 0 if rows were skipped)
  invalid_after <- sum(!sf::st_is_valid(repaired), na.rm = TRUE)
  safe_log(paste("Repair complete: before =", invalid_before, ", after =", invalid_after), "INFO")
  return(repaired)
}
# ==============================================================================
# END common utilities (inlined from 00_COMMON_UTILS.R into parameters_project.R)
# ==============================================================================