SmartCane/r_app/80_calculate_kpis.R

# ============================================================================
# SCRIPT 80: Key Performance Indicator (KPI) Calculation
# ============================================================================
# PURPOSE:
#   Calculate per-field and farm-level KPIs from weekly CI mosaics. Computes
#   field uniformity, growth trends (4-week and 8-week), area change detection,
#   TCH forecasts, stress identification, and weed presence. Generates
#   comprehensive Excel/CSV/RDS exports for dashboards and stakeholder reporting.
#
# INPUT DATA:
#   - Source 1: laravel_app/storage/app/{project}/weekly_mosaic/{FIELD}/week_*.tif
#   - Source 2: Field boundaries (pivot.geojson) and harvest data (harvest.xlsx)
#   - Source 3: Historical field stats (RDS from previous weeks)
#
# OUTPUT DATA:
#   - Destination: laravel_app/storage/app/{project}/output/
#   - Format: Excel (.xlsx), CSV (.csv), RDS (.rds)
#   - Files: {project}_field_analysis_week{WW}_{YYYY}.xlsx + metadata
#
# USAGE:
#   Rscript 80_calculate_kpis.R [project] [week] [year]
#
#   Example (Windows PowerShell):
#   & "C:\Program Files\R\R-4.4.3\bin\x64\Rscript.exe" r_app/80_calculate_kpis.R angata 5 2026
#
# PARAMETERS:
#   - project: Project name (character) - angata, chemba, xinavane, esa, simba
#   - week: ISO week number (numeric, 1-53, default current week)
#   - year: ISO year (numeric, default current year)
#
# CLIENT TYPES:
#   - cane_supply: Yes - uses 80_utils_cane_supply.R (placeholder)
#   - agronomic_support: Yes - uses 80_utils_agronomic_support.R (6 KPI funcs)
#
# DEPENDENCIES:
#   - Packages: terra, sf, tidyverse, lubridate, writexl, spdep
#   - Utils files: parameters_project.R, 00_common_utils.R, 80_utils_agronomic_support.R, 80_utils_cane_supply.R
#   - External data: Field boundaries (pivot.geojson), harvest data (harvest.xlsx)
#   - Input data: Weekly mosaic TIFFs (Script 40 output)
#   - Data directories: weekly_mosaic/, output/ (created if missing)
#
# NOTES:
#   - KPIs calculated: Field Uniformity (CV), Area Change (pixel %), TCH Forecast,
#     Growth/Decline Trend, Weed Presence (spatial autocorrelation), Gap Filling %
#   - Client-aware: Conditional sourcing based on client_config$script_90_compatible
#   - Exports: 21-column output with field-level metrics, phase, status, alerts
#   - Supports 4-week and 8-week trend analysis from historical RDS cache
#   - Critical dependency for Scripts 90/91 (reporting/dashboards)
#   - Uses Moran's I for spatial clustering detection (weed/stress patterns)
#
#
# ============================================================================
#    [✓] Extract planting dates per field
#    [✓] Calculate Age_week = difftime(report_date, planting_date, units="weeks")
#
# COMMAND-LINE USAGE:
#   Option 1: Rscript 80_calculate_kpis.R 2026-01-14 angata
#     Arguments: [end_date] [project_dir]
#
#   Option 2: Rscript 80_calculate_kpis.R 2026-01-14 angata 7
#     Arguments: [end_date] [project_dir] [offset_days]
#
#  & "C:\Program Files\R\R-4.4.3\bin\x64\Rscript.exe" r_app/80_calculate_kpis.R 2026-01-12 angata 7
#
# Usage in run_full_pipeline.R:
#   source("r_app/80_calculate_kpis.R")
#   main()

# ============================================================================
# NEXT INTEGRATIONS (See Linear issues for detailed requirements)
# ============================================================================
# 1. [✓] Load imminent_prob from script 31 (week_WW_YYYY.csv)
# 2. [✓] Load planting_date from harvest.xlsx for field-specific age calculation
# 3. [ ] Improve Status_trigger logic to use actual imminent_prob values
# ============================================================================

# ============================================================================
# CONFIGURATION
# ============================================================================
FOUR_WEEK_TREND_STRONG_GROWTH_MIN <- 0.5
FOUR_WEEK_TREND_GROWTH_MIN <- 0.1
FOUR_WEEK_TREND_GROWTH_MAX <- 0.5
FOUR_WEEK_TREND_NO_GROWTH_RANGE <- 0.1
FOUR_WEEK_TREND_DECLINE_MAX <- -0.1
FOUR_WEEK_TREND_DECLINE_MIN <- -0.5
FOUR_WEEK_TREND_STRONG_DECLINE_MAX <- -0.5

# CV TREND THRESHOLDS
CV_TREND_THRESHOLD_SIGNIFICANT <- 0.05

# CV_TREND_LONG_TERM (8-WEEK SLOPE) INTERPRETATION THRESHOLDS
# Interpretation: Slope of CV over 8 weeks indicates field uniformity trend
#   Negative slope = CV decreasing = field becoming MORE uniform = GOOD
#   Positive slope = CV increasing = field becoming MORE patchy = BAD
#   Near zero = Homogenous growth (all crops progressing equally)
CV_SLOPE_STRONG_IMPROVEMENT_MIN <- -0.03   # CV decreasing rapidly (>3% drop over 8 weeks)
CV_SLOPE_IMPROVEMENT_MIN <- -0.02         # CV decreasing (2-3% drop over 8 weeks)
CV_SLOPE_IMPROVEMENT_MAX <- -0.01         # Gradual improvement (1-2% drop over 8 weeks)
CV_SLOPE_HOMOGENOUS_MIN <- -0.01          # Essentially stable (small natural variation)
CV_SLOPE_HOMOGENOUS_MAX <- 0.01           # No change in uniformity (within ±1% over 8 weeks)
CV_SLOPE_PATCHINESS_MIN <- 0.01           # Minor divergence (1-2% increase over 8 weeks)
CV_SLOPE_PATCHINESS_MAX <- 0.02           # Growing patchiness (2-3% increase over 8 weeks)
CV_SLOPE_SEVERE_MIN <- 0.02               # Severe fragmentation (>3% increase over 8 weeks)


# PERCENTILE CALCULATIONS
CI_PERCENTILE_LOW <- 0.10
CI_PERCENTILE_HIGH <- 0.90

# GERMINATION THRESHOLD (for germination_progress calculation)
GERMINATION_CI_THRESHOLD <- 2.0

# PLANTING DATE & AGE CONFIGURATION
# Load from harvest.xlsx (scripts 22 & 23) - no fallback to uniform dates

# HISTORICAL DATA LOOKBACK
WEEKS_FOR_FOUR_WEEK_TREND <- 4
WEEKS_FOR_CV_TREND_SHORT <- 2
WEEKS_FOR_CV_TREND_LONG <- 8

# ============================================================================
# 1. Load required libraries
# ============================================================================

suppressPackageStartupMessages({
  # File path handling
  library(here)      # For relative path resolution (platform-independent file paths)

  # Spatial data handling
  library(sf)        # For reading/manipulating field boundaries (GeoJSON)
  library(terra)     # For raster operations (reading mosaic TIFFs)

  # Data manipulation
  library(dplyr)     # For data wrangling (filter, mutate, group_by, summarize)
  library(tidyr)     # For data reshaping (pivot_longer, pivot_wider, gather)
  library(lubridate) # For date/time operations (week extraction, date arithmetic)

  # File I/O
  library(readr)     # For reading CSV files (harvest predictions from Python)
  library(readxl)    # For reading harvest.xlsx (harvest dates for field mapping)
  library(writexl)   # For writing Excel outputs (KPI summary tables)
  library(progress)  # For progress bars during field processing

  # ML models (for yield prediction KPI)
  library(caret)     # For training Random Forest with cross-validation
  library(CAST)      # For Forward Feature Selection in caret models

  # ML models (for yield prediction KPI)
  library(caret)     # For training Random Forest with cross-validation
  library(CAST)      # For Forward Feature Selection in caret models


})

# ============================================================================
# LOAD CONFIGURATION - MUST BE DONE FIRST
# ============================================================================
# Parameters must be loaded early to determine client type and paths

tryCatch({
  source(here("r_app", "parameters_project.R"))
}, error = function(e) {
  stop("Error loading parameters_project.R: ", e$message)
})

# Get client configuration from global project setup
# NOTE: This cannot be done until parameters_project.R is sourced
# We determine client_type from the current project_dir (if running in main() context)
# For now, set a placeholder that will be overridden in main()
if (exists("project_dir", envir = .GlobalEnv)) {
  temp_client_type <- get_client_type(get("project_dir", envir = .GlobalEnv))
} else {
  temp_client_type <- "cane_supply"  # Safe default
}
temp_client_config <- get_client_kpi_config(temp_client_type)

# ============================================================================
# LOAD UTILITY FUNCTIONS FROM SEPARATED MODULES
# ============================================================================
# All clients use the common utilities (shared statistical functions, reporting)
tryCatch({
  source(here("r_app", "80_utils_common.R"))
}, error = function(e) {
  stop("Error loading 80_utils_common.R: ", e$message)
})

# Load both client-specific utilities (functions will be available for both workflows)
# This avoids needing to determine client type at startup time
message("Loading client-specific utilities (80_utils_agronomic_support.R and 80_utils_cane_supply.R)...")
tryCatch({
  source(here("r_app", "80_utils_agronomic_support.R"))
}, error = function(e) {
  stop("Error loading 80_utils_agronomic_support.R: ", e$message)
})

tryCatch({
  source(here("r_app", "80_utils_cane_supply.R"))
}, error = function(e) {
  stop("Error loading 80_utils_cane_supply.R: ", e$message)
})

# ============================================================================
# PHASE AND STATUS TRIGGER DEFINITIONS
# ============================================================================

PHASE_DEFINITIONS <- data.frame(
  phase = c("Germination", "Tillering", "Grand Growth", "Maturation"),
  age_start = c(0, 4, 17, 39),
  age_end = c(6, 16, 39, 200),
  stringsAsFactors = FALSE
)

STATUS_TRIGGERS <- data.frame(
  trigger = c(
    "germination_started",
    "germination_complete",
    "stress_detected_whole_field",
    "strong_recovery",
    "growth_on_track",
    "maturation_progressing",
    "harvest_ready"
  ),
  age_min = c(0, 0, NA, NA, 4, 39, 45),
  age_max = c(6, 6, NA, NA, 39, 200, 200),
  description = c(
    "10% of field CI > 2",
    "70% of field CI >= 2",
    "CI decline > -1.5 + low CV",
    "CI increase > +1.5",
    "CI increasing consistently",
    "High CI, stable/declining",
    "Age 45+ weeks (ready to harvest)"
  ),
  stringsAsFactors = FALSE
)


main <- function() {
  # Parse command-line arguments
  args <- commandArgs(trailingOnly = TRUE)

  # end_date (arg 1)
  # Priority: 1) Command-line arg, 2) Global end_date variable (for recursive calls), 3) Global end_date_str, 4) Sys.Date()
  end_date <- if (length(args) >= 1 && !is.na(args[1])) {
    # Parse date explicitly in YYYY-MM-DD format from command line
    as.Date(args[1], format = "%Y-%m-%d")
  } else if (exists("end_date", envir = .GlobalEnv)) {
    global_date <- get("end_date", envir = .GlobalEnv)
    # Check if it's a valid Date with length > 0
    if (is.Date(global_date) && length(global_date) > 0 && !is.na(global_date)) {
      global_date
    } else if (exists("end_date_str", envir = .GlobalEnv)) {
      as.Date(get("end_date_str", envir = .GlobalEnv))
    } else {
      Sys.Date()
    }
  } else if (exists("end_date_str", envir = .GlobalEnv)) {
    as.Date(get("end_date_str", envir = .GlobalEnv))
  } else {
    Sys.Date()
  }

  # project_dir (arg 2)
  project_dir <- if (length(args) >= 2 && !is.na(args[2])) {
    as.character(args[2])
  } else if (exists("project_dir", envir = .GlobalEnv)) {
    get("project_dir", envir = .GlobalEnv)
  } else {
    "angata"
  }

  # offset (arg 3) - for backward compatibility with old 09
  offset <- if (length(args) >= 3 && !is.na(args[3])) {
    as.numeric(args[3])
  } else {
    7
  }

  # Validate end_date is a proper Date object
  if (is.null(end_date) || length(end_date) == 0 || !inherits(end_date, "Date")) {
    stop("ERROR: end_date is not valid. Got: ", class(end_date), " with length ", length(end_date))
  }

  assign("project_dir", project_dir, envir = .GlobalEnv)
  assign("end_date_str", format(end_date, "%Y-%m-%d"), envir = .GlobalEnv)

  message("\n", strrep("=", 70))
  message("80_CALCULATE_KPIs.R - CONSOLIDATED KPI CALCULATION")
  message(strrep("=", 70))
  message("Date:", format(end_date, "%Y-%m-%d"))
  message("Project:", project_dir)
  message("Mode: Conditional KPI calculation based on client type")
  message("")

  # Load configuration and utilities
  # source(here("r_app", "crop_messaging_utils.R"))

  tryCatch({
    source(here("r_app", "parameters_project.R"))
  }, error = function(e) {
    stop("Error loading parameters_project.R: ", e$message)
  })

  # Initialize project directories and configuration
  setup <- setup_project_directories(project_dir)

  # DETERMINE CLIENT TYPE AND KPI CONFIGURATION
  client_type <- get_client_type(project_dir)
  client_config <- get_client_kpi_config(client_type)

  # Assign to global environment so utilities and downstream scripts can access it
  assign("client_config", client_config, envir = .GlobalEnv)

  message("Client Type:", client_type)
  message("KPI Calculations:", paste(client_config$kpi_calculations, collapse = ", "))
  message("Output Formats:", paste(client_config$outputs, collapse = ", "))

  # Use centralized paths from setup object (no need for file.path calls)
  weekly_mosaic <- setup$weekly_mosaic_dir
  daily_vals_dir <- setup$daily_ci_vals_dir
  reports_dir <- setup$kpi_reports_dir
  data_dir <- setup$data_dir


  tryCatch({
    source(here("r_app", "30_growth_model_utils.R"))
  }, error = function(e) {
    warning("30_growth_model_utils.R not found - yield prediction KPI will use placeholder data")
  })

  # CONDITIONAL EXECUTION BASED ON CLIENT TYPE
  # ============================================

  if (client_config$script_90_compatible && "kpi_summary_tables" %in% client_config$outputs) {
    #  WORKFLOW: Run 6 farm-level KPIs for Script 90 compatibility
    message("\n", strrep("=", 70))
    message("WORKFLOW: CALCULATING 6 FARM-LEVEL KPIs (Script 90 compatible)")
    message(strrep("=", 70))

    # Prepare outputs and inputs for KPI calculation (already created by setup_project_directories)
    reports_dir_kpi <- file.path(setup$reports_dir, "kpis")
    cumulative_CI_vals_dir <- setup$cumulative_CI_vals_dir

    # Load field boundaries for workflow (use data_dir from setup)
    message("\nLoading field boundaries for KPI calculation...")
    tryCatch({
      boundaries_result <- load_field_boundaries(setup$data_dir)

      if (is.list(boundaries_result) && "field_boundaries_sf" %in% names(boundaries_result)) {
        field_boundaries_sf <- boundaries_result$field_boundaries_sf
      } else {
        field_boundaries_sf <- boundaries_result
      }

      if (nrow(field_boundaries_sf) == 0) {
        stop("No fields loaded from boundaries")
      }

      message(paste("  ✓ Loaded", nrow(field_boundaries_sf), "fields"))
    }, error = function(e) {
      stop("ERROR loading field boundaries: ", e$message)
    })

    # Load harvesting data for yield prediction (using common helper function)
    harvesting_data <- load_harvest_data(setup$data_dir)

    # Extract current week/year from end_date
    current_week <- as.numeric(format(end_date, "%V"))
    current_year <- as.numeric(format(end_date, "%G"))

    # Call with correct signature
    kpi_results <- calculate_all_field_analysis_agronomic_support(
      field_boundaries_sf = field_boundaries_sf,
      current_week = current_week,
      current_year = current_year,
      current_mosaic_dir = setup$weekly_mosaic_dir,
      previous_mosaic_dir = NULL,
      ci_rds_path = NULL,
      harvesting_data = harvesting_data,
      output_dir = reports_dir_kpi,
      data_dir = setup$data_dir,
      project_dir = project_dir
    )

    # Extract results
    field_analysis_df <- kpi_results$field_analysis_df
    export_paths <- kpi_results$export_paths

    cat("\n===  KPI CALCULATION COMPLETE ===\n")
    cat("Summary tables saved for Script 90 integration\n")
    cat("Output directory:", reports_dir_kpi, "\n\n")

  } else if (client_config$script_91_compatible && "field_analysis_excel" %in% client_config$outputs) {
    # CANE_SUPPLY WORKFLOW: Run per-field analysis with phase assignment
    message("\n", strrep("=", 70))
    message("CANE_SUPPLY WORKFLOW: PER-FIELD ANALYSIS (Script 91 compatible)")
    message(strrep("=", 70))

   # Define variables needed for workflow functions
   data_dir <- setup$data_dir

   # Load field boundaries for workflow (use data_dir from setup)
    message("\nLoading field boundaries for KPI calculation...")
    tryCatch({
      boundaries_result <- load_field_boundaries(setup$data_dir)

      if (is.list(boundaries_result) && "field_boundaries_sf" %in% names(boundaries_result)) {
        field_boundaries_sf <- boundaries_result$field_boundaries_sf
      } else {
        field_boundaries_sf <- boundaries_result
      }

      if (nrow(field_boundaries_sf) == 0) {
        stop("No fields loaded from boundaries")
      }

      message(paste("  ✓ Loaded", nrow(field_boundaries_sf), "fields"))
    }, error = function(e) {
      stop("ERROR loading field boundaries: ", e$message)
    })

    # Load harvesting data for yield prediction (using common helper function)
    harvesting_data <- load_harvest_data(setup$data_dir)

    # Extract current week/year from end_date
    current_week <- as.numeric(format(end_date, "%V"))
    current_year <- as.numeric(format(end_date, "%G"))


   # Call the main orchestrator function from kpi_calculation_utils.R
    workflow_results <- calculate_field_analysis_cane_supply(
      setup = setup,
      client_config = client_config,
      end_date = end_date,
      project_dir = project_dir,
      weekly_mosaic = weekly_mosaic,
      daily_vals_dir = daily_vals_dir,
      field_boundaries_sf = field_boundaries_sf,
      data_dir = data_dir
    )

    # Extract results
    field_analysis_df <- workflow_results$field_analysis_df
    farm_kpi_results <- workflow_results$farm_kpi_results
    export_paths <- workflow_results$export_paths

  } else {
    # Unknown client type - log warning and exit
    warning(sprintf("Unknown client type: %s - no workflow matched", client_type))
    cat("\n⚠️  Warning: Client type '", client_type, "' does not match any known workflow\n", sep = "")
    cat("Expected: 'agronomic_support' (aura) or 'cane_supply' (angata, etc.)\n")
    cat("Check CLIENT_TYPE_MAP in parameters_project.R\n\n")
  }
}

if (sys.nframe() == 0) {
  main()
}