# SmartCane/r_app/80_calculate_kpis.R

# ============================================================================
# SCRIPT 80: Key Performance Indicator (KPI) Calculation
# ============================================================================
# PURPOSE:
# Calculate per-field and farm-level KPIs from weekly CI mosaics. Computes
# field uniformity, growth trends (4-week and 8-week), area change detection,
# TCH forecasts, stress identification, and weed presence. Generates
# comprehensive Excel/CSV/RDS exports for dashboards and stakeholder reporting.
#
# INPUT DATA:
# - Source 1: laravel_app/storage/app/{project}/weekly_mosaic/{FIELD}/week_*.tif
# - Source 2: Field boundaries (pivot.geojson) and harvest data (harvest.xlsx)
# - Source 3: Historical field stats (RDS from previous weeks)
#
# OUTPUT DATA:
# - Destination: laravel_app/storage/app/{project}/output/
# - Format: Excel (.xlsx), CSV (.csv), RDS (.rds)
# - Files: {project}_field_analysis_week{WW}_{YYYY}.xlsx + metadata
#
# USAGE:
# Rscript 80_calculate_kpis.R [end_date] [project_dir] [offset_days]
#
# Example (Windows PowerShell):
# & "C:\Program Files\R\R-4.4.3\bin\x64\Rscript.exe" r_app/80_calculate_kpis.R 2026-01-12 angata 7
#
# PARAMETERS:
# - end_date: Report end date, YYYY-MM-DD (default: today via Sys.Date())
# - project_dir: Project name (character) - angata, chemba, xinavane, esa, simba (default: angata)
# - offset_days: Look-back offset in days (numeric, default 7; kept for backward compatibility)
#
# CLIENT TYPES:
# - cane_supply (ANGATA): Yes - uses 80_utils_cane_supply.R (placeholder)
# - agronomic_support (AURA): Yes - uses 80_utils_agronomic_support.R (6 KPI funcs)
#
# DEPENDENCIES:
# - Packages: terra, sf, tidyverse, lubridate, writexl, spdep
# - Utils files: parameters_project.R, 00_common_utils.R, 80_utils_agronomic_support.R, 80_utils_cane_supply.R
# - External data: Field boundaries (pivot.geojson), harvest data (harvest.xlsx)
# - Input data: Weekly mosaic TIFFs (Script 40 output)
# - Data directories: weekly_mosaic/, output/ (created if missing)
#
# NOTES:
# - KPIs calculated: Field Uniformity (CV), Area Change (pixel %), TCH Forecast,
# Growth/Decline Trend, Weed Presence (spatial autocorrelation), Gap Filling %
# - Client-aware: Conditional sourcing based on client_config$script_90_compatible
# - Exports: 22-column output with field-level metrics, phase, status, alerts
# - Supports 4-week and 8-week trend analysis from historical RDS cache
# - Critical dependency for Scripts 90/91 (reporting/dashboards)
# - Uses Moran's I for spatial clustering detection (weed/stress patterns)
#
#
# ============================================================================
# [✓] Extract planting dates per field
# [✓] Calculate Age_week = difftime(report_date, planting_date, units="weeks")
#
# COMMAND-LINE USAGE:
# Option 1: Rscript 80_calculate_kpis.R 2026-01-14 angata
# Arguments: [end_date] [project_dir]
#
# Option 2: Rscript 80_calculate_kpis.R 2026-01-14 angata 7
# Arguments: [end_date] [project_dir] [offset_days]
#
# & "C:\Program Files\R\R-4.4.3\bin\x64\Rscript.exe" r_app/80_calculate_kpis.R 2026-01-12 angata 7
#
# Usage in run_full_pipeline.R:
# source("r_app/80_calculate_kpis.R")
# main()
# ============================================================================
# NEXT INTEGRATIONS (See Linear issues for detailed requirements)
# ============================================================================
# 1. [✓] Load imminent_prob from script 31 (week_WW_YYYY.csv)
# 2. [✓] Load planting_date from harvest.xlsx for field-specific age calculation
# 3. [ ] Improve Status_trigger logic to use actual imminent_prob values
# ============================================================================
# ============================================================================
# CONFIGURATION
# ============================================================================
# FOUR-WEEK CI TREND THRESHOLDS
# Band edges for classifying the 4-week CI slope (names imply: strong growth /
# growth / no growth / decline / strong decline). The classification itself is
# performed by the trend utilities, not in this file — confirm usage there.
FOUR_WEEK_TREND_STRONG_GROWTH_MIN <- 0.5
FOUR_WEEK_TREND_GROWTH_MIN <- 0.1
FOUR_WEEK_TREND_GROWTH_MAX <- 0.5
FOUR_WEEK_TREND_NO_GROWTH_RANGE <- 0.1
FOUR_WEEK_TREND_DECLINE_MAX <- -0.1
FOUR_WEEK_TREND_DECLINE_MIN <- -0.5
FOUR_WEEK_TREND_STRONG_DECLINE_MAX <- -0.5
# CV TREND THRESHOLDS
# Presumably the minimum |CV change| treated as a significant short-term
# uniformity shift — consumer code is not in this file; confirm in utils.
CV_TREND_THRESHOLD_SIGNIFICANT <- 0.05
# CV_TREND_LONG_TERM (8-WEEK SLOPE) INTERPRETATION THRESHOLDS
# Interpretation: Slope of CV over 8 weeks indicates field uniformity trend
# Negative slope = CV decreasing = field becoming MORE uniform = GOOD
# Positive slope = CV increasing = field becoming MORE patchy = BAD
# Near zero = Homogeneous growth (all crops progressing equally)
# NOTE(review): the 3-class CV_Trend_Long_Term_Category computed in main()
# hard-codes +/-0.01 rather than referencing these constants — consider
# consolidating so the thresholds cannot drift apart.
CV_SLOPE_STRONG_IMPROVEMENT_MIN <- -0.03 # CV decreasing rapidly (>3% drop over 8 weeks)
CV_SLOPE_IMPROVEMENT_MIN <- -0.02 # CV decreasing (2-3% drop over 8 weeks)
CV_SLOPE_IMPROVEMENT_MAX <- -0.01 # Gradual improvement (1-2% drop over 8 weeks)
CV_SLOPE_HOMOGENOUS_MIN <- -0.01 # Essentially stable (small natural variation)
CV_SLOPE_HOMOGENOUS_MAX <- 0.01 # No change in uniformity (within ±1% over 8 weeks)
CV_SLOPE_PATCHINESS_MIN <- 0.01 # Minor divergence (1-2% increase over 8 weeks)
CV_SLOPE_PATCHINESS_MAX <- 0.02 # Growing patchiness (2-3% increase over 8 weeks)
CV_SLOPE_SEVERE_MIN <- 0.02 # Severe fragmentation (>2% increase over 8 weeks)
# PERCENTILE CALCULATIONS
# Quantile levels — presumably feed the CI_Percentiles output column
# (10th / 90th percentile of per-field CI); confirm in the stats utilities.
CI_PERCENTILE_LOW <- 0.10
CI_PERCENTILE_HIGH <- 0.90
# GERMINATION THRESHOLD (for germination_progress calculation)
# A pixel with CI >= this value counts as germinated (cf. Pct_pixels_CI_gte_2
# used for the Germination_progress binning in main()).
GERMINATION_CI_THRESHOLD <- 2.0
# PLANTING DATE & AGE CONFIGURATION
# Load from harvest.xlsx (scripts 22 & 23) - no fallback to uniform dates
# HISTORICAL DATA LOOKBACK
# Weeks of cached history per trend window; main() loads
# max(WEEKS_FOR_FOUR_WEEK_TREND, WEEKS_FOR_CV_TREND_LONG) = 8 weeks.
WEEKS_FOR_FOUR_WEEK_TREND <- 4
WEEKS_FOR_CV_TREND_SHORT <- 2
WEEKS_FOR_CV_TREND_LONG <- 8
# ============================================================================
# 1. Load required libraries
# ============================================================================
# NOTE(review): attach order affects name masking (packages attached later win,
# e.g. dplyr after terra/sf) — keep the order stable. torch is optional: its
# load is wrapped in tryCatch so a missing install degrades gracefully instead
# of aborting the whole pipeline.
suppressPackageStartupMessages({
  # File path handling
  library(here)      # For relative path resolution (platform-independent file paths)
  # Spatial data handling
  library(sf)        # For reading/manipulating field boundaries (GeoJSON)
  library(terra)     # For raster operations (reading mosaic TIFFs)
  # Data manipulation
  library(dplyr)     # For data wrangling (filter, mutate, group_by, summarize)
  library(tidyr)     # For data reshaping (pivot_longer, pivot_wider, gather)
  library(lubridate) # For date/time operations (week extraction, date arithmetic)
  # File I/O
  library(readr)     # For reading CSV files (harvest predictions from Python)
  library(readxl)    # For reading harvest.xlsx (harvest dates for field mapping)
  library(writexl)   # For writing Excel outputs (KPI summary tables)
  library(progress)  # For progress bars during field processing
  # ML/Analysis (optional - only for harvest model inference)
  tryCatch({
    library(torch)   # For PyTorch model inference (harvest readiness prediction)
  }, error = function(e) {
    message("Note: torch package not available - harvest model inference will be skipped")
  })
})
# ============================================================================
# LOAD CONFIGURATION - MUST BE DONE FIRST
# ============================================================================
# parameters_project.R must be sourced before anything else: it presumably
# defines setup_project_directories(), get_client_type() and
# get_client_kpi_config() used below and in main() — confirm, those
# definitions are not visible in this file.
tryCatch({
  source(here("r_app", "parameters_project.R"))
}, error = function(e) {
  stop("Error loading parameters_project.R: ", e$message)
})
# Get client configuration from global project setup
# NOTE: This cannot be done until parameters_project.R is sourced
# We determine client_type from the current project_dir (if running in main() context)
# For now, set a placeholder that will be overridden in main() — main()
# recomputes client_type/client_config from the actual CLI project_dir and
# assigns the result to the global environment.
if (exists("project_dir", envir = .GlobalEnv)) {
  temp_client_type <- get_client_type(get("project_dir", envir = .GlobalEnv))
} else {
  temp_client_type <- "cane_supply" # Safe default
}
temp_client_config <- get_client_kpi_config(temp_client_type)
# ============================================================================
# LOAD UTILITY FUNCTIONS FROM SEPARATED MODULES
# ============================================================================
# All clients use the common utilities (shared statistical functions, reporting)
tryCatch({
  source(here("r_app", "80_utils_common.R"))
}, error = function(e) {
  stop("Error loading 80_utils_common.R: ", e$message)
})
# Load BOTH client-specific utility modules unconditionally so the functions
# for either workflow exist before main() runs. This avoids needing to
# determine client type at startup time; main() later selects the workflow
# via client_config$script_90_compatible / script_91_compatible.
message("Loading client-specific utilities (80_utils_agronomic_support.R and 80_utils_cane_supply.R)...")
tryCatch({
  source(here("r_app", "80_utils_agronomic_support.R"))
}, error = function(e) {
  stop("Error loading 80_utils_agronomic_support.R: ", e$message)
})
tryCatch({
  source(here("r_app", "80_utils_cane_supply.R"))
}, error = function(e) {
  stop("Error loading 80_utils_cane_supply.R: ", e$message)
})
# ============================================================================
# PHASE AND STATUS TRIGGER DEFINITIONS
# ============================================================================
# Crop growth-phase lookup table: one row per phase with its age window in
# weeks since planting.
# NOTE(review): Germination (0-6) and Tillering (4-16) overlap by two weeks,
# and the inline Phase assignment in main() uses a different Germination
# cut-off (age < 4) — confirm which definition is authoritative.
PHASE_DEFINITIONS <- local({
  phase_names <- c("Germination", "Tillering", "Grand Growth", "Maturation")
  week_starts <- c(0, 4, 17, 39)
  week_ends <- c(6, 16, 39, 200)
  data.frame(
    phase = phase_names,
    age_start = week_starts,
    age_end = week_ends,
    stringsAsFactors = FALSE
  )
})
# Status-trigger reference table: each row names a trigger, the crop-age
# window (weeks) in which it can fire (NA = age-independent), and a short
# human-readable description of the firing condition.
# NOTE(review): the descriptions are documentation only — the actual trigger
# evaluation happens elsewhere (see the "Improve Status_trigger logic" TODO
# in the file header).
STATUS_TRIGGERS <- local({
  trigger_names <- c(
    "germination_started",
    "germination_complete",
    "stress_detected_whole_field",
    "strong_recovery",
    "growth_on_track",
    "maturation_progressing",
    "harvest_ready"
  )
  trigger_descriptions <- c(
    "10% of field CI > 2",
    "70% of field CI >= 2",
    "CI decline > -1.5 + low CV",
    "CI increase > +1.5",
    "CI increasing consistently",
    "High CI, stable/declining",
    "Age 45+ weeks (ready to harvest)"
  )
  data.frame(
    trigger = trigger_names,
    age_min = c(0, 0, NA, NA, 4, 39, 45),
    age_max = c(6, 6, NA, NA, 39, 200, 200),
    description = trigger_descriptions,
    stringsAsFactors = FALSE
  )
})
main <- function() {
# Parse command-line arguments
args <- commandArgs(trailingOnly = TRUE)
# end_date (arg 1)
# Priority: 1) Command-line arg, 2) Global end_date variable (for recursive calls), 3) Global end_date_str, 4) Sys.Date()
end_date <- if (length(args) >= 1 && !is.na(args[1])) {
# Parse date explicitly in YYYY-MM-DD format from command line
as.Date(args[1], format = "%Y-%m-%d")
} else if (exists("end_date", envir = .GlobalEnv)) {
global_date <- get("end_date", envir = .GlobalEnv)
# Check if it's a valid Date with length > 0
if (is.Date(global_date) && length(global_date) > 0 && !is.na(global_date)) {
global_date
} else if (exists("end_date_str", envir = .GlobalEnv)) {
as.Date(get("end_date_str", envir = .GlobalEnv))
} else {
Sys.Date()
}
} else if (exists("end_date_str", envir = .GlobalEnv)) {
as.Date(get("end_date_str", envir = .GlobalEnv))
} else {
Sys.Date()
}
# project_dir (arg 2)
project_dir <- if (length(args) >= 2 && !is.na(args[2])) {
as.character(args[2])
} else if (exists("project_dir", envir = .GlobalEnv)) {
get("project_dir", envir = .GlobalEnv)
} else {
"angata"
}
# offset (arg 3) - for backward compatibility with old 09
offset <- if (length(args) >= 3 && !is.na(args[3])) {
as.numeric(args[3])
} else {
7
}
# Validate end_date is a proper Date object
if (is.null(end_date) || length(end_date) == 0 || !inherits(end_date, "Date")) {
stop("ERROR: end_date is not valid. Got: ", class(end_date), " with length ", length(end_date))
}
assign("project_dir", project_dir, envir = .GlobalEnv)
assign("end_date_str", format(end_date, "%Y-%m-%d"), envir = .GlobalEnv)
message("\n", strrep("=", 70))
message("80_CALCULATE_KPIs.R - CONSOLIDATED KPI CALCULATION")
message(strrep("=", 70))
message("Date:", format(end_date, "%Y-%m-%d"))
message("Project:", project_dir)
message("Mode: Conditional KPI calculation based on client type")
message("")
# Load configuration and utilities
# source(here("r_app", "crop_messaging_utils.R"))
tryCatch({
source(here("r_app", "parameters_project.R"))
}, error = function(e) {
stop("Error loading parameters_project.R: ", e$message)
})
# Initialize project directories and configuration
setup <- setup_project_directories(project_dir)
# DETERMINE CLIENT TYPE AND KPI CONFIGURATION
client_type <- get_client_type(project_dir)
client_config <- get_client_kpi_config(client_type)
# Assign to global environment so utilities and downstream scripts can access it
assign("client_config", client_config, envir = .GlobalEnv)
message("Client Type:", client_type)
message("KPI Calculations:", paste(client_config$kpi_calculations, collapse = ", "))
message("Output Formats:", paste(client_config$outputs, collapse = ", "))
# Use centralized paths from setup object (no need for file.path calls)
weekly_mosaic <- setup$weekly_mosaic_dir
daily_vals_dir <- setup$daily_ci_vals_dir
tryCatch({
source(here("r_app", "30_growth_model_utils.R"))
}, error = function(e) {
warning("30_growth_model_utils.R not found - yield prediction KPI will use placeholder data")
})
# CONDITIONAL EXECUTION BASED ON CLIENT TYPE
# ============================================
if (client_config$script_90_compatible && "kpi_summary_tables" %in% client_config$outputs) {
# AURA WORKFLOW: Run 6 farm-level KPIs for Script 90 compatibility
message("\n", strrep("=", 70))
message("AURA WORKFLOW: CALCULATING 6 FARM-LEVEL KPIs (Script 90 compatible)")
message(strrep("=", 70))
# Prepare inputs for KPI calculation (already created by setup_project_directories)
reports_dir_kpi <- setup$kpi_reports_dir
cumulative_CI_vals_dir <- setup$cumulative_CI_vals_dir
# Load field boundaries for AURA workflow (use data_dir from setup)
message("\nLoading field boundaries for AURA KPI calculation...")
tryCatch({
boundaries_result <- load_field_boundaries(setup$data_dir)
if (is.list(boundaries_result) && "field_boundaries_sf" %in% names(boundaries_result)) {
field_boundaries_sf <- boundaries_result$field_boundaries_sf
} else {
field_boundaries_sf <- boundaries_result
}
if (nrow(field_boundaries_sf) == 0) {
stop("No fields loaded from boundaries")
}
message(paste(" ✓ Loaded", nrow(field_boundaries_sf), "fields"))
}, error = function(e) {
stop("ERROR loading field boundaries: ", e$message)
})
# Load harvesting data
if (!exists("harvesting_data")) {
warning("harvesting_data not loaded. TCH KPI will use placeholder values.")
harvesting_data <- data.frame(field = character(), year = numeric(), tonnage_ha = numeric())
}
# Extract current week/year from end_date
current_week <- as.numeric(format(end_date, "%V"))
current_year <- as.numeric(format(end_date, "%G"))
# Call with correct signature
kpi_results <- calculate_all_kpis(
report_date = end_date,
output_dir = reports_dir_kpi,
field_boundaries_sf = field_boundaries_sf,
harvesting_data = harvesting_data,
cumulative_CI_vals_dir = cumulative_CI_vals_dir,
weekly_CI_mosaic = setup$weekly_mosaic_dir,
reports_dir = reports_dir_kpi,
project_dir = project_dir
)
cat("\n=== AURA KPI CALCULATION COMPLETE ===\n")
cat("Summary tables saved for Script 90 integration\n")
cat("Output directory:", reports_dir_kpi, "\n\n")
} else if (client_config$script_91_compatible && "field_analysis_excel" %in% client_config$outputs) {
# CANE_SUPPLY WORKFLOW: Run per-field analysis with phase assignment
message("\n", strrep("=", 70))
message("CANE_SUPPLY WORKFLOW: PER-FIELD ANALYSIS (Script 91 compatible)")
message(strrep("=", 70))
# Set reports_dir for CANE_SUPPLY workflow (used by export functions)
reports_dir <- setup$kpi_reports_dir
data_dir <- setup$data_dir
# Continue with existing per-field analysis code below
message("\n", strrep("-", 70))
message("PHASE 1: PER-FIELD WEEKLY ANALYSIS ")
message(strrep("-", 70))
weeks <- calculate_week_numbers(end_date)
current_week <- weeks$current_week
current_year <- weeks$current_year
previous_week <- weeks$previous_week
previous_year <- weeks$previous_year
message(paste("Week:", current_week, "/ Year (ISO 8601):", current_year))
# Find per-field weekly mosaics
message("Finding per-field weekly mosaics...")
if (!dir.exists(weekly_mosaic)) {
stop(paste("ERROR: weekly_mosaic directory not found:", weekly_mosaic,
"\nScript 40 (mosaic creation) must be run first."))
}
field_dirs <- list.dirs(weekly_mosaic, full.names = FALSE, recursive = FALSE)
field_dirs <- field_dirs[field_dirs != ""]
if (length(field_dirs) == 0) {
stop(paste("ERROR: No field subdirectories found in:", weekly_mosaic,
"\nScript 40 must create weekly_mosaic/{FIELD}/ structure."))
}
# Verify we have mosaics for this week
single_file_pattern <- sprintf("week_%02d_%d\\.tif", current_week, current_year)
per_field_files <- c()
for (field in field_dirs) {
field_mosaic_dir <- file.path(weekly_mosaic, field)
files <- list.files(field_mosaic_dir, pattern = single_file_pattern, full.names = TRUE)
if (length(files) > 0) {
per_field_files <- c(per_field_files, files)
}
}
if (length(per_field_files) == 0) {
stop(paste("ERROR: No mosaics found for week", current_week, "year", current_year,
"\nExpected pattern:", single_file_pattern,
"\nChecked:", weekly_mosaic))
}
message(paste(" ✓ Found", length(per_field_files), "per-field weekly mosaics"))
mosaic_mode <- "per-field"
mosaic_dir <- weekly_mosaic
# Load field boundaries
tryCatch({
boundaries_result <- load_field_boundaries(data_dir)
if (is.list(boundaries_result) && "field_boundaries_sf" %in% names(boundaries_result)) {
field_boundaries_sf <- boundaries_result$field_boundaries_sf
} else {
field_boundaries_sf <- boundaries_result
}
if (nrow(field_boundaries_sf) == 0) {
stop("No fields loaded from boundaries")
}
message(paste(" Loaded", nrow(field_boundaries_sf), "fields"))
}, error = function(e) {
stop("ERROR loading field boundaries: ", e$message)
})
message("Loading historical field data for trend calculations...")
# Load up to 8 weeks (max of 4-week and 8-week trend requirements)
# Function gracefully handles missing weeks and loads whatever exists
num_weeks_to_load <- max(WEEKS_FOR_FOUR_WEEK_TREND, WEEKS_FOR_CV_TREND_LONG) # Always 8
message(paste(" Attempting to load up to", num_weeks_to_load, "weeks of historical data..."))
# Only auto-generate on first call (not in recursive calls from within load_historical_field_data)
allow_auto_gen <- !exists("_INSIDE_AUTO_GENERATE", envir = .GlobalEnv)
historical_data <- load_historical_field_data(project_dir, current_week, current_year, reports_dir,
num_weeks = num_weeks_to_load,
auto_generate = allow_auto_gen,
field_boundaries_sf = field_boundaries_sf,
daily_vals_dir = daily_vals_dir)
# Load harvest.xlsx for planting dates (season_start)
message("\nLoading harvest data from harvest.xlsx for planting dates...")
harvest_file_path <- file.path(data_dir, "harvest.xlsx")
harvesting_data <- tryCatch({
if (file.exists(harvest_file_path)) {
harvest_raw <- readxl::read_excel(harvest_file_path)
harvest_raw$season_start <- as.Date(harvest_raw$season_start)
harvest_raw$season_end <- as.Date(harvest_raw$season_end)
message(paste(" ✓ Loaded harvest data:", nrow(harvest_raw), "rows"))
harvest_raw
} else {
message(paste(" WARNING: harvest.xlsx not found at", harvest_file_path))
NULL
}
}, error = function(e) {
message(paste(" ERROR loading harvest.xlsx:", e$message))
NULL
})
planting_dates <- extract_planting_dates(harvesting_data, field_boundaries_sf)
# Validate planting_dates
if (is.null(planting_dates) || nrow(planting_dates) == 0) {
message("WARNING: No planting dates available. Using NA for all fields.")
planting_dates <- data.frame(
field_id = field_boundaries_sf$field,
planting_date = rep(as.Date(NA), nrow(field_boundaries_sf)),
stringsAsFactors = FALSE
)
}
# Build per-field configuration
message("\nPreparing mosaic configuration for statistics calculation...")
message(" ✓ Using per-field mosaic architecture (1 TIFF per field)")
# Per-field mode: each field has its own TIFF in weekly_mosaic/{FIELD}/week_*.tif
field_grid <- list(
mosaic_dir = mosaic_dir,
mode = "per-field"
)
message("\nUsing modular RDS-based approach for weekly statistics...")
# Load/calculate CURRENT week stats (from tiles or RDS cache)
message("\n1. Loading/calculating CURRENT week statistics (week", current_week, ")...")
current_stats <- load_or_calculate_weekly_stats(
week_num = current_week,
year = current_year,
project_dir = project_dir,
field_boundaries_sf = field_boundaries_sf,
mosaic_dir = field_grid$mosaic_dir,
reports_dir = reports_dir,
report_date = end_date
)
message(paste(" ✓ Loaded/calculated stats for", nrow(current_stats), "fields in current week"))
# Load/calculate PREVIOUS week stats (from RDS cache or tiles)
message("\n2. Loading/calculating PREVIOUS week statistics (week", previous_week, ")...")
# Calculate report date for previous week (7 days before current)
prev_report_date <- end_date - 7
prev_stats <- load_or_calculate_weekly_stats(
week_num = previous_week,
year = previous_year,
project_dir = project_dir,
field_boundaries_sf = field_boundaries_sf,
mosaic_dir = field_grid$mosaic_dir,
reports_dir = reports_dir,
report_date = prev_report_date
)
message(paste(" ✓ Loaded/calculated stats for", nrow(prev_stats), "fields in previous week"))
message(paste(" Columns in prev_stats:", paste(names(prev_stats), collapse = ", ")))
message(paste(" CV column non-NA values in prev_stats:", sum(!is.na(prev_stats$CV))))
# Apply trend calculations (requires both weeks)
message("\n3. Calculating trend columns...")
current_stats <- calculate_kpi_trends(current_stats, prev_stats,
project_dir = project_dir,
reports_dir = reports_dir,
current_week = current_week,
year = current_year)
message(paste(" ✓ Added Weekly_ci_change, CV_Trend_Short_Term, Four_week_trend, CV_Trend_Long_Term, nmr_of_weeks_analysed"))
# Load weekly harvest probabilities from script 31 (if available)
# Note: Script 31 saves to reports/kpis/field_stats/ (not field_level)
message("\n4. Loading harvest probabilities from script 31...")
harvest_prob_dir <- file.path(data_dir, "..", "reports", "kpis", "field_stats")
harvest_prob_file <- file.path(harvest_prob_dir,
sprintf("%s_harvest_imminent_week_%02d_%d.csv", project_dir, current_week, current_year))
message(paste(" Looking for:", harvest_prob_file))
imminent_prob_data <- tryCatch({
if (file.exists(harvest_prob_file)) {
prob_df <- readr::read_csv(harvest_prob_file, show_col_types = FALSE,
col_types = readr::cols(.default = readr::col_character()))
message(paste(" ✓ Loaded harvest probabilities for", nrow(prob_df), "fields"))
prob_df %>%
select(field, imminent_prob, detected_prob) %>%
rename(Field_id = field, Imminent_prob_actual = imminent_prob, Detected_prob = detected_prob)
} else {
message(paste(" INFO: Harvest probabilities not available (script 31 not run)"))
NULL
}
}, error = function(e) {
message(paste(" WARNING: Could not load harvest probabilities:", e$message))
NULL
})
# ============================================================================
# CALCULATE GAP FILLING KPI (2σ method from kpi_utils.R)
# ============================================================================
message("\nCalculating gap filling scores (2σ method)...")
# Process per-field mosaics
message(paste(" Using per-field mosaics for", length(per_field_files), "fields"))
field_boundaries_by_id <- split(field_boundaries_sf, field_boundaries_sf$field)
process_gap_for_field <- function(field_file) {
field_id <- basename(dirname(field_file))
field_bounds <- field_boundaries_by_id[[field_id]]
if (is.null(field_bounds) || nrow(field_bounds) == 0) {
return(data.frame(Field_id = field_id, gap_score = NA_real_))
}
tryCatch({
field_raster <- terra::rast(field_file)
ci_band_name <- "CI"
if (!(ci_band_name %in% names(field_raster))) {
return(data.frame(Field_id = field_id, gap_score = NA_real_))
}
field_ci_band <- field_raster[[ci_band_name]]
names(field_ci_band) <- "CI"
gap_result <- calculate_gap_filling_kpi(field_ci_band, field_bounds)
if (is.null(gap_result) || is.null(gap_result$field_results) || nrow(gap_result$field_results) == 0) {
return(data.frame(Field_id = field_id, gap_score = NA_real_))
}
gap_scores <- gap_result$field_results
gap_scores$Field_id <- gap_scores$field
gap_scores <- gap_scores[, c("Field_id", "gap_score")]
stats::aggregate(gap_score ~ Field_id, data = gap_scores, FUN = function(x) mean(x, na.rm = TRUE))
}, error = function(e) {
message(paste(" WARNING: Gap score failed for field", field_id, ":", e$message))
data.frame(Field_id = field_id, gap_score = NA_real_)
})
}
# Process fields sequentially with progress bar
message(" Processing gap scores for ", length(per_field_files), " fields...")
pb <- utils::txtProgressBar(min = 0, max = length(per_field_files), style = 3, width = 50)
results_list <- lapply(seq_along(per_field_files), function(idx) {
result <- process_gap_for_field(per_field_files[[idx]])
utils::setTxtProgressBar(pb, idx)
result
})
close(pb)
gap_scores_df <- dplyr::bind_rows(results_list)
if (!is.null(gap_scores_df) && nrow(gap_scores_df) > 0) {
gap_scores_df <- gap_scores_df %>%
dplyr::group_by(Field_id) %>%
dplyr::summarise(gap_score = mean(gap_score, na.rm = TRUE), .groups = "drop")
message(paste(" ✓ Calculated gap scores for", nrow(gap_scores_df), "fields"))
message(paste(" Gap score range:", round(min(gap_scores_df$gap_score, na.rm=TRUE), 2), "-", round(max(gap_scores_df$gap_score, na.rm=TRUE), 2), "%"))
} else {
message(" WARNING: No gap scores calculated from per-field mosaics")
gap_scores_df <- NULL
}
# ============================================================================
# Build final output dataframe with all 22 columns (including Gap_score)
# ============================================================================
message("\nBuilding final field analysis output...")
# Pre-calculate acreages with geometry validation
# This avoids geometry errors during field_analysis construction
acreage_lookup <- tryCatch({
lookup_df <- field_boundaries_sf %>%
sf::st_drop_geometry() %>%
as.data.frame() %>%
mutate(
geometry_valid = sapply(seq_len(nrow(field_boundaries_sf)), function(idx) {
tryCatch({
sf::st_is_valid(field_boundaries_sf[idx, ])
}, error = function(e) FALSE)
}),
area_ha = 0
)
# Calculate area for valid geometries
for (idx in which(lookup_df$geometry_valid)) {
tryCatch({
area_m2 <- as.numeric(sf::st_area(field_boundaries_sf[idx, ]))
lookup_df$area_ha[idx] <- area_m2 / 10000
}, error = function(e) {
lookup_df$area_ha[idx] <<- NA_real_
})
}
# Convert hectares to acres
lookup_df %>%
mutate(acreage = area_ha / 0.404686) %>%
select(field, acreage)
}, error = function(e) {
message(paste("Warning: Could not calculate acreages from geometries -", e$message))
data.frame(field = character(0), acreage = numeric(0))
})
field_analysis_df <- current_stats %>%
mutate(
# Column 2: Farm_Section (user fills)
Farm_Section = NA_character_,
# Column 3: Field_name (from GeoJSON - already have Field_id, can look up)
Field_name = Field_id,
# Column 4: Acreage (calculate from geometry)
Acreage = {
acreages_vec <- acreage_lookup$acreage[match(Field_id, acreage_lookup$field)]
if_else(is.na(acreages_vec), 0, acreages_vec)
},
# Columns 5-6: Already in current_stats (Mean_CI, Weekly_ci_change)
# Column 7: Four_week_trend (from current_stats)
# Column 8: Last_harvest_or_planting_date (from harvest.xlsx - season_start)
Last_harvest_or_planting_date = {
planting_dates$planting_date[match(Field_id, planting_dates$field_id)]
},
# Column 9: Age_week (calculated from report date and planting date)
Age_week = {
sapply(seq_len(nrow(current_stats)), function(idx) {
planting_dt <- Last_harvest_or_planting_date[idx]
if (is.na(planting_dt)) {
return(NA_real_)
}
round(as.numeric(difftime(end_date, planting_dt, units = "weeks")), 0)
})
},
# Column 10: Phase (recalculate based on updated Age_week)
Phase = {
sapply(Age_week, function(age) {
if (is.na(age)) return(NA_character_)
if (age >= 0 & age < 4) return("Germination")
if (age >= 4 & age < 17) return("Tillering")
if (age >= 17 & age < 39) return("Grand Growth")
if (age >= 39) return("Maturation")
NA_character_
})
},
# Column 11: nmr_of_weeks_analysed (already in current_stats from calculate_kpi_trends)
# Column 12: Germination_progress (calculated here from CI values)
# Bin Pct_pixels_CI_gte_2 into 10% intervals: 0-10%, 10-20%, ..., 80-90%, 90-95%, 95-100%
Germination_progress = sapply(Pct_pixels_CI_gte_2, function(pct) {
if (is.na(pct)) return(NA_character_)
if (pct >= 95) return("95-100%")
else if (pct >= 90) return("90-95%")
else if (pct >= 80) return("80-90%")
else if (pct >= 70) return("70-80%")
else if (pct >= 60) return("60-70%")
else if (pct >= 50) return("50-60%")
else if (pct >= 40) return("40-50%")
else if (pct >= 30) return("30-40%")
else if (pct >= 20) return("20-30%")
else if (pct >= 10) return("10-20%")
else return("0-10%")
}),
# Column 13: Imminent_prob (from script 31 or NA if not available)
Imminent_prob = {
if (!is.null(imminent_prob_data)) {
imminent_prob_data$Imminent_prob_actual[match(Field_id, imminent_prob_data$Field_id)]
} else {
rep(NA_real_, nrow(current_stats))
}
},
# Column 14: Status_Alert (based on harvest probability + crop health status)
# Priority order: Ready for harvest-check → Strong decline → Harvested/bare → NA
Status_Alert = {
sapply(seq_len(nrow(current_stats)), function(idx) {
imminent_prob <- Imminent_prob[idx]
age_w <- Age_week[idx]
weekly_ci_chg <- Weekly_ci_change[idx]
mean_ci_val <- Mean_CI[idx]
# Priority 1: Ready for harvest-check (imminent + mature cane ≥12 months)
if (!is.na(imminent_prob) && imminent_prob > 0.5 && !is.na(age_w) && age_w >= 52) {
return("Ready for harvest-check")
}
# Priority 2: Strong decline in crop health (drop ≥2 points but still >1.5)
if (!is.na(weekly_ci_chg) && weekly_ci_chg <= -2.0 && !is.na(mean_ci_val) && mean_ci_val > 1.5) {
return("Strong decline in crop health")
}
# Priority 3: Harvested/bare (Mean CI < 1.5)
if (!is.na(mean_ci_val) && mean_ci_val < 1.5) {
return("Harvested/bare")
}
# Fallback: no alert
NA_character_
})
},
# (Continuation of the pipeline that builds field_analysis_df — this excerpt
# contains the remaining derived columns of the mutate() call, followed by the
# final select() that fixes the exported column order.)
# Columns 15-16: CI-based columns already in current_stats (CI_range, CI_Percentiles)
# Column 17: Already in current_stats (CV)
# Column 18: Already in current_stats (CV_Trend_Short_Term)
# Column 19: CV_Trend_Long_Term (from current_stats - raw slope value)
# Column 19b: CV_Trend_Long_Term_Category (categorical interpretation of slope)
# 3 classes: More uniform (slope < -0.01), Stable uniformity (-0.01 to 0.01), Less uniform (slope > 0.01)
CV_Trend_Long_Term_Category = {
# Map each long-term CV slope to one of three labels; NA slope -> NA label.
sapply(current_stats$CV_Trend_Long_Term, function(slope) {
if (is.na(slope)) {
return(NA_character_)
} else if (slope < -0.01) {
return("More uniform")
} else if (slope > 0.01) {
return("Less uniform")
} else {
return("Stable uniformity")
}
})
},
# Columns 20-21: Already in current_stats (Cloud_pct_clear, Cloud_category)
# Bin Cloud_pct_clear into 10% intervals: 0-10%, 10-20%, ..., 80-90%, 90-95%, 95-100%
# NOTE(review): this overwrites the numeric Cloud_pct_clear column in place
# with a character bin label (finer 5% bins above 90%), so downstream
# consumers must expect character values, not percentages.
Cloud_pct_clear = sapply(Cloud_pct_clear, function(pct) {
if (is.na(pct)) return(NA_character_)
if (pct >= 95) return("95-100%")
else if (pct >= 90) return("90-95%")
else if (pct >= 80) return("80-90%")
else if (pct >= 70) return("70-80%")
else if (pct >= 60) return("60-70%")
else if (pct >= 50) return("50-60%")
else if (pct >= 40) return("40-50%")
else if (pct >= 30) return("30-40%")
else if (pct >= 20) return("20-30%")
else if (pct >= 10) return("10-20%")
else return("0-10%")
}),
# Column 22: Gap_score (2σ below median - from kpi_utils.R)
# match() performs a left-join by Field_id: fields not present in
# gap_scores_df receive NA; when gap_scores_df is NULL/empty the whole
# column is NA.
Gap_score = {
if (!is.null(gap_scores_df) && nrow(gap_scores_df) > 0) {
# Debug: Print first few gap scores
message(sprintf(" Joining %d gap scores to field_analysis (first 3: %s)",
nrow(gap_scores_df),
paste(head(gap_scores_df$gap_score, 3), collapse=", ")))
message(sprintf(" First 3 Field_ids in gap_scores_df: %s",
paste(head(gap_scores_df$Field_id, 3), collapse=", ")))
message(sprintf(" First 3 Field_ids in current_stats: %s",
paste(head(current_stats$Field_id, 3), collapse=", ")))
gap_scores_df$gap_score[match(current_stats$Field_id, gap_scores_df$Field_id)]
} else {
rep(NA_real_, nrow(current_stats))
}
}
) %>%
# Lock the final 22-column set and order consumed by the Excel/CSV/RDS
# exporters and the dashboard.
select(
all_of(c("Field_id", "Farm_Section", "Field_name", "Acreage", "Status_Alert",
"Last_harvest_or_planting_date", "Age_week", "Phase",
"Germination_progress",
"Mean_CI", "Weekly_ci_change", "Four_week_trend", "CI_range", "CI_Percentiles",
"CV", "CV_Trend_Short_Term", "CV_Trend_Long_Term", "CV_Trend_Long_Term_Category",
"Imminent_prob", "Cloud_pct_clear", "Cloud_category", "Gap_score"))
)
# Announce the finished table, write the weekly exports (Excel/CSV/RDS via
# export_field_analysis_excel), then print a short console preview.
message(sprintf("✓ Built final output with %d fields and 22 columns (including Gap_score)",
                nrow(field_analysis_df)))

export_paths <- export_field_analysis_excel(
  field_analysis_df,
  NULL,  # NOTE(review): second argument unused here — confirm its purpose with the exporter
  project_dir,
  current_week,
  current_year,
  reports_dir
)

# Preview: up to 10 rows of the headline columns that actually exist.
cat("\n--- Per-field Results (first 10) ---\n")
preview_cols <- intersect(
  c("Field_id", "Acreage", "Age_week", "Mean_CI", "Four_week_trend",
    "Status_Alert", "Cloud_category"),
  names(field_analysis_df)
)
if (length(preview_cols) > 0) {
  print(head(field_analysis_df[, preview_cols], 10))
}
# ========== FARM-LEVEL KPI AGGREGATION ==========
# Aggregate the per-field analysis into farm-level summary statistics.
# Fix: the original wrapped the summary in a duplicated, nested
# `if (nrow(field_data) > 0)` with two identical NULL else-branches;
# a single check is sufficient.
cat("\n=== CALCULATING FARM-LEVEL KPI SUMMARY ===\n")

# Keep only fields with usable data: a measured CI and a positive acreage.
field_data <- field_analysis_df %>%
  filter(!is.na(Mean_CI), !is.na(Acreage), Acreage > 0)

if (nrow(field_data) > 0) {
  # Helper: field count and total acreage per category of one grouping
  # column. `group_col` is a bare column name (tidy-eval embracing; the
  # file already depends on modern dplyr/tidyselect — see all_of() above).
  summarise_distribution <- function(df, group_col) {
    df %>%
      group_by({{ group_col }}) %>%
      summarise(
        num_fields = n(),
        acreage = sum(Acreage, na.rm = TRUE),
        .groups = "drop"
      ) %>%
      rename(Category = {{ group_col }})
  }

  farm_summary <- list()

  # 1. PHASE DISTRIBUTION
  phase_dist <- summarise_distribution(field_data, Phase)
  farm_summary$phase_distribution <- phase_dist

  # 2. STATUS ALERT DISTRIBUTION
  status_dist <- summarise_distribution(field_data, Status_Alert)
  farm_summary$status_distribution <- status_dist

  # 3. CLOUD COVERAGE DISTRIBUTION
  cloud_dist <- summarise_distribution(field_data, Cloud_category)
  farm_summary$cloud_distribution <- cloud_dist

  # 4. OVERALL STATISTICS (means are across fields, not acreage-weighted)
  farm_summary$overall_stats <- data.frame(
    total_fields = nrow(field_data),
    total_acreage = sum(field_data$Acreage, na.rm = TRUE),
    mean_ci = round(mean(field_data$Mean_CI, na.rm = TRUE), 2),
    median_ci = round(median(field_data$Mean_CI, na.rm = TRUE), 2),
    mean_cv = round(mean(field_data$CV, na.rm = TRUE), 4),
    week = current_week,
    year = current_year,
    date = as.character(end_date)
  )

  # Print summaries
  cat("\n--- PHASE DISTRIBUTION ---\n")
  print(phase_dist)
  cat("\n--- STATUS TRIGGER DISTRIBUTION ---\n")
  print(status_dist)
  cat("\n--- CLOUD COVERAGE DISTRIBUTION ---\n")
  print(cloud_dist)
  cat("\n--- OVERALL FARM STATISTICS ---\n")
  print(farm_summary$overall_stats)

  farm_kpi_results <- farm_summary
} else {
  # No field had both a valid CI and a positive acreage; the final summary
  # below reports the farm-level KPIs as SKIPPED when this is NULL.
  farm_kpi_results <- NULL
}
# ========== FINAL SUMMARY ==========
# Console recap of everything produced this run: row count, export paths,
# and whether farm-level KPIs were computed.
cat("\n", strrep("=", 70), "\n")
cat("80_CALCULATE_KPIs.R - COMPLETION SUMMARY\n")
cat(strrep("=", 70), "\n")
cat("Per-field analysis fields analyzed:", nrow(field_analysis_df), "\n")
cat("Excel export:", export_paths$excel, "\n")
cat("RDS export:", export_paths$rds, "\n")
cat("CSV export:", export_paths$csv, "\n")
# farm_kpi_results is NULL when no field passed the CI/acreage filter above.
if (!is.null(farm_kpi_results)) {
cat("\nFarm-level KPIs: CALCULATED\n")
} else {
cat("\nFarm-level KPIs: SKIPPED (no valid tile data extracted)\n")
}
cat("\n✓ Consolidated KPI calculation complete!\n")
cat(" - Per-field data exported\n")
cat(" - Farm-level KPIs calculated\n")
cat(" - All outputs in:", reports_dir, "\n\n")
} else {
# Unknown client type - log warning and exit
# (this else pairs with the client-type dispatch earlier in main()).
warning(sprintf("Unknown client type: %s - no workflow matched", client_type))
cat("\n⚠ Warning: Client type '", client_type, "' does not match any known workflow\n", sep = "")
cat("Expected: 'agronomic_support' (aura) or 'cane_supply' (angata, etc.)\n")
cat("Check CLIENT_TYPE_MAP in parameters_project.R\n\n")
}
}
# Entry-point guard: invoke main() only when this file is executed directly
# (e.g. via Rscript), not when it is source()'d from another script, in
# which case the call stack is non-empty.
if (identical(sys.nframe(), 0L)) {
  main()
}