From 1500bbcb1ce4dcf49bbda253a53cdb376fc11208 Mon Sep 17 00:00:00 2001
From: Timon <timon@resiliencebv.com>
Date: Wed, 18 Feb 2026 13:26:32 +0100
Subject: [PATCH] Refactor KPI calculations and reporting utilities; normalize
 field details columns, update area change metrics, and enhance .gitignore for
 PNG exceptions.

---
 python_app/harvest_date_pred_utils.py         |  4 +-
 r_app/.gitignore                              |  3 +-
 r_app/80_utils_agronomic_support.R            | 37 +++++-----
 r_app/80_utils_cane_supply.R                  | 34 +++------
 r_app/80_utils_common.R                       |  8 ++-
 ..._CI_report_with_kpis_agronomic_support.Rmd | 69 +++++++------------
 r_app/report_utils.R                          | 31 ++++++++-
 7 files changed, 93 insertions(+), 93 deletions(-)

diff --git a/python_app/harvest_date_pred_utils.py b/python_app/harvest_date_pred_utils.py
index eacb11f..d746e3a 100644
--- a/python_app/harvest_date_pred_utils.py
+++ b/python_app/harvest_date_pred_utils.py
@@ -303,9 +303,9 @@ def load_harvest_data(data_file: Path) -> pd.DataFrame:
 def run_phase1_growing_window(field_data, model, config, scalers, ci_column, device, 
                               threshold=0.45, consecutive_days=2):
     """
-    Phase 1: Growing window detection with DOY season reset.
+    Phase 1: Growing window detection with DAH season reset.
     
-    For each detected harvest, reset DOY counter for the next season.
+    For each detected harvest, reset DAH counter for the next season.
     This allows the model to detect multiple consecutive harvests in multi-year data.
     """
     harvest_dates = []
diff --git a/r_app/.gitignore b/r_app/.gitignore
index 7cd7b7e..3efde43 100644
--- a/r_app/.gitignore
+++ b/r_app/.gitignore
@@ -14,8 +14,7 @@ renv
 
 # EXCEPTIONS: Explicitly track intentional PNG assets
 # Uncomment or add lines below for PNG files that should be committed to git
-!r_app/CI_graph_example.png
-
+!CI_graph_example.png
 # Ignore files related to Rproj
 .Rproj.user/
 .Rhistory
diff --git a/r_app/80_utils_agronomic_support.R b/r_app/80_utils_agronomic_support.R
index 9dcf3db..9354d44 100644
--- a/r_app/80_utils_agronomic_support.R
+++ b/r_app/80_utils_agronomic_support.R
@@ -181,12 +181,18 @@ calculate_field_uniformity_kpi <- function(ci_pixels_by_field, field_boundaries_
 #' @param previous_stats Previous week field statistics
 #'
 #' @return Data frame with field-level CI changes
-calculate_area_change_kpi <- function(current_stats, previous_stats) {
+calculate_area_change_kpi <- function(current_stats, previous_stats, field_boundaries_sf = NULL) {
+  
+  # Initialize field index vector
+  field_idx_vec <- seq_len(nrow(current_stats))
+  if (!is.null(field_boundaries_sf) && "Field_id" %in% names(current_stats)) {
+    field_idx_vec <- match(current_stats$Field_id, field_boundaries_sf$field)
+  }
   
   # Initialize result data frame
   result <- data.frame(
-    field_idx = seq_len(nrow(current_stats)),
-    mean_ci_pct_change = NA_real_,
+    field_idx = field_idx_vec,
+    mean_ci_abs_change = NA_real_,
     interpretation = NA_character_,
     stringsAsFactors = FALSE
   )
@@ -223,19 +229,19 @@ calculate_area_change_kpi <- function(current_stats, previous_stats) {
     # Find matching previous CI value
     prev_ci <- prev_lookup[[as.character(current_field_id)]]
     
-    if (!is.null(prev_ci) && !is.na(prev_ci) && !is.na(current_ci) && prev_ci > 0) {
-      # Calculate percentage change
-      pct_change <- ((current_ci - prev_ci) / prev_ci) * 100
-      result$mean_ci_pct_change[i] <- round(pct_change, 2)
+    if (!is.null(prev_ci) && !is.na(prev_ci) && !is.na(current_ci)) {
+      # Calculate absolute change (CI units)
+      abs_change <- current_ci - prev_ci
+      result$mean_ci_abs_change[i] <- round(abs_change, 2)
       
       # Add interpretation
-      if (pct_change > 15) {
+      if (abs_change > 0.5) {
         result$interpretation[i] <- "Rapid growth"
-      } else if (pct_change > 5) {
+      } else if (abs_change > 0.2) {
         result$interpretation[i] <- "Positive growth"
-      } else if (pct_change > -5) {
+      } else if (abs_change >= -0.2) {
         result$interpretation[i] <- "Stable"
-      } else if (pct_change > -15) {
+      } else if (abs_change >= -0.5) {
         result$interpretation[i] <- "Declining"
       } else {
         result$interpretation[i] <- "Rapid decline"
@@ -243,8 +249,7 @@ calculate_area_change_kpi <- function(current_stats, previous_stats) {
     } else {
       result$interpretation[i] <- "No previous data"
     }
-  }
-  
+  }  
   return(result)
 }
 
@@ -557,7 +562,7 @@ create_summary_tables <- function(all_kpis) {
       select(field_idx, cv_value, uniformity_category, interpretation),
     
     area_change = all_kpis$area_change %>%
-      select(field_idx, mean_ci_pct_change, interpretation),
+      select(field_idx, mean_ci_abs_change, interpretation),
     
     tch_forecast = all_kpis$tch_forecasted %>%
       select(field_idx, tch_forecasted),
@@ -633,7 +638,7 @@ create_field_detail_table <- function(field_boundaries_sf, all_kpis, current_wee
   result <- result %>%
     left_join(
       all_kpis$area_change %>% 
-        select(field_idx, Weekly_CI_Change = mean_ci_pct_change, 
+        select(field_idx, Weekly_CI_Change = mean_ci_abs_change, 
                Area_Change_Interpretation = interpretation),
       by = "field_idx"
     )
@@ -918,7 +923,7 @@ calculate_all_field_analysis_agronomic_support <- function(
   } else {
     area_change_kpi <- data.frame(
       field_idx = seq_len(nrow(field_boundaries_sf)),
-      mean_ci_pct_change = NA_real_,
+      mean_ci_abs_change = NA_real_,
       interpretation = rep("No previous data", nrow(field_boundaries_sf))
     )
   }
diff --git a/r_app/80_utils_cane_supply.R b/r_app/80_utils_cane_supply.R
index 6de2778..693fe0f 100644
--- a/r_app/80_utils_cane_supply.R
+++ b/r_app/80_utils_cane_supply.R
@@ -45,13 +45,16 @@ CI_CHANGE_INCREASE_THRESHOLD <- 0.5    # Weekly CI change threshold for increase
 #' @return data.frame with field and acreage columns
 calculate_field_acreages <- function(field_boundaries_sf) {
   tryCatch({
-    lookup_df <- field_boundaries_sf %>%
+    # Project to equal-area CRS (EPSG:6933) for accurate area calculations
+    field_boundaries_proj <- sf::st_transform(field_boundaries_sf, "EPSG:6933")
+    
+    lookup_df <- field_boundaries_proj %>%
       sf::st_drop_geometry() %>%
       as.data.frame() %>%
       mutate(
-        geometry_valid = sapply(seq_len(nrow(field_boundaries_sf)), function(idx) {
+        geometry_valid = sapply(seq_len(nrow(field_boundaries_proj)), function(idx) {
           tryCatch({
-            sf::st_is_valid(field_boundaries_sf[idx, ])
+            sf::st_is_valid(field_boundaries_proj[idx, ])
           }, error = function(e) FALSE)
         }),
         area_ha = 0
@@ -61,7 +64,7 @@ calculate_field_acreages <- function(field_boundaries_sf) {
     valid_indices <- which(lookup_df$geometry_valid)
     areas_ha <- vapply(valid_indices, function(idx) {
       tryCatch({
-        area_m2 <- as.numeric(sf::st_area(field_boundaries_sf[idx, ]))
+        area_m2 <- as.numeric(sf::st_area(field_boundaries_proj[idx, ]))
         area_m2 / 10000
       }, error = function(e) NA_real_)
     }, numeric(1))
@@ -555,23 +558,7 @@ calculate_field_analysis_cane_supply <- function(setup,
   
   # ========== PHASE 3: LOAD PLANTING DATES ==========
   message("\nLoading harvest data from harvest.xlsx for planting dates...")
-  harvest_file_path <- file.path(data_dir, "harvest.xlsx")
-  
-  harvesting_data <- tryCatch({
-    if (file.exists(harvest_file_path)) {
-      harvest_raw <- readxl::read_excel(harvest_file_path)
-      harvest_raw$season_start <- as.Date(harvest_raw$season_start)
-      harvest_raw$season_end <- as.Date(harvest_raw$season_end)
-      message(paste("  ✓ Loaded harvest data:", nrow(harvest_raw), "rows"))
-      harvest_raw
-    } else {
-      message(paste("  WARNING: harvest.xlsx not found at", harvest_file_path))
-      NULL
-    }
-  }, error = function(e) {
-    message(paste("  ERROR loading harvest.xlsx:", e$message))
-    NULL
-  })
+  harvesting_data <- load_harvesting_data(data_dir)
   
   planting_dates <- extract_planting_dates(harvesting_data, field_boundaries_sf)
   
@@ -628,9 +615,8 @@ calculate_field_analysis_cane_supply <- function(setup,
   
   # ========== PHASE 6: LOAD HARVEST PROBABILITIES ==========
   message("\n4. Loading harvest probabilities from script 31...")
-  harvest_prob_dir <- setup$kpi_field_stats_dir
-  harvest_prob_file <- file.path(harvest_prob_dir, 
-                                  sprintf("%s_harvest_imminent_week_%02d_%d.csv", project_dir, current_week, current_year))
+  # Use get_harvest_output_path() to safely build path (avoids NULL setup$kpi_field_stats_dir)
+  harvest_prob_file <- get_harvest_output_path(project_dir, current_week, current_year)
   message(paste("  Looking for:", harvest_prob_file))
   
   imminent_prob_data <- tryCatch({
diff --git a/r_app/80_utils_common.R b/r_app/80_utils_common.R
index d3d2ca2..071b025 100644
--- a/r_app/80_utils_common.R
+++ b/r_app/80_utils_common.R
@@ -406,7 +406,7 @@ calculate_gap_filling_kpi <- function(ci_raster, field_boundaries) {
       # Gap score using 2σ below median to detect outliers
       median_ci <- median(valid_values)
       sd_ci <- sd(valid_values)
-      outlier_threshold <- median_ci - (2 * sd_ci)
+      outlier_threshold <- median_ci - (1 * sd_ci)
       low_ci_pixels <- sum(valid_values < outlier_threshold)
       total_pixels <- length(valid_values)
       gap_score <- round((low_ci_pixels / total_pixels) * 100, 2)
@@ -415,7 +415,8 @@ calculate_gap_filling_kpi <- function(ci_raster, field_boundaries) {
       gap_level <- dplyr::case_when(
         gap_score < 10 ~ "Minimal",
         gap_score < 25 ~ "Moderate", 
-        TRUE ~ "Significant"
+        gap_score >= 25 ~ "Significant",
+        TRUE ~ NA_character_
       )
 
       field_results <- rbind(field_results, data.frame(
@@ -1603,7 +1604,8 @@ calculate_yield_prediction_kpi <- function(field_boundaries, harvesting_data, cu
       dplyr::group_by(sub_field, season) %>%
       dplyr::slice(which.max(DAH)) %>%
       dplyr::select(field, sub_field, tonnage_ha, cumulative_CI, DAH, season, sub_area) %>%
-      dplyr::mutate(CI_per_day = cumulative_CI / DAH)
+      dplyr::mutate(CI_per_day = ifelse(DAH > 0, cumulative_CI / DAH, NA_real_))
+
 
     # Define predictors and response variables
     predictors <- c("cumulative_CI", "DAH", "CI_per_day")
diff --git a/r_app/90_CI_report_with_kpis_agronomic_support.Rmd b/r_app/90_CI_report_with_kpis_agronomic_support.Rmd
index 60f9d93..5b8f268 100644
--- a/r_app/90_CI_report_with_kpis_agronomic_support.Rmd
+++ b/r_app/90_CI_report_with_kpis_agronomic_support.Rmd
@@ -179,24 +179,15 @@ if (dir.exists(kpi_data_dir)) {
         safe_log(paste("✓ Loaded field_details_table with", nrow(field_details_table), "fields"))
         safe_log(paste("  Columns:", paste(names(field_details_table), collapse=", ")))
         
-        # NORMALIZATION: Ensure critical column names match downstream expectations
-        # Handle both "Field" and "Field_id" naming conventions
-        if ("Field" %in% names(field_details_table) && !("Field_id" %in% names(field_details_table))) {
-          field_details_table <- field_details_table %>%
-            dplyr::rename(Field_id = Field)
-          safe_log("  ✓ Normalized: renamed Field → Field_id")
-        }
+        # NORMALIZATION: Normalize column structure (Field→Field_id rename + expected_cols injection)
+        field_details_table <- normalize_field_details_columns(field_details_table)
         
         # Normalize other common column name variations
         column_mappings <- list(
           c("CV Value", "CV"),
-          c("CV", "CV"),  # Keep as-is
           c("Mean CI", "Mean_CI"),
-          c("Mean_CI", "Mean_CI"),  # Keep as-is
           c("Yield Forecast (t/ha)", "TCH_Forecasted"),
-          c("TCH_Forecasted", "TCH_Forecasted"),  # Keep as-is
           c("Gap Score", "Gap_Score"),
-          c("Gap_Score", "Gap_Score"),  # Keep as-is
           c("Growth Uniformity", "Growth_Uniformity"),
           c("Decline Risk", "Decline_Risk"),
           c("Patchiness Risk", "Patchiness_Risk"),
@@ -721,6 +712,7 @@ generate_field_alerts <- function(field_details_table) {
         highest_patchiness_risk = case_when(
           any(`Patchiness Risk` == "High") ~ "High",
           any(`Patchiness Risk` == "Medium") ~ "Medium",
+          any(`Patchiness Risk` == "Low") ~ "Low",
           any(`Patchiness Risk` == "Minimal") ~ "Minimal",
           TRUE ~ "Unknown"
         ),
@@ -1510,25 +1502,9 @@ if (!exists("field_details_table") || is.null(field_details_table) || nrow(field
   }
   
   # Join field sizes and ages to KPI data, simplified column selection
-  # DEFENSIVE: Normalize field_details_table column names one more time before joining
-  # Ensure it has Field_id column (regardless of whether it was from RDS or fallback)
-  if (!is.null(field_details_table) && nrow(field_details_table) > 0) {
-    # If Field exists but Field_id doesn't, rename it
-    if ("Field" %in% names(field_details_table) && !("Field_id" %in% names(field_details_table))) {
-      field_details_table <- field_details_table %>%
-        dplyr::rename(Field_id = Field)
-    }
-    
-    # Ensure all expected KPI columns exist; add as NA if missing
-    expected_cols <- c("Field_id", "Mean_CI", "CV", "TCH_Forecasted", "Gap_Score", 
-                       "Trend_Interpretation", "Weekly_CI_Change", "Uniformity_Interpretation",
-                       "Decline_Severity", "Patchiness_Risk")
-    for (col in expected_cols) {
-      if (!col %in% names(field_details_table)) {
-        field_details_table[[col]] <- NA
-      }
-    }
-  }
+  # DEFENSIVE: Normalize field_details_table column structure before joining
+  # Uses shared normalization function to ensure Field_id exists and all expected columns are present
+  field_details_table <- normalize_field_details_columns(field_details_table)
   
   field_details_clean <- field_details_table %>%
     left_join(field_sizes_df, by = c("Field_id" = "field")) %>%
@@ -1540,7 +1516,7 @@ if (!exists("field_details_table") || is.null(field_details_table) || nrow(field
       field_size_acres = round(field_size_acres, 1),
       Mean_CI = round(Mean_CI, 2),
       CV = round(CV, 2),
-      Gap_Score = round(Gap_Score, 0),
+      Gap_Score = round(Gap_Score, 2),
       TCH_Forecasted = round(TCH_Forecasted, 1)
     )
   
@@ -1554,7 +1530,7 @@ if (!exists("field_details_table") || is.null(field_details_table) || nrow(field
         `Mean CI` = Mean_CI,
         `Weekly CI Change` = Weekly_CI_Change,
         `Yield Forecast (t/ha)` = TCH_Forecasted,
-        `Gap Score` = Gap_Score,
+        `Gap Score %` = Gap_Score,
         `Decline Risk` = Decline_Severity,
         `Patchiness Risk` = Patchiness_Risk,
         `CV Value` = CV
@@ -1566,7 +1542,7 @@ if (!exists("field_details_table") || is.null(field_details_table) || nrow(field
         `Field Size (acres)` = field_size_acres,
         `Mean CI` = Mean_CI,
         `Yield Forecast (t/ha)` = TCH_Forecasted,
-        `Gap Score` = Gap_Score,
+        `Gap Score %` = Gap_Score,
         `Decline Risk` = Decline_Severity,
         `Patchiness Risk` = Patchiness_Risk,
         `CV Value` = CV
@@ -1639,13 +1615,13 @@ CI values typically range from 0 (bare soil or severely stressed vegetation) to
      - **Why it matters:** Uniform fields are easier to manage and typically produce better yields. Uneven growth suggests irrigation problems, fertility gaps, pests, or disease.
    
    - **KPI 2: Area Change (Weekly Growth)** — Tracks week-over-week CI changes to detect rapid improvements or declines
-     - **Calculation:** ((Current Mean CI − Previous Mean CI) / Previous Mean CI) × 100
+     - **Calculation:** Current Mean CI − Previous Mean CI (absolute change in CI units)
      - **Categories:**
-       - **Rapid growth:** > 15% increase (excellent weekly progress)
-       - **Positive growth:** 5–15% increase (steady improvement)
-       - **Stable:** −5% to +5% (field maintained, no significant change)
-       - **Declining:** −5% to −15% (slow decline, warrant closer monitoring)
-       - **Rapid decline:** < −15% (alert: urgent issue requiring investigation)
+       - **Rapid growth:** > +0.5 (excellent weekly progress)
+       - **Positive growth:** +0.2 to +0.5 (steady improvement)
+       - **Stable:** −0.2 to +0.2 (field maintained, no significant change)
+       - **Declining:** −0.5 to −0.2 (slow decline, warrant closer monitoring)
+       - **Rapid decline:** < −0.5 (alert: urgent issue requiring investigation)
      - **Why it matters:** Week-to-week changes reveal developing problems early, enabling timely intervention.
    
    - **KPI 3: TCH Forecasted (Yield Prediction)** — Predicts final harvest tonnage for mature fields
@@ -1692,12 +1668,15 @@ CI values typically range from 0 (bare soil or severely stressed vegetation) to
      Your scouting and remediation strategy should differ: Field A might need systemic irrigation adjustment or disease management; Field B might need soil remediation in the affected corner. **Patchiness tells you *where to focus your effort*.**
    
    - **KPI 6: Gap Score (Establishment Quality)** — Quantifies field gaps and areas of poor crop establishment
-     - **Metric:** Percentage of field area with very low CI values (lowest 25% of pixel distribution)
-     - **Levels:**
-       - **Minimal:** < 10% of field (good establishment, limited replanting needed)
-       - **Moderate:** 10–25% of field (monitor gap expansion, coordinate with agronomy)
-       - **Significant:** ≥ 25% of field (consider targeted replanting or rehabilitation)
-     - **Why it matters:** High gap scores indicate potential yield losses and may warrant management intervention.
+     - **Calculation Method:** Statistical outlier detection (2σ method)
+       - Identifies pixels with CI below: **Median CI − (2 × Standard Deviation)**
+       - Calculates: **Gap Score = (Outlier Pixels / Total Pixels) × 100**
+       - Example: If 2 of 100 pixels fall below threshold → Gap Score = 2%
+     - **Score Ranges & Interpretation:**
+       - **0–10%:** Minimal gaps (excellent establishment, healthy field)
+       - **10–25%:** Moderate gaps (monitor for expansion, coordinate with agronomy)
+       - **≥ 25%:** Significant gaps (consider targeted replanting or rehabilitation)
+     - **Why it matters:** Gap scores reveal areas of poor establishment that may indicate early growth problems or harvest-related residue issues. Lower is better (0–3% is typical for healthy fields).
 
 2. **Overview Map: Growth on Farm:**  
    Provides a traffic light overview of field-by-field growth status for quick prioritization and reporting.
diff --git a/r_app/report_utils.R b/r_app/report_utils.R
index 8b3afa1..38ecbd2 100644
--- a/r_app/report_utils.R
+++ b/r_app/report_utils.R
@@ -1165,4 +1165,33 @@ generate_field_kpi_summary <- function(field_name, field_details_table, CI_quadr
   })
 }
 
-
+#' Normalize field_details_table column structure
+#'
+#' Standardizes column names and ensures all expected KPI columns exist.
+#' Handles Field → Field_id rename and injects missing columns as NA.
+#'
+#' @param field_details_table data.frame to normalize
+#' @return data.frame with standardized column structure
+normalize_field_details_columns <- function(field_details_table) {
+  if (is.null(field_details_table) || nrow(field_details_table) == 0) {
+    return(field_details_table)
+  }
+  
+  # Rename Field → Field_id if needed
+  if ("Field" %in% names(field_details_table) && !("Field_id" %in% names(field_details_table))) {
+    field_details_table <- field_details_table %>%
+      dplyr::rename(Field_id = Field)
+  }
+  
+  # Ensure all expected KPI columns exist; add as NA if missing
+  expected_cols <- c("Field_id", "Mean_CI", "CV", "TCH_Forecasted", "Gap_Score", 
+                     "Trend_Interpretation", "Weekly_CI_Change", "Uniformity_Interpretation",
+                     "Decline_Severity", "Patchiness_Risk")
+  for (col in expected_cols) {
+    if (!col %in% names(field_details_table)) {
+      field_details_table[[col]] <- NA
+    }
+  }
+  
+  return(field_details_table)
+}