Refactor KPI calculations and reporting utilities; normalize field details columns, update area change metrics, and enhance .gitignore for PNG exceptions.

2026-02-18 13:26:32 +01:00 · 2026-02-18 13:26:32 +01:00 · 1500bbcb1c
parent b2d4093601
commit 1500bbcb1c
7 changed files with 93 additions and 93 deletions
--- a/python_app/harvest_date_pred_utils.py
+++ b/python_app/harvest_date_pred_utils.py
@ -303,9 +303,9 @@ def load_harvest_data(data_file: Path) -> pd.DataFrame:
 def run_phase1_growing_window(field_data, model, config, scalers, ci_column, device, 
                              threshold=0.45, consecutive_days=2):
    """
-    Phase 1: Growing window detection with DOY season reset.
+    Phase 1: Growing window detection with DAH season reset.
    
-    For each detected harvest, reset DOY counter for the next season.
+    For each detected harvest, reset DAH counter for the next season.
    This allows the model to detect multiple consecutive harvests in multi-year data.
    """
    harvest_dates = []
--- a/r_app/.gitignore
+++ b/r_app/.gitignore
@ -14,8 +14,7 @@ renv

 # EXCEPTIONS: Explicitly track intentional PNG assets
 # Uncomment or add lines below for PNG files that should be committed to git
-!r_app/CI_graph_example.png
-
+!CI_graph_example.png
 # Ignore files related to Rproj
 .Rproj.user/
 .Rhistory
--- a/r_app/80_utils_agronomic_support.R
+++ b/r_app/80_utils_agronomic_support.R
@ -181,12 +181,18 @@ calculate_field_uniformity_kpi <- function(ci_pixels_by_field, field_boundaries_
 #' @param previous_stats Previous week field statistics
 #'
 #' @return Data frame with field-level CI changes
-calculate_area_change_kpi <- function(current_stats, previous_stats) {
+calculate_area_change_kpi <- function(current_stats, previous_stats, field_boundaries_sf = NULL) {
+  
+  # Initialize field index vector
+  field_idx_vec <- seq_len(nrow(current_stats))
+  if (!is.null(field_boundaries_sf) && "Field_id" %in% names(current_stats)) {
+    field_idx_vec <- match(current_stats$Field_id, field_boundaries_sf$field)
+  }
  
  # Initialize result data frame
  result <- data.frame(
-    field_idx = seq_len(nrow(current_stats)),
-    mean_ci_pct_change = NA_real_,
+    field_idx = field_idx_vec,
+    mean_ci_abs_change = NA_real_,
    interpretation = NA_character_,
    stringsAsFactors = FALSE
  )
@ -223,19 +229,19 @@ calculate_area_change_kpi <- function(current_stats, previous_stats) {
    # Find matching previous CI value
    prev_ci <- prev_lookup[[as.character(current_field_id)]]
    
-    if (!is.null(prev_ci) && !is.na(prev_ci) && !is.na(current_ci) && prev_ci > 0) {
-      # Calculate percentage change
-      pct_change <- ((current_ci - prev_ci) / prev_ci) * 100
-      result$mean_ci_pct_change[i] <- round(pct_change, 2)
+    if (!is.null(prev_ci) && !is.na(prev_ci) && !is.na(current_ci)) {
+      # Calculate absolute change (CI units)
+      abs_change <- current_ci - prev_ci
+      result$mean_ci_abs_change[i] <- round(abs_change, 2)
      
      # Add interpretation
-      if (pct_change > 15) {
+      if (abs_change > 0.5) {
        result$interpretation[i] <- "Rapid growth"
-      } else if (pct_change > 5) {
+      } else if (abs_change > 0.2) {
        result$interpretation[i] <- "Positive growth"
-      } else if (pct_change > -5) {
+      } else if (abs_change >= -0.2) {
        result$interpretation[i] <- "Stable"
-      } else if (pct_change > -15) {
+      } else if (abs_change >= -0.5) {
        result$interpretation[i] <- "Declining"
      } else {
        result$interpretation[i] <- "Rapid decline"
@ -244,7 +250,6 @@ calculate_area_change_kpi <- function(current_stats, previous_stats) {
      result$interpretation[i] <- "No previous data"
    }
  }  
-  
  return(result)
 }

@ -557,7 +562,7 @@ create_summary_tables <- function(all_kpis) {
      select(field_idx, cv_value, uniformity_category, interpretation),
    
    area_change = all_kpis$area_change %>%
-      select(field_idx, mean_ci_pct_change, interpretation),
+      select(field_idx, mean_ci_abs_change, interpretation),
    
    tch_forecast = all_kpis$tch_forecasted %>%
      select(field_idx, tch_forecasted),
@ -633,7 +638,7 @@ create_field_detail_table <- function(field_boundaries_sf, all_kpis, current_wee
  result <- result %>%
    left_join(
      all_kpis$area_change %>% 
-        select(field_idx, Weekly_CI_Change = mean_ci_pct_change, 
+        select(field_idx, Weekly_CI_Change = mean_ci_abs_change, 
               Area_Change_Interpretation = interpretation),
      by = "field_idx"
    )
@ -918,7 +923,7 @@ calculate_all_field_analysis_agronomic_support <- function(
  } else {
    area_change_kpi <- data.frame(
      field_idx = seq_len(nrow(field_boundaries_sf)),
-      mean_ci_pct_change = NA_real_,
+      mean_ci_abs_change = NA_real_,
      interpretation = rep("No previous data", nrow(field_boundaries_sf))
    )
  }
--- a/r_app/80_utils_cane_supply.R
+++ b/r_app/80_utils_cane_supply.R
@ -45,13 +45,16 @@ CI_CHANGE_INCREASE_THRESHOLD <- 0.5    # Weekly CI change threshold for increase
 #' @return data.frame with field and acreage columns
 calculate_field_acreages <- function(field_boundaries_sf) {
  tryCatch({
-    lookup_df <- field_boundaries_sf %>%
+    # Project to equal-area CRS (EPSG:6933) for accurate area calculations
+    field_boundaries_proj <- sf::st_transform(field_boundaries_sf, "EPSG:6933")
+    
+    lookup_df <- field_boundaries_proj %>%
      sf::st_drop_geometry() %>%
      as.data.frame() %>%
      mutate(
-        geometry_valid = sapply(seq_len(nrow(field_boundaries_sf)), function(idx) {
+        geometry_valid = sapply(seq_len(nrow(field_boundaries_proj)), function(idx) {
          tryCatch({
-            sf::st_is_valid(field_boundaries_sf[idx, ])
+            sf::st_is_valid(field_boundaries_proj[idx, ])
          }, error = function(e) FALSE)
        }),
        area_ha = 0
@ -61,7 +64,7 @@ calculate_field_acreages <- function(field_boundaries_sf) {
    valid_indices <- which(lookup_df$geometry_valid)
    areas_ha <- vapply(valid_indices, function(idx) {
      tryCatch({
-        area_m2 <- as.numeric(sf::st_area(field_boundaries_sf[idx, ]))
+        area_m2 <- as.numeric(sf::st_area(field_boundaries_proj[idx, ]))
        area_m2 / 10000
      }, error = function(e) NA_real_)
    }, numeric(1))
@ -555,23 +558,7 @@ calculate_field_analysis_cane_supply <- function(setup,
  
  # ========== PHASE 3: LOAD PLANTING DATES ==========
  message("\nLoading harvest data from harvest.xlsx for planting dates...")
-  harvest_file_path <- file.path(data_dir, "harvest.xlsx")
-  
-  harvesting_data <- tryCatch({
-    if (file.exists(harvest_file_path)) {
-      harvest_raw <- readxl::read_excel(harvest_file_path)
-      harvest_raw$season_start <- as.Date(harvest_raw$season_start)
-      harvest_raw$season_end <- as.Date(harvest_raw$season_end)
-      message(paste("  ✓ Loaded harvest data:", nrow(harvest_raw), "rows"))
-      harvest_raw
-    } else {
-      message(paste("  WARNING: harvest.xlsx not found at", harvest_file_path))
-      NULL
-    }
-  }, error = function(e) {
-    message(paste("  ERROR loading harvest.xlsx:", e$message))
-    NULL
-  })
+  harvesting_data <- load_harvesting_data(data_dir)
  
  planting_dates <- extract_planting_dates(harvesting_data, field_boundaries_sf)
  
@ -628,9 +615,8 @@ calculate_field_analysis_cane_supply <- function(setup,
  
  # ========== PHASE 6: LOAD HARVEST PROBABILITIES ==========
  message("\n4. Loading harvest probabilities from script 31...")
-  harvest_prob_dir <- setup$kpi_field_stats_dir
-  harvest_prob_file <- file.path(harvest_prob_dir, 
-                                  sprintf("%s_harvest_imminent_week_%02d_%d.csv", project_dir, current_week, current_year))
+  # Use get_harvest_output_path() to safely build path (avoids NULL setup$kpi_field_stats_dir)
+  harvest_prob_file <- get_harvest_output_path(project_dir, current_week, current_year)
  message(paste("  Looking for:", harvest_prob_file))
  
  imminent_prob_data <- tryCatch({
--- a/r_app/80_utils_common.R
+++ b/r_app/80_utils_common.R
@ -406,7 +406,7 @@ calculate_gap_filling_kpi <- function(ci_raster, field_boundaries) {
      # Gap score using 2σ below median to detect outliers
      median_ci <- median(valid_values)
      sd_ci <- sd(valid_values)
-      outlier_threshold <- median_ci - (2 * sd_ci)
+      outlier_threshold <- median_ci - (1 * sd_ci)
      low_ci_pixels <- sum(valid_values < outlier_threshold)
      total_pixels <- length(valid_values)
      gap_score <- round((low_ci_pixels / total_pixels) * 100, 2)
@ -415,7 +415,8 @@ calculate_gap_filling_kpi <- function(ci_raster, field_boundaries) {
      gap_level <- dplyr::case_when(
        gap_score < 10 ~ "Minimal",
        gap_score < 25 ~ "Moderate", 
-        TRUE ~ "Significant"
+        gap_score >= 25 ~ "Significant",
+        TRUE ~ NA_character_
      )

      field_results <- rbind(field_results, data.frame(
@ -1603,7 +1604,8 @@ calculate_yield_prediction_kpi <- function(field_boundaries, harvesting_data, cu
      dplyr::group_by(sub_field, season) %>%
      dplyr::slice(which.max(DAH)) %>%
      dplyr::select(field, sub_field, tonnage_ha, cumulative_CI, DAH, season, sub_area) %>%
-      dplyr::mutate(CI_per_day = cumulative_CI / DAH)
+      dplyr::mutate(CI_per_day = ifelse(DAH > 0, cumulative_CI / DAH, NA_real_))
+

    # Define predictors and response variables
    predictors <- c("cumulative_CI", "DAH", "CI_per_day")
--- a/r_app/90_CI_report_with_kpis_agronomic_support.Rmd
+++ b/r_app/90_CI_report_with_kpis_agronomic_support.Rmd
@ -179,24 +179,15 @@ if (dir.exists(kpi_data_dir)) {
        safe_log(paste("✓ Loaded field_details_table with", nrow(field_details_table), "fields"))
        safe_log(paste("  Columns:", paste(names(field_details_table), collapse=", ")))
        
-        # NORMALIZATION: Ensure critical column names match downstream expectations
-        # Handle both "Field" and "Field_id" naming conventions
-        if ("Field" %in% names(field_details_table) && !("Field_id" %in% names(field_details_table))) {
-          field_details_table <- field_details_table %>%
-            dplyr::rename(Field_id = Field)
-          safe_log("  ✓ Normalized: renamed Field → Field_id")
-        }
+        # NORMALIZATION: Normalize column structure (Field→Field_id rename + expected_cols injection)
+        field_details_table <- normalize_field_details_columns(field_details_table)
        
        # Normalize other common column name variations
        column_mappings <- list(
          c("CV Value", "CV"),
-          c("CV", "CV"),  # Keep as-is
          c("Mean CI", "Mean_CI"),
-          c("Mean_CI", "Mean_CI"),  # Keep as-is
          c("Yield Forecast (t/ha)", "TCH_Forecasted"),
-          c("TCH_Forecasted", "TCH_Forecasted"),  # Keep as-is
          c("Gap Score", "Gap_Score"),
-          c("Gap_Score", "Gap_Score"),  # Keep as-is
          c("Growth Uniformity", "Growth_Uniformity"),
          c("Decline Risk", "Decline_Risk"),
          c("Patchiness Risk", "Patchiness_Risk"),
@ -721,6 +712,7 @@ generate_field_alerts <- function(field_details_table) {
        highest_patchiness_risk = case_when(
          any(`Patchiness Risk` == "High") ~ "High",
          any(`Patchiness Risk` == "Medium") ~ "Medium",
+          any(`Patchiness Risk` == "Low") ~ "Low",
          any(`Patchiness Risk` == "Minimal") ~ "Minimal",
          TRUE ~ "Unknown"
        ),
@ -1510,25 +1502,9 @@ if (!exists("field_details_table") || is.null(field_details_table) || nrow(field
  }
  
  # Join field sizes and ages to KPI data, simplified column selection
-  # DEFENSIVE: Normalize field_details_table column names one more time before joining
-  # Ensure it has Field_id column (regardless of whether it was from RDS or fallback)
-  if (!is.null(field_details_table) && nrow(field_details_table) > 0) {
-    # If Field exists but Field_id doesn't, rename it
-    if ("Field" %in% names(field_details_table) && !("Field_id" %in% names(field_details_table))) {
-      field_details_table <- field_details_table %>%
-        dplyr::rename(Field_id = Field)
-    }
-    
-    # Ensure all expected KPI columns exist; add as NA if missing
-    expected_cols <- c("Field_id", "Mean_CI", "CV", "TCH_Forecasted", "Gap_Score", 
-                       "Trend_Interpretation", "Weekly_CI_Change", "Uniformity_Interpretation",
-                       "Decline_Severity", "Patchiness_Risk")
-    for (col in expected_cols) {
-      if (!col %in% names(field_details_table)) {
-        field_details_table[[col]] <- NA
-      }
-    }
-  }
+  # DEFENSIVE: Normalize field_details_table column structure before joining
+  # Uses shared normalization function to ensure Field_id exists and all expected columns are present
+  field_details_table <- normalize_field_details_columns(field_details_table)
  
  field_details_clean <- field_details_table %>%
    left_join(field_sizes_df, by = c("Field_id" = "field")) %>%
@ -1540,7 +1516,7 @@ if (!exists("field_details_table") || is.null(field_details_table) || nrow(field
      field_size_acres = round(field_size_acres, 1),
      Mean_CI = round(Mean_CI, 2),
      CV = round(CV, 2),
-      Gap_Score = round(Gap_Score, 0),
+      Gap_Score = round(Gap_Score, 2),
      TCH_Forecasted = round(TCH_Forecasted, 1)
    )
  
@ -1554,7 +1530,7 @@ if (!exists("field_details_table") || is.null(field_details_table) || nrow(field
        `Mean CI` = Mean_CI,
        `Weekly CI Change` = Weekly_CI_Change,
        `Yield Forecast (t/ha)` = TCH_Forecasted,
-        `Gap Score` = Gap_Score,
+        `Gap Score %` = Gap_Score,
        `Decline Risk` = Decline_Severity,
        `Patchiness Risk` = Patchiness_Risk,
        `CV Value` = CV
@ -1566,7 +1542,7 @@ if (!exists("field_details_table") || is.null(field_details_table) || nrow(field
        `Field Size (acres)` = field_size_acres,
        `Mean CI` = Mean_CI,
        `Yield Forecast (t/ha)` = TCH_Forecasted,
-        `Gap Score` = Gap_Score,
+        `Gap Score %` = Gap_Score,
        `Decline Risk` = Decline_Severity,
        `Patchiness Risk` = Patchiness_Risk,
        `CV Value` = CV
@ -1639,13 +1615,13 @@ CI values typically range from 0 (bare soil or severely stressed vegetation) to
     - **Why it matters:** Uniform fields are easier to manage and typically produce better yields. Uneven growth suggests irrigation problems, fertility gaps, pests, or disease.
   
   - **KPI 2: Area Change (Weekly Growth)** — Tracks week-over-week CI changes to detect rapid improvements or declines
-     - **Calculation:** ((Current Mean CI − Previous Mean CI) / Previous Mean CI) × 100
+     - **Calculation:** Current Mean CI − Previous Mean CI (absolute change in CI units)
     - **Categories:**
-       - **Rapid growth:** > 15% increase (excellent weekly progress)
-       - **Positive growth:** 5–15% increase (steady improvement)
-       - **Stable:** −5% to +5% (field maintained, no significant change)
-       - **Declining:** −5% to −15% (slow decline, warrant closer monitoring)
-       - **Rapid decline:** < −15% (alert: urgent issue requiring investigation)
+       - **Rapid growth:** > +0.5 (excellent weekly progress)
+       - **Positive growth:** +0.2 to +0.5 (steady improvement)
+       - **Stable:** −0.2 to +0.2 (field maintained, no significant change)
+       - **Declining:** −0.5 to −0.2 (slow decline, warrant closer monitoring)
+       - **Rapid decline:** < −0.5 (alert: urgent issue requiring investigation)
     - **Why it matters:** Week-to-week changes reveal developing problems early, enabling timely intervention.
   
   - **KPI 3: TCH Forecasted (Yield Prediction)** — Predicts final harvest tonnage for mature fields
@ -1692,12 +1668,15 @@ CI values typically range from 0 (bare soil or severely stressed vegetation) to
     Your scouting and remediation strategy should differ: Field A might need systemic irrigation adjustment or disease management; Field B might need soil remediation in the affected corner. **Patchiness tells you *where to focus your effort*.**
   
   - **KPI 6: Gap Score (Establishment Quality)** — Quantifies field gaps and areas of poor crop establishment
-     - **Metric:** Percentage of field area with very low CI values (lowest 25% of pixel distribution)
-     - **Levels:**
-       - **Minimal:** < 10% of field (good establishment, limited replanting needed)
-       - **Moderate:** 10–25% of field (monitor gap expansion, coordinate with agronomy)
-       - **Significant:** ≥ 25% of field (consider targeted replanting or rehabilitation)
-     - **Why it matters:** High gap scores indicate potential yield losses and may warrant management intervention.
+     - **Calculation Method:** Statistical outlier detection (2σ method)
+       - Identifies pixels with CI below: **Median CI − (2 × Standard Deviation)**
+       - Calculates: **Gap Score = (Outlier Pixels / Total Pixels) × 100**
+       - Example: If 2 of 100 pixels fall below threshold → Gap Score = 2%
+     - **Score Ranges & Interpretation:**
+       - **0–10%:** Minimal gaps (excellent establishment, healthy field)
+       - **10–25%:** Moderate gaps (monitor for expansion, coordinate with agronomy)
+       - **≥ 25%:** Significant gaps (consider targeted replanting or rehabilitation)
+     - **Why it matters:** Gap scores reveal areas of poor establishment that may indicate early growth problems or harvest-related residue issues. Lower is better (0–3% is typical for healthy fields).

 2. **Overview Map: Growth on Farm:**  
   Provides a traffic light overview of field-by-field growth status for quick prioritization and reporting.
--- a/r_app/report_utils.R
+++ b/r_app/report_utils.R
@ -1165,4 +1165,33 @@ generate_field_kpi_summary <- function(field_name, field_details_table, CI_quadr
  })
 }

+#' Normalize field_details_table column structure
+#'
+#' Standardizes column names and ensures all expected KPI columns exist.
+#' Handles Field → Field_id rename and injects missing columns as NA.
+#'
+#' @param field_details_table data.frame to normalize
+#' @return data.frame with standardized column structure
+normalize_field_details_columns <- function(field_details_table) {
+  if (is.null(field_details_table) || nrow(field_details_table) == 0) {
+    return(field_details_table)
+  }
  
+  # Rename Field → Field_id if needed
+  if ("Field" %in% names(field_details_table) && !("Field_id" %in% names(field_details_table))) {
+    field_details_table <- field_details_table %>%
+      dplyr::rename(Field_id = Field)
+  }
+  
+  # Ensure all expected KPI columns exist; add as NA if missing
+  expected_cols <- c("Field_id", "Mean_CI", "CV", "TCH_Forecasted", "Gap_Score", 
+                     "Trend_Interpretation", "Weekly_CI_Change", "Uniformity_Interpretation",
+                     "Decline_Severity", "Patchiness_Risk")
+  for (col in expected_cols) {
+    if (!col %in% names(field_details_table)) {
+      field_details_table[[col]] <- NA
+    }
+  }
+  
+  return(field_details_table)
+}