From 83951efca21ab88057909c7d763c45e1593f2fb1 Mon Sep 17 00:00:00 2001
From: Timon <timon@resiliencebv.com>
Date: Tue, 24 Feb 2026 13:48:06 +0100
Subject: [PATCH] =?UTF-8?q?Refactor=20gap=20filling=20score=20calculations?=
 =?UTF-8?q?=20to=20use=201=CF=83=20method;=20ensure=20all=20fields=20are?=
 =?UTF-8?q?=20included=20in=20output?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 r_app/80_utils_cane_supply.R   | 47 ++++++++++++++-------
 r_app/80_utils_common.R        | 77 ++++++++++++++++++++--------------
 r_app/MANUAL_PIPELINE_RUNNER.R |  6 +--
 3 files changed, 81 insertions(+), 49 deletions(-)

diff --git a/r_app/80_utils_cane_supply.R b/r_app/80_utils_cane_supply.R
index ca896a1..6b50033 100644
--- a/r_app/80_utils_cane_supply.R
+++ b/r_app/80_utils_cane_supply.R
@@ -80,7 +80,8 @@ calculate_field_acreages <- function(field_boundaries_sf, unit = AREA_UNIT_PREFE
     
     result_df <- data.frame(
       field = field_names,
-    warning(paste("Could not calculate areas from geometries -", e$message))      stringsAsFactors = FALSE
+      area = areas,
+      stringsAsFactors = FALSE
     )
     colnames(result_df) <- c("field", col_name)
     
@@ -213,14 +214,19 @@ calculate_status_alert <- function(imminent_prob, age_week, mean_ci,
 
 
 
-#' Build complete per-field KPI dataframe with all 22 columns
-#' @param current_stats data.frame with current week statistics from load_or_calculate_weekly_stats
+#' Build complete per-field KPI dataframe - ALWAYS ALL FIELDS from GeoJSON
+#' 
+#' CRITICAL: Always outputs all fields from field_boundaries_sf, regardless of data availability.
+#' Fields without satellite data this week have all KPI columns set to NA.
+#' This is dynamic - output row count equals nrow(field_boundaries_sf), not hardcoded.
+#'
+#' @param current_stats data.frame with current week statistics (may have fewer rows if data is incomplete)
 #' @param planting_dates data.frame with field_id and planting_date columns
 #' @param imminent_prob_data data.frame with Field_id and Imminent_prob_actual columns (or NULL)
-#' @param gap_scores_df data.frame with Field_id and gap_score columns (or NULL)
-#' @param field_boundaries_sf sf object with field geometries
+#' @param gap_scores_df data.frame with Field_id and gap_score columns (guaranteed to have all fields)
+#' @param field_boundaries_sf sf object with field geometries (authoritative source of all field IDs)
 #' @param end_date Date object for current report date
-#' @return data.frame with all 22 KPI columns
+#' @return data.frame with 22 columns and nrow(field_boundaries_sf) rows (all fields, NAs for missing data)
 calculate_all_field_kpis <- function(current_stats, 
                                       planting_dates, 
                                       imminent_prob_data, 
@@ -230,6 +236,14 @@ calculate_all_field_kpis <- function(current_stats,
   
   message("\nBuilding final field analysis output...")
   
+  # CRITICAL: Start with ALL fields from field_boundaries_sf (dynamic, based on GeoJSON)
+  # This ensures output always includes all fields, even if they lack satellite data
+  all_fields <- field_boundaries_sf %>%
+    sf::st_drop_geometry() %>%
+    mutate(Field_id = field) %>%
+    select(Field_id) %>%
+    distinct()
+  
   # Pre-calculate areas using unified function
   acreage_lookup <- calculate_field_acreages(field_boundaries_sf, unit = AREA_UNIT_PREFERENCE)
   
@@ -237,7 +251,10 @@ calculate_all_field_kpis <- function(current_stats,
   unit_label <- get_area_unit_label(AREA_UNIT_PREFERENCE)
   area_col_name <- paste0("Area_", unit_label)
   
-  field_analysis_df <- current_stats %>%
+  # Left-join current_stats to all_fields to preserve all field IDs
+  # Fields without data in current_stats will have NA values
+  field_analysis_df <- all_fields %>%
+    left_join(current_stats, by = "Field_id") %>%
     mutate(
       # Column 2: Farm_Section (user fills manually)
       Farm_Section = NA_character_,
@@ -258,7 +275,7 @@ calculate_all_field_kpis <- function(current_stats,
       
       # Column 9: Age_week (calculated)
       Age_week = {
-        sapply(seq_len(nrow(current_stats)), function(idx) {
+        sapply(seq_len(n()), function(idx) {
           calculate_age_week(Last_harvest_or_planting_date[idx], end_date)
         })
       },
@@ -274,13 +291,13 @@ calculate_all_field_kpis <- function(current_stats,
         if (!is.null(imminent_prob_data)) {
           as.numeric(imminent_prob_data$Imminent_prob_actual[match(Field_id, imminent_prob_data$Field_id)])
         } else {
-          rep(NA_real_, nrow(current_stats))
+          rep(NA_real_, n())
         }
       },
       
       # Column 14: Status_Alert (multi-priority logic for harvest/milling workflow)
       Status_Alert = {
-        sapply(seq_len(nrow(current_stats)), function(idx) {
+        sapply(seq_len(n()), function(idx) {
           calculate_status_alert(
             imminent_prob = Imminent_prob[idx],
             age_week = Age_week[idx],
@@ -293,17 +310,17 @@ calculate_all_field_kpis <- function(current_stats,
       },
       
       # Column 19b: CV_Trend_Long_Term_Category (categorical slope)
-      CV_Trend_Long_Term_Category = sapply(current_stats$CV_Trend_Long_Term, categorize_cv_trend_long_term),
+      CV_Trend_Long_Term_Category = sapply(CV_Trend_Long_Term, categorize_cv_trend_long_term),
       
       # Column 21: Cloud_pct_clear (binned into intervals)
       Cloud_pct_clear = sapply(Cloud_pct_clear, bin_percentage),
       
-      # Column 22: Gap_score (2σ method)
+      # Column 22: Gap_score (1σ method); gap_scores_df now has all fields
       Gap_score = {
         if (!is.null(gap_scores_df) && nrow(gap_scores_df) > 0) {
-          gap_scores_df$gap_score[match(current_stats$Field_id, gap_scores_df$Field_id)]
+          gap_scores_df$gap_score[match(Field_id, gap_scores_df$Field_id)]
         } else {
-          rep(NA_real_, nrow(current_stats))
+          rep(NA_real_, n())
         }
       }
     ) %>%
@@ -316,7 +333,7 @@ calculate_all_field_kpis <- function(current_stats,
                "Imminent_prob", "Cloud_pct_clear", "Cloud_category", "Gap_score"))
     )
   
-  message(paste("✓ Built final output with", nrow(field_analysis_df), "fields and 22 columns"))
+  message(paste("✓ Built final output with", nrow(field_analysis_df), "fields (from GeoJSON) and 22 columns"))
   
   return(field_analysis_df)
 }
diff --git a/r_app/80_utils_common.R b/r_app/80_utils_common.R
index f74b7d9..0eb9da5 100644
--- a/r_app/80_utils_common.R
+++ b/r_app/80_utils_common.R
@@ -465,7 +465,7 @@ calculate_regression_slope <- function(values, decimal_places = 2) {
   })
 }
 
-#' Calculate Gap Filling Score KPI (2σ method)
+#' Calculate Gap Filling Score KPI (1σ method)
 #' @param ci_raster Current week CI raster (single band)
 #' @param field_boundaries Field boundaries (sf or SpatVector)
 #' @return List with summary data frame and field-level results data frame
@@ -490,7 +490,7 @@ calculate_gap_filling_kpi <- function(ci_raster, field_boundaries) {
     valid_values <- ci_values[!is.na(ci_values) & is.finite(ci_values)]
 
     if (length(valid_values) > 1) {
-      # Gap score using 2σ below median to detect outliers
+      # Gap score using 1σ below median to detect outliers
       median_ci <- median(valid_values)
       sd_ci <- sd(valid_values)
       outlier_threshold <- median_ci - (1 * sd_ci)
@@ -544,8 +544,17 @@ calculate_gap_filling_kpi <- function(ci_raster, field_boundaries) {
 #' @param field_boundaries_sf sf object with field geometries
 #' @return data.frame with Field_id and gap_score columns
 calculate_gap_scores <- function(per_field_files, field_boundaries_sf) {
-  message("\nCalculating gap filling scores (2σ method)...")
-  message(paste("  Using per-field mosaics for", length(per_field_files), "fields"))
+  message("\nCalculating gap filling scores (1σ method)...")
+  message(paste("  Using per-field mosaics for", nrow(field_boundaries_sf), "fields"))
+  
+  # Initialize results with ALL field IDs (1185 for angata), defaulting to NA
+  # This ensures that even fields without satellite data this week are included in output
+  all_field_ids <- field_boundaries_sf$field
+  gap_scores_df <- data.frame(
+    Field_id = all_field_ids,
+    gap_score = NA_real_,
+    stringsAsFactors = FALSE
+  )
   
   field_boundaries_by_id <- split(field_boundaries_sf, field_boundaries_sf$field)
   
@@ -583,37 +592,43 @@ calculate_gap_scores <- function(per_field_files, field_boundaries_sf) {
     })
   }
   
-  # Process fields sequentially with progress bar
-  message("  Processing gap scores for ", length(per_field_files), " fields...")
-  pb <- utils::txtProgressBar(min = 0, max = length(per_field_files), style = 3, width = 50)
-  
-  results_list <- lapply(seq_along(per_field_files), function(idx) {
-    result <- process_gap_for_field(per_field_files[[idx]])
-    utils::setTxtProgressBar(pb, idx)
-    result
-  })
-  close(pb)
-  
-  gap_scores_df <- dplyr::bind_rows(results_list)
-  
-  if (!is.null(gap_scores_df) && nrow(gap_scores_df) > 0) {
-    gap_scores_df <- gap_scores_df %>%
+  # Process only fields that have per-field mosaic files
+  n_files <- length(per_field_files)
+  if (n_files > 0) {
+    message("  Processing gap scores for", n_files, "available per-field mosaics...")
+    pb <- utils::txtProgressBar(min = 0, max = n_files, style = 3, width = 50)
+    
+    results_list <- lapply(seq_along(per_field_files), function(idx) {
+      result <- process_gap_for_field(per_field_files[[idx]])
+      utils::setTxtProgressBar(pb, idx)
+      result
+    })
+    close(pb)
+    
+    # Merge calculated results into the complete field list
+    results_df <- dplyr::bind_rows(results_list) %>%
       dplyr::group_by(Field_id) %>%
       dplyr::summarise(gap_score = mean(gap_score, na.rm = TRUE), .groups = "drop")
     
-    message(paste("  ✓ Calculated gap scores for", nrow(gap_scores_df), "fields"))
-    
-    # Guard against all-NA values which would produce Inf/-Inf warnings
-    if (any(is.finite(gap_scores_df$gap_score))) {
-      min_score <- round(min(gap_scores_df$gap_score, na.rm = TRUE), 2)
-      max_score <- round(max(gap_scores_df$gap_score, na.rm = TRUE), 2)
-      message(paste("    Gap score range:", min_score, "-", max_score, "%"))
-    } else {
-      message("    Gap score range: All values are NA (no valid gap scores)")
-    }
+    # Update gap_scores_df with calculated values (overwrite NAs where we have data)
+    gap_scores_df <- gap_scores_df %>%
+      left_join(results_df, by = "Field_id") %>%
+      mutate(gap_score = coalesce(gap_score.y, gap_score.x)) %>%
+      select(Field_id, gap_score)
   } else {
-    message("  WARNING: No gap scores calculated from per-field mosaics")
-    gap_scores_df <- NULL
+    message("  No per-field mosaic files available - all gap scores will be NA")
+  }
+  
+  # Always count how many fields got actual calculations (not NA)
+  n_calculated <- sum(!is.na(gap_scores_df$gap_score))
+  message(paste("  ✓ Gap scores for", nrow(gap_scores_df), "fields (", n_calculated, "calculated,", 
+                nrow(gap_scores_df) - n_calculated, "no data)"))
+  
+  # Guard against all-NA values which would produce Inf/-Inf warnings
+  if (any(is.finite(gap_scores_df$gap_score))) {
+    min_score <- round(min(gap_scores_df$gap_score, na.rm = TRUE), 2)
+    max_score <- round(max(gap_scores_df$gap_score, na.rm = TRUE), 2)
+    message(paste("    Gap score range:", min_score, "-", max_score, "%"))
   }
   
   return(gap_scores_df)
diff --git a/r_app/MANUAL_PIPELINE_RUNNER.R b/r_app/MANUAL_PIPELINE_RUNNER.R
index 0133024..9575bfd 100644
--- a/r_app/MANUAL_PIPELINE_RUNNER.R
+++ b/r_app/MANUAL_PIPELINE_RUNNER.R
@@ -382,7 +382,7 @@
 #   & "C:\Program Files\R\R-4.4.3\bin\x64\Rscript.exe" r_app/80_calculate_kpis.R [END_DATE] [PROJECT] [OFFSET]
 #
 # Example (2026-02-09, angata, 7-day lookback):
-#   & "C:\Program Files\R\R-4.4.3\bin\x64\Rscript.exe" r_app/80_calculate_kpis.R 2026-02-09 angata 7
+#   
 #
 # EXPECTED OUTPUT:
 #   File: {PROJECT}_field_analysis_week{WW}_{YYYY}.xlsx
@@ -456,8 +456,8 @@ rmarkdown::render(
 #   rmarkdown::render(
 rmarkdown::render(
   "r_app/91_CI_report_with_kpis_cane_supply.Rmd",
-  params = list(data_dir = "angata", report_date = as.Date("2026-02-04")),
-  output_file = "SmartCane_Report_cane_supply_angata_2026-02-04.docx",
+  params = list(data_dir = "angata", report_date = as.Date("2026-02-23")),
+  output_file = "SmartCane_Report_cane_supply_angata_2026-02-23_en.docx",
   output_dir = "laravel_app/storage/app/angata/reports"
 )
 #