Refactor gap filling score calculations to use 1σ method; ensure all fields are included in output

2026-02-24 13:48:06 +01:00 · 2026-02-24 13:48:06 +01:00 · 83951efca2
parent e0bfbccf0e
commit 83951efca2
3 changed files with 81 additions and 49 deletions
--- a/r_app/80_utils_cane_supply.R
+++ b/r_app/80_utils_cane_supply.R
@ -80,7 +80,8 @@ calculate_field_acreages <- function(field_boundaries_sf, unit = AREA_UNIT_PREFE
    
    result_df <- data.frame(
      field = field_names,
-    warning(paste("Could not calculate areas from geometries -", e$message))      stringsAsFactors = FALSE
+      area = areas,
+      stringsAsFactors = FALSE
    )
    colnames(result_df) <- c("field", col_name)
    
@ -213,14 +214,19 @@ calculate_status_alert <- function(imminent_prob, age_week, mean_ci,



-#' Build complete per-field KPI dataframe with all 22 columns
-#' @param current_stats data.frame with current week statistics from load_or_calculate_weekly_stats
+#' Build complete per-field KPI dataframe - ALWAYS ALL FIELDS from GeoJSON
+#' 
+#' CRITICAL: Always outputs all fields from field_boundaries_sf, regardless of data availability.
+#' Fields without satellite data this week have all KPI columns set to NA.
+#' This is dynamic - output row count equals nrow(field_boundaries_sf), not hardcoded.
+#'
+#' @param current_stats data.frame with current week statistics (may have fewer rows if data is incomplete)
 #' @param planting_dates data.frame with field_id and planting_date columns
 #' @param imminent_prob_data data.frame with Field_id and Imminent_prob_actual columns (or NULL)
-#' @param gap_scores_df data.frame with Field_id and gap_score columns (or NULL)
-#' @param field_boundaries_sf sf object with field geometries
+#' @param gap_scores_df data.frame with Field_id and gap_score columns (guaranteed to have all fields)
+#' @param field_boundaries_sf sf object with field geometries (authoritative source of all field IDs)
 #' @param end_date Date object for current report date
-#' @return data.frame with all 22 KPI columns
+#' @return data.frame with 22 columns and nrow(field_boundaries_sf) rows (all fields, NAs for missing data)
 calculate_all_field_kpis <- function(current_stats, 
                                      planting_dates, 
                                      imminent_prob_data, 
@ -230,6 +236,14 @@ calculate_all_field_kpis <- function(current_stats,
  
  message("\nBuilding final field analysis output...")
  
+  # CRITICAL: Start with ALL fields from field_boundaries_sf (dynamic, based on GeoJSON)
+  # This ensures output always includes all fields, even if they lack satellite data
+  all_fields <- field_boundaries_sf %>%
+    sf::st_drop_geometry() %>%
+    mutate(Field_id = field) %>%
+    select(Field_id) %>%
+    distinct()
+  
  # Pre-calculate areas using unified function
  acreage_lookup <- calculate_field_acreages(field_boundaries_sf, unit = AREA_UNIT_PREFERENCE)
  
@ -237,7 +251,10 @@ calculate_all_field_kpis <- function(current_stats,
  unit_label <- get_area_unit_label(AREA_UNIT_PREFERENCE)
  area_col_name <- paste0("Area_", unit_label)
  
-  field_analysis_df <- current_stats %>%
+  # Left-join current_stats to all_fields to preserve all field IDs
+  # Fields without data in current_stats will have NA values
+  field_analysis_df <- all_fields %>%
+    left_join(current_stats, by = "Field_id") %>%
    mutate(
      # Column 2: Farm_Section (user fills manually)
      Farm_Section = NA_character_,
@ -258,7 +275,7 @@ calculate_all_field_kpis <- function(current_stats,
      
      # Column 9: Age_week (calculated)
      Age_week = {
-        sapply(seq_len(nrow(current_stats)), function(idx) {
+        sapply(seq_len(n()), function(idx) {
          calculate_age_week(Last_harvest_or_planting_date[idx], end_date)
        })
      },
@ -274,13 +291,13 @@ calculate_all_field_kpis <- function(current_stats,
        if (!is.null(imminent_prob_data)) {
          as.numeric(imminent_prob_data$Imminent_prob_actual[match(Field_id, imminent_prob_data$Field_id)])
        } else {
-          rep(NA_real_, nrow(current_stats))
+          rep(NA_real_, n())
        }
      },
      
      # Column 14: Status_Alert (multi-priority logic for harvest/milling workflow)
      Status_Alert = {
-        sapply(seq_len(nrow(current_stats)), function(idx) {
+        sapply(seq_len(n()), function(idx) {
          calculate_status_alert(
            imminent_prob = Imminent_prob[idx],
            age_week = Age_week[idx],
@ -293,17 +310,17 @@ calculate_all_field_kpis <- function(current_stats,
      },
      
      # Column 19b: CV_Trend_Long_Term_Category (categorical slope)
-      CV_Trend_Long_Term_Category = sapply(current_stats$CV_Trend_Long_Term, categorize_cv_trend_long_term),
+      CV_Trend_Long_Term_Category = sapply(CV_Trend_Long_Term, categorize_cv_trend_long_term),
      
      # Column 21: Cloud_pct_clear (binned into intervals)
      Cloud_pct_clear = sapply(Cloud_pct_clear, bin_percentage),
      
-      # Column 22: Gap_score (2σ method)
+      # Column 22: Gap_score (1σ method); gap_scores_df now has all fields
      Gap_score = {
        if (!is.null(gap_scores_df) && nrow(gap_scores_df) > 0) {
-          gap_scores_df$gap_score[match(current_stats$Field_id, gap_scores_df$Field_id)]
+          gap_scores_df$gap_score[match(Field_id, gap_scores_df$Field_id)]
        } else {
-          rep(NA_real_, nrow(current_stats))
+          rep(NA_real_, n())
        }
      }
    ) %>%
@ -316,7 +333,7 @@ calculate_all_field_kpis <- function(current_stats,
               "Imminent_prob", "Cloud_pct_clear", "Cloud_category", "Gap_score"))
    )
  
-  message(paste("✓ Built final output with", nrow(field_analysis_df), "fields and 22 columns"))
+  message(paste("✓ Built final output with", nrow(field_analysis_df), "fields (from GeoJSON) and 22 columns"))
  
  return(field_analysis_df)
 }
--- a/r_app/80_utils_common.R
+++ b/r_app/80_utils_common.R
@ -465,7 +465,7 @@ calculate_regression_slope <- function(values, decimal_places = 2) {
  })
 }

-#' Calculate Gap Filling Score KPI (2σ method)
+#' Calculate Gap Filling Score KPI (1σ method)
 #' @param ci_raster Current week CI raster (single band)
 #' @param field_boundaries Field boundaries (sf or SpatVector)
 #' @return List with summary data frame and field-level results data frame
@ -490,7 +490,7 @@ calculate_gap_filling_kpi <- function(ci_raster, field_boundaries) {
    valid_values <- ci_values[!is.na(ci_values) & is.finite(ci_values)]

    if (length(valid_values) > 1) {
-      # Gap score using 2σ below median to detect outliers
+      # Gap score using 1σ below median to detect outliers
      median_ci <- median(valid_values)
      sd_ci <- sd(valid_values)
      outlier_threshold <- median_ci - (1 * sd_ci)
@ -544,8 +544,17 @@ calculate_gap_filling_kpi <- function(ci_raster, field_boundaries) {
 #' @param field_boundaries_sf sf object with field geometries
 #' @return data.frame with Field_id and gap_score columns
 calculate_gap_scores <- function(per_field_files, field_boundaries_sf) {
-  message("\nCalculating gap filling scores (2σ method)...")
-  message(paste("  Using per-field mosaics for", length(per_field_files), "fields"))
+  message("\nCalculating gap filling scores (1σ method)...")
+  message(paste("  Using per-field mosaics for", nrow(field_boundaries_sf), "fields"))
+  
+  # Initialize results with ALL field IDs (1185 for angata), defaulting to NA
+  # This ensures that even fields without satellite data this week are included in output
+  all_field_ids <- field_boundaries_sf$field
+  gap_scores_df <- data.frame(
+    Field_id = all_field_ids,
+    gap_score = NA_real_,
+    stringsAsFactors = FALSE
+  )
  
  field_boundaries_by_id <- split(field_boundaries_sf, field_boundaries_sf$field)
  
@ -583,37 +592,43 @@ calculate_gap_scores <- function(per_field_files, field_boundaries_sf) {
    })
  }
  
-  # Process fields sequentially with progress bar
-  message("  Processing gap scores for ", length(per_field_files), " fields...")
-  pb <- utils::txtProgressBar(min = 0, max = length(per_field_files), style = 3, width = 50)
+  # Process only fields that have per-field mosaic files
+  n_files <- length(per_field_files)
+  if (n_files > 0) {
+    message("  Processing gap scores for", n_files, "available per-field mosaics...")
+    pb <- utils::txtProgressBar(min = 0, max = n_files, style = 3, width = 50)
    
-  results_list <- lapply(seq_along(per_field_files), function(idx) {
-    result <- process_gap_for_field(per_field_files[[idx]])
-    utils::setTxtProgressBar(pb, idx)
-    result
-  })
-  close(pb)
+    results_list <- lapply(seq_along(per_field_files), function(idx) {
+      result <- process_gap_for_field(per_field_files[[idx]])
+      utils::setTxtProgressBar(pb, idx)
+      result
+    })
+    close(pb)
    
-  gap_scores_df <- dplyr::bind_rows(results_list)
-  
-  if (!is.null(gap_scores_df) && nrow(gap_scores_df) > 0) {
-    gap_scores_df <- gap_scores_df %>%
+    # Merge calculated results into the complete field list
+    results_df <- dplyr::bind_rows(results_list) %>%
      dplyr::group_by(Field_id) %>%
      dplyr::summarise(gap_score = mean(gap_score, na.rm = TRUE), .groups = "drop")
    
-    message(paste("  ✓ Calculated gap scores for", nrow(gap_scores_df), "fields"))
-    
-    # Guard against all-NA values which would produce Inf/-Inf warnings
-    if (any(is.finite(gap_scores_df$gap_score))) {
-      min_score <- round(min(gap_scores_df$gap_score, na.rm = TRUE), 2)
-      max_score <- round(max(gap_scores_df$gap_score, na.rm = TRUE), 2)
-      message(paste("    Gap score range:", min_score, "-", max_score, "%"))
-    } else {
-      message("    Gap score range: All values are NA (no valid gap scores)")
-    }
+    # Update gap_scores_df with calculated values (overwrite NAs where we have data)
+    gap_scores_df <- gap_scores_df %>%
+      left_join(results_df, by = "Field_id") %>%
+      mutate(gap_score = coalesce(gap_score.y, gap_score.x)) %>%
+      select(Field_id, gap_score)
  } else {
-    message("  WARNING: No gap scores calculated from per-field mosaics")
-    gap_scores_df <- NULL
+    message("  No per-field mosaic files available - all gap scores will be NA")
+  }
+  
+  # Always count how many fields got actual calculations (not NA)
+  n_calculated <- sum(!is.na(gap_scores_df$gap_score))
+  message(paste("  ✓ Gap scores for", nrow(gap_scores_df), "fields (", n_calculated, "calculated,", 
+                nrow(gap_scores_df) - n_calculated, "no data)"))
+  
+  # Guard against all-NA values which would produce Inf/-Inf warnings
+  if (any(is.finite(gap_scores_df$gap_score))) {
+    min_score <- round(min(gap_scores_df$gap_score, na.rm = TRUE), 2)
+    max_score <- round(max(gap_scores_df$gap_score, na.rm = TRUE), 2)
+    message(paste("    Gap score range:", min_score, "-", max_score, "%"))
  }
  
  return(gap_scores_df)
--- a/r_app/MANUAL_PIPELINE_RUNNER.R
+++ b/r_app/MANUAL_PIPELINE_RUNNER.R
@ -382,7 +382,7 @@
 #   & "C:\Program Files\R\R-4.4.3\bin\x64\Rscript.exe" r_app/80_calculate_kpis.R [END_DATE] [PROJECT] [OFFSET]
 #
 # Example (2026-02-09, angata, 7-day lookback):
-#   & "C:\Program Files\R\R-4.4.3\bin\x64\Rscript.exe" r_app/80_calculate_kpis.R 2026-02-09 angata 7
+#   
 #
 # EXPECTED OUTPUT:
 #   File: {PROJECT}_field_analysis_week{WW}_{YYYY}.xlsx
@ -456,8 +456,8 @@ rmarkdown::render(
 #   rmarkdown::render(
 rmarkdown::render(
  "r_app/91_CI_report_with_kpis_cane_supply.Rmd",
-  params = list(data_dir = "angata", report_date = as.Date("2026-02-04")),
-  output_file = "SmartCane_Report_cane_supply_angata_2026-02-04.docx",
+  params = list(data_dir = "angata", report_date = as.Date("2026-02-23")),
+  output_file = "SmartCane_Report_cane_supply_angata_2026-02-23_en.docx",
  output_dir = "laravel_app/storage/app/angata/reports"
 )
 #