From 4e94a9a78bba1f136a07f2a99b34c9fcabbdcd60 Mon Sep 17 00:00:00 2001
From: Timon <timon@resiliencebv.com>
Date: Thu, 15 Jan 2026 15:35:16 +0100
Subject: [PATCH] expanding csv table

---
 r_app/80_calculate_kpis.R | 289 +++++++++++++++++++++++++-------------
 1 file changed, 193 insertions(+), 96 deletions(-)

diff --git a/r_app/80_calculate_kpis.R b/r_app/80_calculate_kpis.R
index 59a9195..816ad83 100644
--- a/r_app/80_calculate_kpis.R
+++ b/r_app/80_calculate_kpis.R
@@ -10,7 +10,7 @@
 #   - Per-field analysis with SC-64 enhancements (4-week trends, CI percentiles, etc.)
 #   - Farm-level KPI calculation (6 metrics for executive overview)
 #   - Parallel processing (tile-aware, 1000+ fields supported)
-#   - Comprehensive Excel + RDS + CSV exports
+#   - Comprehensive Excel + RDS + CSV exports (21 columns per spec)
 #   - Test mode for development
 #
 # COMMAND-LINE USAGE:
@@ -26,6 +26,71 @@
 #   source("r_app/80_calculate_kpis.R")
 #   main()
 
+# ============================================================================
+# EXCEL OUTPUT SPECIFICATION (21 COLUMNS)
+# ============================================================================
+# This script exports 21 columns per the field analysis specification:
+#
+# COMPLETED/IN PROGRESS:
+#  1. Field_id ✓         - Unique field identifier
+#  2. Farm_Section       - Management zone (to be filled by user)
+#  3. Field_name ✓       - Client-facing field name (from GeoJSON)
+#  4. Acreage ✓          - Field size in acres
+#  5. Mean_CI ✓          - Average Chlorophyll Index
+#  6. Weekly_ci_change ✓ - Week-over-week CI change
+#  7. Four_week_trend    - [FUTURE] Trend over 4 weeks (requires historical mosaics)
+#  8. Last_harvest_or_planting_date - [DUMMY for now] Will be from harvest Excel + LSTM (script 31)
+#  9. Age_week ✓         - Weeks since planting
+# 10. Phase (age based) ✓ - Growth phase (Germination, Tillering, Grand Growth, Maturation)
+# 11. nmr_weeks_in_this_phase - [TODO] Weeks spent in current phase (track phase transitions)
+# 12. Germination_progress - [TODO] % pixels with CI >= threshold (default 2, for age < 4 months)
+# 13. Imminent_prob      - [DUMMY for now] Harvest probability (will be from script 31 output)
+# 14. Status_trigger ✓   - Alerts (harvest_ready, stress, etc.)
+# 15. CI_range (min-max) - [TODO] Min and max CI values in field
+# 16. CI_Percentiles ✓   - 10th-90th percentile of CI (p10-p90 format)
+# 17. CV ✓               - Coefficient of variation (field uniformity)
+# 18. CV_Trend_Short_Term - [TODO] 2-week CV trend (current week CV - last week CV)
+# 19. CV_Trend_Long_Term - [FUTURE] 8-week CV slope (requires linear regression, historical mosaics)
+# 20. Cloud_pct_clear ✓  - % field visible (pixel coverage)
+# 21. Cloud_category ✓   - Cloud classification (Clear view / Partial coverage / No image available)
+#
+# IMPLEMENTATION PLAN (ordered by difficulty):
+# ============================================================================
+# PHASE 1 - EASY (Current data only):
+#   [✓] Remove Mean_CI_prev column
+#   [✓] Add Field_name column (from field_boundaries_sf$field)
+#   [✓] Add Farm_Section column (empty, user will fill)
+#   [✓] Add Last_harvest_or_planting_date (use UNIFORM_PLANTING_DATE as dummy)
+#   [✓] Add CI_range (min/max from pixel extraction)
+#   [✓] Add Cloud_pct_clear (% from pixel coverage)
+#   [✓] Column order: Reorder to match spec (1-21)
+#
+# PHASE 1 LIMITATION (Known Issue - To Fix in PHASE 2):
+#   - Fields spanning multiple tiles currently show statistics from first intersecting tile only
+#   - This results in p10 ≈ p90 (few pixels per tile) instead of field-wide percentiles
+#   - FIX: After extracting all tiles, group by field_id and aggregate pixel values across all tiles
+#         before calculating percentiles. This will give true field-wide statistics.
+#
+# PHASE 2 - MEDIUM (Requires computation):
+#   [ ] Add nmr_weeks_in_this_phase (track phase transitions with previous week CSV)
+#   [ ] Add Germination_progress (% pixels CI >= GERMINATION_CI_THRESHOLD, configurable)
+#   [ ] Add Imminent_prob column (dummy NA, will merge from script 31 harvest_imminent_weekly.csv)
+#   [ ] Add CV_Trend_Short_Term (requires loading last week's CV values)
+#
+# PHASE 3 - COMPLEX (Requires historical data):
+#   [ ] Add Four_week_trend (CI value difference week vs 4 weeks ago, requires loading prev mosaics)
+#   [ ] Add CV_Trend_Long_Term (8-week slope: linear regression on 8 weeks of CV, suggests lm())
+#   [ ] Load previous week's CSV to cross-check phase transitions and trends
+#
+# NOTES:
+#   - Script 31 (harvest_imminent_weekly.py) outputs: field, imminent_prob, detected_prob, week, year
+#   - Will need to LEFT JOIN on (field, week, year) to populate Imminent_prob
+#   - Phase transition logic: Compare this week's Phase vs last week's Phase from CSV
+#   - For 8-week CV slope: Linear regression slope = (CV_week8 - CV_week1) / 7 weeks (approximately)
+#     or use lm(CV ~ week) on 8-week sequence for proper slope calculation
+#   - Germination_progress only calculated if Age_week < 17 (before end of Tillering phase)
+#   - Cloud_pct_clear calculated as: (pixel_count / expected_pixels) * 100
+
 # ============================================================================
 # *** CONFIGURATION SECTION - MANUALLY DEFINED THRESHOLDS ***
 # ============================================================================
@@ -34,6 +99,10 @@
 TEST_MODE <- TRUE
 TEST_MODE_NUM_WEEKS <- 2
 
+# GERMINATION PROGRESS THRESHOLD
+# Percentage of pixels that must reach this CI value to count as "germinated"
+GERMINATION_CI_THRESHOLD <- 2.0  # Pixels with CI >= 2 count as germinated
+
 # FOUR-WEEK TREND THRESHOLDS
 FOUR_WEEK_TREND_STRONG_GROWTH_MIN <- 0.5
 FOUR_WEEK_TREND_GROWTH_MIN <- 0.1
@@ -522,54 +591,41 @@ analyze_single_field <- function(field_idx, field_boundaries_sf, tile_grid, week
       ))
     }
     
-    # Extract CI values: EXACTLY LIKE SCRIPT 20
-    # Crop to field bounding box first, then extract with sf directly (not terra::vect conversion)
+    # SINGLE EXTRACTION: Get all pixel values for this field, then calculate all stats from it
     field_bbox <- sf::st_bbox(field_sf)
     ci_cropped <- terra::crop(current_ci, terra::ext(field_bbox), snap = "out")
-    extracted_vals <- terra::extract(ci_cropped, field_sf, fun = "mean", na.rm = TRUE)
     
-    # extracted_vals is a data.frame with ID column (field index) + mean value
-    mean_ci_current <- as.numeric(extracted_vals[1, 2])
+    # Extract all pixels in one call (no fun= parameter means we get raw pixel values)
+    all_extracted <- terra::extract(ci_cropped, field_sf)[, 2]
+    current_ci_vals <- all_extracted[!is.na(all_extracted)]
     
-    if (is.na(mean_ci_current)) {
+    if (length(current_ci_vals) == 0) {
       return(data.frame(
         Field_id = field_id,
         error = "No CI values extracted from tiles"
       ))
     }
     
-    # For per-tile extraction, we only have mean from the aggregation function
-    # To get variance/CV, we need to extract all pixels without the fun parameter
-    # But for farm-level purposes, the mean CI is sufficient
-    all_extracted <- terra::extract(ci_cropped, field_sf)[, 2]
-    current_ci_vals <- all_extracted[!is.na(all_extracted)]
+    # Calculate all statistics from the single extraction
+    mean_ci_current <- mean(current_ci_vals, na.rm = TRUE)
+    ci_std <- sd(current_ci_vals, na.rm = TRUE)
+    cv_current <- if (mean_ci_current > 0) ci_std / mean_ci_current else NA_real_
+    range_min <- min(current_ci_vals, na.rm = TRUE)
+    range_max <- max(current_ci_vals, na.rm = TRUE)
+    range_str <- sprintf("%.1f-%.1f", range_min, range_max)
+    ci_percentiles_str <- get_ci_percentiles(current_ci_vals)
     
+    # Cloud coverage from extraction metadata
     num_total <- length(all_extracted)
-    num_data <- sum(!is.na(all_extracted))
+    num_data <- length(current_ci_vals)
     pct_clear <- if (num_total > 0) round((num_data / num_total) * 100, 1) else 0
-    
     cloud_cat <- if (num_data == 0) "No image available"
                  else if (pct_clear >= 99.5) "Clear view"
                  else "Partial coverage"
     cloud_pct <- 100 - pct_clear
     cloud_interval <- round_cloud_to_intervals(pct_clear)
     
-    if (length(current_ci_vals) == 0) {
-      return(data.frame(
-        Field_id = field_id,
-        error = "No CI values extracted"
-      ))
-    }
-    
-    mean_ci_current <- mean(current_ci_vals, na.rm = TRUE)
-    ci_std <- sd(current_ci_vals, na.rm = TRUE)
-    cv_current <- ci_std / mean_ci_current
-    range_min <- min(current_ci_vals, na.rm = TRUE)
-    range_max <- max(current_ci_vals, na.rm = TRUE)
-    range_str <- sprintf("%.1f-%.1f", range_min, range_max)
-    
-    ci_percentiles_str <- get_ci_percentiles(current_ci_vals)
-    
+    # Weekly change (extract previous week same way - single extraction)
     weekly_ci_change <- NA
     previous_ci_vals <- NULL
     
@@ -578,8 +634,8 @@ analyze_single_field <- function(field_idx, field_boundaries_sf, tile_grid, week
       if (!is.null(previous_ci)) {
         prev_bbox <- sf::st_bbox(field_sf)
         prev_ci_cropped <- terra::crop(previous_ci, terra::ext(prev_bbox), snap = "out")
-        prev_extracted <- terra::extract(prev_ci_cropped, field_sf)[, 2]
-        previous_ci_vals <- prev_extracted[!is.na(prev_extracted)]
+        prev_extracted_all <- terra::extract(prev_ci_cropped, field_sf)[, 2]
+        previous_ci_vals <- prev_extracted_all[!is.na(prev_extracted_all)]
         if (length(previous_ci_vals) > 0) {
           mean_ci_previous <- mean(previous_ci_vals, na.rm = TRUE)
           weekly_ci_change <- mean_ci_current - mean_ci_previous
@@ -743,11 +799,11 @@ generate_field_analysis_summary <- function(field_df) {
   
   total_acreage <- sum(field_df$Acreage, na.rm = TRUE)
   
-  germination_acreage <- sum(field_df$Acreage[field_df$`Phase (age based)` == "Germination"], na.rm = TRUE)
-  tillering_acreage <- sum(field_df$Acreage[field_df$`Phase (age based)` == "Tillering"], na.rm = TRUE)
-  grand_growth_acreage <- sum(field_df$Acreage[field_df$`Phase (age based)` == "Grand Growth"], na.rm = TRUE)
-  maturation_acreage <- sum(field_df$Acreage[field_df$`Phase (age based)` == "Maturation"], na.rm = TRUE)
-  unknown_phase_acreage <- sum(field_df$Acreage[field_df$`Phase (age based)` == "Unknown"], na.rm = TRUE)
+  germination_acreage <- sum(field_df$Acreage[field_df$Phase == "Germination"], na.rm = TRUE)
+  tillering_acreage <- sum(field_df$Acreage[field_df$Phase == "Tillering"], na.rm = TRUE)
+  grand_growth_acreage <- sum(field_df$Acreage[field_df$Phase == "Grand Growth"], na.rm = TRUE)
+  maturation_acreage <- sum(field_df$Acreage[field_df$Phase == "Maturation"], na.rm = TRUE)
+  unknown_phase_acreage <- sum(field_df$Acreage[field_df$Phase == "Unknown"], na.rm = TRUE)
   
   harvest_ready_acreage <- sum(field_df$Acreage[field_df$Status_trigger == "harvest_ready"], na.rm = TRUE)
   stress_acreage <- sum(field_df$Acreage[field_df$Status_trigger == "stress_detected_whole_field"], na.rm = TRUE)
@@ -1070,55 +1126,58 @@ extract_field_statistics_from_ci <- function(ci_band, field_boundaries_sf) {
   #' 
   #' @param ci_band Single CI band from terra raster
   #' @param field_boundaries_sf SF object with field geometries
-  #' @return Data frame with columns: field_idx, mean_ci, cv, p10, p90, pixel_count
+  #' @return Data frame with columns: field_idx, mean_ci, cv, p10, p90, min_ci, max_ci, pixel_count_valid, pixel_count_total
   
-  # Extract all pixels for all fields at once (more efficient than individual calls)
-  all_pixels <- terra::extract(ci_band, field_boundaries_sf)
+  # SINGLE EXTRACTION: Get all pixels for all fields at once (no aggregation function)
+  # Result: data.frame with ID column (field indices) and value column (pixel values)
+  extract_result <- terra::extract(ci_band, field_boundaries_sf)
   
   # Calculate statistics for each field
   stats_list <- list()
   
   for (field_idx in seq_len(nrow(field_boundaries_sf))) {
-    # Extract pixel values for this field (skip ID column 1)
-    pixels <- all_pixels[field_idx, -1, drop = TRUE]
-    pixels <- as.numeric(pixels)
-    pixels <- pixels[!is.na(pixels)]
+    # Get all pixels for this field from the single extraction
+    # extract_result has columns [ID, value] where ID is field index (1-based)
+    field_pixels <- extract_result[extract_result$ID == field_idx, 2]
+    pixels <- as.numeric(field_pixels[!is.na(field_pixels)])  # Remove NAs
     
-    # Only calculate stats if we have pixels
-    if (length(pixels) > 0) {
-      mean_val <- mean(pixels, na.rm = TRUE)
-      
-      # Only calculate CV if mean > 0 (avoid division by zero)
-      if (mean_val > 0) {
-        cv_val <- sd(pixels, na.rm = TRUE) / mean_val
-      } else {
-        cv_val <- NA
-      }
-      
-      p10_val <- quantile(pixels, probs = CI_PERCENTILE_LOW, na.rm = TRUE)[[1]]
-      p90_val <- quantile(pixels, probs = CI_PERCENTILE_HIGH, na.rm = TRUE)[[1]]
-      
-      stats_list[[field_idx]] <- data.frame(
-        field_idx = field_idx,
-        mean_ci = mean_val,
-        cv = cv_val,
-        p10 = p10_val,
-        p90 = p90_val,
-        pixel_count = length(pixels),
-        stringsAsFactors = FALSE
-      )
-    } else {
-      # No pixels for this field (doesn't intersect tile)
+    if (length(pixels) == 0) {
+      # No data for this field
       stats_list[[field_idx]] <- data.frame(
         field_idx = field_idx,
         mean_ci = NA_real_,
         cv = NA_real_,
         p10 = NA_real_,
         p90 = NA_real_,
-        pixel_count = 0,
+        min_ci = NA_real_,
+        max_ci = NA_real_,
+        pixel_count_valid = 0,
+        pixel_count_total = 0,
         stringsAsFactors = FALSE
       )
+      next
     }
+    
+    # Calculate all statistics from pixels array
+    mean_val <- mean(pixels, na.rm = TRUE)
+    cv_val <- if (mean_val > 0) sd(pixels, na.rm = TRUE) / mean_val else NA_real_
+    p10_val <- quantile(pixels, probs = CI_PERCENTILE_LOW, na.rm = TRUE)[[1]]
+    p90_val <- quantile(pixels, probs = CI_PERCENTILE_HIGH, na.rm = TRUE)[[1]]
+    min_val <- min(pixels, na.rm = TRUE)
+    max_val <- max(pixels, na.rm = TRUE)
+    
+    stats_list[[field_idx]] <- data.frame(
+      field_idx = field_idx,
+      mean_ci = mean_val,
+      cv = cv_val,
+      p10 = p10_val,
+      p90 = p90_val,
+      min_ci = min_val,
+      max_ci = max_val,
+      pixel_count_valid = length(pixels),
+      pixel_count_total = nrow(extract_result[extract_result$ID == field_idx, ]),
+      stringsAsFactors = FALSE
+    )
   }
   
   return(dplyr::bind_rows(stats_list))
@@ -1312,6 +1371,13 @@ main <- function() {
         message(paste("    [DEBUG] Extracted", nrow(current_stats), "fields, ", num_with_data, "with non-NA data"))
         if (num_with_data > 0) {
           message(paste("    [DEBUG] Sample mean CIs:", paste(head(current_stats$mean_ci[!is.na(current_stats$mean_ci)], 3), collapse=", ")))
+          # Check percentiles
+          sample_field <- which(!is.na(current_stats$mean_ci))[2]
+          message(paste("    [DEBUG] Field", sample_field, "- p10:", current_stats$p10[sample_field], 
+                       "p90:", current_stats$p90[sample_field], 
+                       "min:", current_stats$min_ci[sample_field],
+                       "max:", current_stats$max_ci[sample_field],
+                       "valid_pixels:", current_stats$pixel_count_valid[sample_field]))
         }
       }
       
@@ -1338,10 +1404,9 @@ main <- function() {
           }
           
           mean_ci_current <- current_stats$mean_ci[field_idx]
-          pixel_count <- current_stats$pixel_count[field_idx]
           
           # SKIP fields with no data in this tile (they don't intersect this tile)
-          if (is.na(pixel_count) || pixel_count == 0) {
+          if (is.na(current_stats$pixel_count_valid[field_idx]) || current_stats$pixel_count_valid[field_idx] == 0) {
             next
           }
           ci_cv_current <- current_stats$cv[field_idx]
@@ -1370,9 +1435,9 @@ main <- function() {
           # Use the percentiles and mean to create a synthetic distribution for status_trigger
           # For now, use mean CI repeated by pixel count for testing
           # TODO: Consider extracting pixels directly if needed for more complex triggers
-          pixel_count <- current_stats$pixel_count[field_idx]
-          ci_vals_current <- if (pixel_count > 0) {
-            rep(mean_ci_current, pixel_count)  # Simplified: use mean value repeated
+          pixel_count_valid <- current_stats$pixel_count_valid[field_idx]
+          ci_vals_current <- if (!is.na(pixel_count_valid) && pixel_count_valid > 0) {
+            rep(mean_ci_current, pixel_count_valid)  # Simplified: use mean value repeated
           } else {
             numeric(0)
           }
@@ -1393,36 +1458,68 @@ main <- function() {
           phase <- get_phase_by_age(age_weeks)
           status_trigger <- get_status_trigger(ci_vals_current, ci_change, age_weeks)
           
-          # Cloud coverage categorization based on CI value
-          # No data = No image available
-          # CI 0.01 to 95 = Partial coverage
-          # CI >= 95 = Clear view
-          if (is.na(mean_ci_current) || mean_ci_current == 0) {
-            cloud_category <- "No image available"
-            # Set all CI metrics to NA since no valid data
-            ci_change <- NA
-            ci_cv_current <- NA
-            ci_percentile_low <- NA
-            ci_percentile_high <- NA
-          } else if (mean_ci_current >= 95) {
-            cloud_category <- "Clear view"
-          } else {
-            cloud_category <- "Partial coverage"
+          # Calculate Cloud_pct_clear: percentage of field with valid data
+          # Binned to 10% intervals (0, 10, 20, ..., 90, 100)
+          cloud_pct_clear <- {
+            valid_count <- current_stats$pixel_count_valid[field_idx]
+            total_count <- current_stats$pixel_count_total[field_idx]
+            if (!is.na(valid_count) && !is.na(total_count) && total_count > 0) {
+              pct <- (valid_count / total_count) * 100
+              round(pct / 10) * 10
+            } else {
+              NA_real_
+            }
           }
           
-          # Build result row
+          # Cloud categorization based on pixel coverage (Cloud_pct_clear)
+          cloud_category <- if (is.na(cloud_pct_clear)) {
+            "No image available"
+          } else if (cloud_pct_clear >= 90) {
+            "Clear view"
+          } else if (cloud_pct_clear > 0) {
+            "Partial coverage"
+          } else {
+            "No image available"
+          }
+          
+          # Get min/max CI values
+          ci_min <- current_stats$min_ci[field_idx]
+          ci_max <- current_stats$max_ci[field_idx]
+          ci_range <- if (!is.na(ci_min) && !is.na(ci_max)) {
+            sprintf("%.1f-%.1f", ci_min, ci_max)
+          } else {
+            NA_character_
+          }
+          
+          # Get field_name from field_boundaries_sf
+          field_name <- field_sf$field
+          
+          # Build result row (21 columns per specification, in order)
           result_row <- data.frame(
             Field_id = field_id,
+            Farm_Section = NA_character_,
+            Field_name = field_name,
             Acreage = field_area_acres,
             Mean_CI = mean_ci_current,
-            Mean_CI_prev = mean_ci_previous,
-            CI_change = ci_change,
-            CI_CV = ci_cv_current,
-            CI_percentile_low = ci_percentile_low,
-            CI_percentile_high = ci_percentile_high,
+            Weekly_ci_change = ci_change,
+            Four_week_trend = NA_character_,
+            Last_harvest_or_planting_date = UNIFORM_PLANTING_DATE,
             Age_weeks = age_weeks,
             Phase = phase,
+            nmr_weeks_in_this_phase = NA_real_,
+            Germination_progress = NA_real_,
+            Imminent_prob = NA_real_,
             Status_trigger = status_trigger,
+            CI_range = ci_range,
+            CI_Percentiles = if (!is.na(ci_percentile_low) && !is.na(ci_percentile_high)) {
+              sprintf("%.1f-%.1f", ci_percentile_low, ci_percentile_high)
+            } else {
+              NA_character_
+            },
+            CV = ci_cv_current,
+            CV_Trend_Short_Term = NA_real_,
+            CV_Trend_Long_Term = NA_real_,
+            Cloud_pct_clear = cloud_pct_clear,
             Cloud_category = cloud_category,
             stringsAsFactors = FALSE
           )