added last columns of trends, only harvest imminent and age left to do

2026-01-16 09:10:39 +01:00 · 2026-01-16 09:10:39 +01:00 · 3e4430b3be
parent 8d84c8cab5
commit 3e4430b3be
1 changed files with 301 additions and 61 deletions
--- a/r_app/80_calculate_kpis.R
+++ b/r_app/80_calculate_kpis.R
@ -26,6 +26,55 @@
 #   source("r_app/80_calculate_kpis.R")
 #   main()

+# ============================================================================
+# PENDING WORK - PHASE 4 (Next Sprint)
+# ============================================================================
+# 
+# CRITICAL INTEGRATIONS:
+#
+# 1. IMMINENT_PROB FROM HARVEST MODEL (MODEL_307)
+#    [ ] Load script 31 output: {project}_imminent_harvest_week{WW}.csv
+#        Columns: field, imminent_prob, detected_prob, week, year
+#    [ ] LEFT JOIN to field_analysis_df by (field, week, year)
+#    [ ] Replace hardcoded "placeholder data" in Status_trigger calculation
+#    [ ] Update column to show actual harvest probability (0-1 or 0-100%)
+#
+# 2. AGE FROM HARVEST.XLSX (SCRIPTS 22 & 31)
+#    [ ] Scripts 22 & 31 populate harvest.xlsx with planting_date per field
+#    [ ] Load harvest.xlsx instead of using UNIFORM_PLANTING_DATE
+#    [ ] Calculate Age_week = difftime(report_date, planting_date, units="weeks")
+#    [ ] Removes TEST MODE hardcoding and enables field-specific aging
+#
+# UTILITY FILE REFACTORING (Improve code maintainability):
+#
+# 3. CREATE r_app/80_weekly_stats_utils.R
+#    [ ] Extract functions from lines 250-795 (calculation layer):
+#        - calculate_field_statistics()
+#        - calculate_kpi_trends()
+#        - load_or_calculate_weekly_stats()
+#        - Helper: load_tiles_for_field(), get_tile_ids_for_field()
+#        - Helper: extract_field_statistics_from_ci()
+#    [ ] Clean separation: DATA CALCULATION ONLY (no Excel export)
+#    [ ] Reusable by run_full_pipeline.R and other scripts
+#
+# 4. CREATE r_app/80_report_building_utils.R
+#    [ ] Extract functions from lines 1350-2100+ (output/reporting layer):
+#        - generate_field_analysis_summary()
+#        - export_field_analysis_excel()
+#        - calculate_and_export_farm_kpis()
+#        - Helper: categorize_*, get_*, round_* functions
+#        - Helper: get_phase_by_age(), get_status_trigger()
+#    [ ] Clean separation: OUTPUT/FORMATTING ONLY (consumes calculated stats)
+#    [ ] Reusable for alternative export formats (PDF, HTML, dashboard)
+#
+# TESTING PLAN:
+# [ ] Verify 8-week historical data loads (currently TEST_MODE = 2 weeks only)
+# [ ] Confirm Four_week_trend calculates from 1-4 weeks (graceful degradation)
+# [ ] Confirm CV_Trend_Long_Term uses full 8-week regression (when available)
+# [ ] Load script 31 output and validate imminent_prob population
+#
+# ============================================================================
+
 # ============================================================================
 # EXCEL OUTPUT SPECIFICATION (21 COLUMNS)
 # ============================================================================
@ -114,6 +163,20 @@ FOUR_WEEK_TREND_STRONG_DECLINE_MAX <- -0.5
 # CV TREND THRESHOLDS
 CV_TREND_THRESHOLD_SIGNIFICANT <- 0.05

+# CV_TREND_LONG_TERM (8-WEEK SLOPE) INTERPRETATION THRESHOLDS
+# Interpretation: Slope of CV over 8 weeks indicates field uniformity trend
+#   Negative slope = CV decreasing = field becoming MORE uniform = GOOD
+#   Positive slope = CV increasing = field becoming MORE patchy = BAD
+#   Near zero = Homogenous growth (all crops progressing equally)
+CV_SLOPE_STRONG_IMPROVEMENT_MIN <- -0.05  # CV decreasing rapidly
+CV_SLOPE_IMPROVEMENT_MIN <- -0.02         # Gradual synchronization
+CV_SLOPE_IMPROVEMENT_MAX <- -0.005        # Becoming uniform
+CV_SLOPE_HOMOGENOUS_MIN <- -0.005         # Stable, uniform growth
+CV_SLOPE_HOMOGENOUS_MAX <- 0.005          # No change in uniformity
+CV_SLOPE_PATCHINESS_MIN <- 0.005          # Minor divergence
+CV_SLOPE_PATCHINESS_MAX <- 0.02           # Growing patchiness
+CV_SLOPE_SEVERE_MIN <- 0.02               # Field fragmentation beginning
+
 # CLOUD COVER ROUNDING INTERVALS
 CLOUD_INTERVALS <- c(0, 50, 60, 70, 80, 90, 100)

@ -404,6 +467,100 @@ calculate_cv_trend <- function(cv_current, cv_previous) {
  return(round(cv_current - cv_previous, 4))
 }

+calculate_four_week_trend <- function(mean_ci_values) {
+  #' Calculate four-week CI trend from available weeks
+  #' 
+  #' Uses whatever weeks are available (1-4 weeks) to estimate trend
+  #' Returns difference between current (most recent) and oldest available week
+  #' 
+  #' @param mean_ci_values vector of Mean_CI values in chronological order (oldest to newest)
+  #' @return numeric: CI difference (current - oldest), rounded to 2 decimals
+  
+  if (is.null(mean_ci_values) || length(mean_ci_values) == 0) {
+    return(NA_real_)
+  }
+  
+  # Remove NAs
+  ci_clean <- mean_ci_values[!is.na(mean_ci_values)]
+  
+  if (length(ci_clean) < 2) {
+    # Need at least 2 weeks to calculate trend
+    return(NA_real_)
+  }
+  
+  # Calculate difference: current - oldest
+  trend <- ci_clean[length(ci_clean)] - ci_clean[1]
+  return(round(trend, 2))
+}
+
+categorize_cv_slope <- function(slope) {
+  #' Categorize CV slope (8-week regression) into field uniformity interpretation
+  #' 
+  #' Slope interpretation:
+  #'   Negative slope = CV decreasing = field becoming MORE uniform = GOOD
+  #'   Positive slope = CV increasing = field becoming MORE patchy = BAD
+  #'   Near zero = Homogenous growth (all crops progressing equally)
+  #' 
+  #' Categories:
+  #'   - "Excellent uniformity": Slope <= -0.02 (CV decreasing, field synchronizing)
+  #'   - "Homogenous growth": -0.02 < slope < 0.005 (stable, uniform growth)
+  #'   - "Minor patchiness": 0.005 <= slope <= 0.02 (CV slowly increasing)
+  #'   - "Severe fragmentation": slope > 0.02 (rapid CV increase, parts diverging)
+  #' 
+  #' @param slope numeric: CV trend slope per week
+  #' @return character: Category string
+  
+  if (is.na(slope)) {
+    return(NA_character_)
+  }
+  
+  if (slope <= CV_SLOPE_IMPROVEMENT_MIN) {
+    return("Excellent uniformity")
+  } else if (slope < CV_SLOPE_HOMOGENOUS_MIN) {
+    return("Homogenous growth")
+  } else if (slope <= CV_SLOPE_HOMOGENOUS_MAX) {
+    return("Homogenous growth")
+  } else if (slope <= CV_SLOPE_PATCHINESS_MAX) {
+    return("Minor patchiness")
+  } else {
+    return("Severe fragmentation")
+  }
+}
+
+calculate_cv_trend_long_term <- function(cv_values) {
+  #' Calculate 8-week CV trend via linear regression slope
+  #' 
+  #' Fits linear regression to CV over available weeks (1-8)
+  #' Returns slope = rate of change per week
+  #' 
+  #' @param cv_values vector of CV values in chronological order (oldest to newest)
+  #' @return numeric: Regression slope (CV change per week), rounded to 4 decimals
+  
+  if (is.null(cv_values) || length(cv_values) == 0) {
+    return(NA_real_)
+  }
+  
+  # Remove NAs
+  cv_clean <- cv_values[!is.na(cv_values)]
+  
+  if (length(cv_clean) < 2) {
+    # Need at least 2 weeks to fit a line
+    return(NA_real_)
+  }
+  
+  # Create week sequence matching data length
+  weeks <- seq_along(cv_clean)
+  
+  # Fit linear model
+  tryCatch({
+    lm_fit <- lm(cv_clean ~ weeks)
+    slope <- coef(lm_fit)["weeks"]
+    return(round(as.numeric(slope), 4))
+  }, error = function(e) {
+    return(NA_real_)
+  })
+}
+
 # ============================================================================
 # HELPER FUNCTIONS
 # ============================================================================
@ -498,7 +655,7 @@ load_historical_field_data <- function(project_dir, current_week, reports_dir, n
 }

 USE_UNIFORM_AGE <- TRUE
-UNIFORM_PLANTING_DATE <- as.Date("2025-01-01")
+UNIFORM_PLANTING_DATE <- as.Date("2026-01-01")

 extract_planting_dates <- function(harvesting_data, field_boundaries_sf = NULL) {
  if (USE_UNIFORM_AGE) {
@ -688,13 +845,17 @@ calculate_field_statistics <- function(field_boundaries_sf, week_num, year,
 # CALCULATE KPI TRENDS (Requires previous week RDS)
 # ============================================================================

-calculate_kpi_trends <- function(current_stats, prev_stats = NULL) {
+calculate_kpi_trends <- function(current_stats, prev_stats = NULL, 
+                                  project_dir = NULL, reports_dir = NULL, 
+                                  current_week = NULL, year = NULL) {
  
  message("Calculating KPI trends from current and previous week data")
  
  # Initialize new columns with defaults
  current_stats$Weekly_ci_change <- NA_real_
  current_stats$CV_Trend_Short_Term <- NA_real_
+  current_stats$Four_week_trend <- NA_real_
+  current_stats$CV_Trend_Long_Term <- NA_real_
  current_stats$nmr_weeks_in_this_phase <- 1L
  
  # If no previous week data, return with defaults
@ -704,36 +865,18 @@ calculate_kpi_trends <- function(current_stats, prev_stats = NULL) {
  }
  
  message(paste("  prev_stats has", nrow(prev_stats), "rows and", ncol(prev_stats), "columns"))
-  message(paste("  prev_stats columns:", paste(names(prev_stats), collapse = ", ")))
  
  # Build lookup indices for previous week (by Field_id)
  prev_lookup <- setNames(seq_len(nrow(prev_stats)), prev_stats$Field_id)
  
  # Try to load previous week's field_analysis to get nmr_weeks_in_this_phase history
  prev_field_analysis <- NULL
-  prev_analysis_csv <- file.path(
-    reports_dir, "kpis", "field_analysis",
-    sprintf("%s_field_analysis_week%02d.csv", 
-            paste(strsplit(current_stats$Field_id[1], "")[[1]][1], collapse=""),  # Get project from field
-            as.numeric(format(Sys.Date() - 7, "%V")))  # Approximate previous week
-  )
-  
-  # Better way: construct the previous week number properly
-  current_week_num <- as.numeric(format(Sys.Date(), "%V"))
-  prev_week_num <- current_week_num - 1
-  if (prev_week_num < 1) prev_week_num <- 52
-  
-  # This is a bit tricky - we need the project_dir from the main scope
-  # For now, assume we can infer it or pass it through
-  # Let's use a simpler approach: look for any field_analysis_week* file that's recent
  
  tryCatch({
    analysis_dir <- file.path(reports_dir, "kpis", "field_analysis")
    if (dir.exists(analysis_dir)) {
-      # Find the most recent field_analysis CSV (should be previous week)
      analysis_files <- list.files(analysis_dir, pattern = "_field_analysis_week.*\\.csv$", full.names = TRUE)
      if (length(analysis_files) > 0) {
-        # Sort by modification time and get the most recent
        recent_file <- analysis_files[which.max(file.info(analysis_files)$mtime)]
        prev_field_analysis <- readr::read_csv(recent_file, show_col_types = FALSE, 
                                               col_select = c(Field_id, nmr_weeks_in_this_phase, Phase))
@ -747,49 +890,139 @@ calculate_kpi_trends <- function(current_stats, prev_stats = NULL) {
    message(paste("  Using previous field_analysis to track nmr_weeks_in_this_phase"))
  }
  
-  # For each field in current week, lookup previous values
+  # ============================================================
+  # PHASE 3: Load 4-8 weeks of historical data for trend calculations
+  # ============================================================
+  
+  historical_4weeks <- list()
+  historical_8weeks <- list()
+  
+  if (!is.null(project_dir) && !is.null(reports_dir) && !is.null(current_week)) {
+    message("  Loading historical field_stats for 4-week and 8-week trends...")
+    
+    # Load up to 4 weeks back for four_week_trend
+    for (lookback in 1:4) {
+      target_week <- current_week - lookback
+      if (target_week < 1) target_week <- target_week + 52
+      
+      rds_filename <- sprintf("%s_field_stats_week%02d.rds", project_dir, target_week)
+      rds_path <- file.path(reports_dir, "kpis", "field_stats", rds_filename)
+      
+      if (file.exists(rds_path)) {
+        tryCatch({
+          stats_data <- readRDS(rds_path)
+          historical_4weeks[[length(historical_4weeks) + 1]] <- list(
+            week = target_week,
+            stats = stats_data
+          )
+        }, error = function(e) {
+          message(paste("    Warning: Could not load week", target_week, ":", e$message))
+        })
+      }
+    }
+    
+    # Load up to 8 weeks back for cv_trend_long_term
+    for (lookback in 1:8) {
+      target_week <- current_week - lookback
+      if (target_week < 1) target_week <- target_week + 52
+      
+      rds_filename <- sprintf("%s_field_stats_week%02d.rds", project_dir, target_week)
+      rds_path <- file.path(reports_dir, "kpis", "field_stats", rds_filename)
+      
+      if (file.exists(rds_path)) {
+        tryCatch({
+          stats_data <- readRDS(rds_path)
+          historical_8weeks[[length(historical_8weeks) + 1]] <- list(
+            week = target_week,
+            stats = stats_data
+          )
+        }, error = function(e) {
+          # Silently skip - we'll work with whatever weeks exist
+        })
+      }
+    }
+    
+    if (length(historical_4weeks) > 0) {
+      message(paste("    Loaded", length(historical_4weeks), "weeks for 4-week trend"))
+    }
+    if (length(historical_8weeks) > 0) {
+      message(paste("    Loaded", length(historical_8weeks), "weeks for 8-week CV trend"))
+    }
+  }
+  
+  # For each field in current week, lookup previous values and calculate trends
  cv_trends_calculated <- 0
+  four_week_trends_calculated <- 0
+  cv_long_term_calculated <- 0
+  
  for (i in seq_len(nrow(current_stats))) {
    field_id <- current_stats$Field_id[i]
    prev_idx <- prev_lookup[field_id]
    
    if (!is.na(prev_idx) && prev_idx > 0 && prev_idx <= nrow(prev_stats)) {
-      # Field exists in previous week - extract row carefully
-      prev_row <- prev_stats[prev_idx, , drop = FALSE]  # Keep as dataframe
+      prev_row <- prev_stats[prev_idx, , drop = FALSE]
      
-      if (nrow(prev_row) == 0) {
-        # Field not found - skip
-        next
-      }
-      
-      # Access values from single-row dataframe
-      prev_mean_ci <- prev_row$Mean_CI[1]
-      prev_cv <- prev_row$CV[1]
-      prev_phase <- prev_row$Phase[1]
-      
-      # Weekly CI change (current Mean_CI - previous Mean_CI)
-      if (!is.na(prev_mean_ci) && !is.na(current_stats$Mean_CI[i])) {
+      # WEEKLY CI CHANGE
+      prev_ci <- prev_row$Mean_CI[1]
+      if (!is.na(prev_ci) && !is.na(current_stats$Mean_CI[i])) {
        current_stats$Weekly_ci_change[i] <- 
-          round(current_stats$Mean_CI[i] - prev_mean_ci, 2)
-      }
-      
-      # CV short-term trend (current CV - previous CV)
-      # DEBUG: Check first few fields
-      if (i <= 3) {
-        message(paste("    Field", field_id, "- CV_prev:", prev_cv, "CV_curr:", current_stats$CV[i]))
+          round(current_stats$Mean_CI[i] - prev_ci, 2)
      }
      
+      # CV TREND SHORT TERM (2-week comparison)
+      prev_cv <- prev_row$CV[1]
      if (!is.na(prev_cv) && !is.na(current_stats$CV[i])) {
-        trend_val <- round(current_stats$CV[i] - prev_cv, 4)
-        current_stats$CV_Trend_Short_Term[i] <- trend_val
+        current_stats$CV_Trend_Short_Term[i] <- 
+          calculate_cv_trend(current_stats$CV[i], prev_cv)
        cv_trends_calculated <- cv_trends_calculated + 1
+      }
+      
+      # FOUR-WEEK TREND (if available)
+      if (length(historical_4weeks) > 0) {
+        ci_values_4week <- numeric()
        
-        if (i <= 3) {
-          message(paste("      -> CV_Trend =", trend_val))
+        # Add oldest available weeks (reverse order to get oldest first)
+        for (hist_idx in rev(seq_along(historical_4weeks))) {
+          hist_data <- historical_4weeks[[hist_idx]]$stats
+          hist_field <- which(hist_data$Field_id == field_id)
+          if (length(hist_field) > 0 && !is.na(hist_data$Mean_CI[hist_field[1]])) {
+            ci_values_4week <- c(ci_values_4week, hist_data$Mean_CI[hist_field[1]])
+          }
+        }
+        
+        # Add current week CI
+        ci_values_4week <- c(ci_values_4week, current_stats$Mean_CI[i])
+        
+        if (length(ci_values_4week) >= 2) {
+          current_stats$Four_week_trend[i] <- calculate_four_week_trend(ci_values_4week)
+          four_week_trends_calculated <- four_week_trends_calculated + 1
        }
      }
      
-      # Weeks in current phase (track phase transitions)
+      # CV TREND LONG TERM (8-week slope)
+      if (length(historical_8weeks) > 0) {
+        cv_values_8week <- numeric()
+        
+        # Add oldest available weeks (reverse order to get oldest first)
+        for (hist_idx in rev(seq_along(historical_8weeks))) {
+          hist_data <- historical_8weeks[[hist_idx]]$stats
+          hist_field <- which(hist_data$Field_id == field_id)
+          if (length(hist_field) > 0 && !is.na(hist_data$CV[hist_field[1]])) {
+            cv_values_8week <- c(cv_values_8week, hist_data$CV[hist_field[1]])
+          }
+        }
+        
+        # Add current week CV
+        cv_values_8week <- c(cv_values_8week, current_stats$CV[i])
+        
+        if (length(cv_values_8week) >= 2) {
+          slope <- calculate_cv_trend_long_term(cv_values_8week)
+          current_stats$CV_Trend_Long_Term[i] <- slope
+          cv_long_term_calculated <- cv_long_term_calculated + 1
+        }
+      }
+      
+      # WEEKS IN CURRENT PHASE (track phase transitions)
      # Use previous field_analysis if available for proper counter progression
      if (!is.null(prev_field_analysis) && nrow(prev_field_analysis) > 0) {
        # Look up this field in previous analysis
@ -810,9 +1043,9 @@ calculate_kpi_trends <- function(current_stats, prev_stats = NULL) {
              current_stats$nmr_weeks_in_this_phase[i] <- 1L
            }
          }
-        } else if (!is.na(current_stats$Phase[i]) && !is.na(prev_phase)) {
+        } else if (!is.na(current_stats$Phase[i]) && !is.na(prev_row$Phase[1])) {
          # Field not in previous analysis, fall back to prev_stats phase comparison
-          if (current_stats$Phase[i] == prev_phase) {
+          if (current_stats$Phase[i] == prev_row$Phase[1]) {
            current_stats$nmr_weeks_in_this_phase[i] <- 2L
          } else {
            current_stats$nmr_weeks_in_this_phase[i] <- 1L
@ -820,9 +1053,9 @@ calculate_kpi_trends <- function(current_stats, prev_stats = NULL) {
        }
      } else {
        # No previous field_analysis available - use phase from prev_stats
-        if (!is.na(current_stats$Phase[i]) && !is.na(prev_phase)) {
-          if (current_stats$Phase[i] == prev_phase) {
-            # Same phase - increment counter (start with 2)
+        if (!is.na(current_stats$Phase[i]) && !is.na(prev_row$Phase[1])) {
+          if (current_stats$Phase[i] == prev_row$Phase[1]) {
+            # Same phase - increment counter (start with 2 since prev week was in this phase)
            current_stats$nmr_weeks_in_this_phase[i] <- 2L
          } else {
            # Phase changed - reset to 1
@ -833,8 +1066,9 @@ calculate_kpi_trends <- function(current_stats, prev_stats = NULL) {
    }
  }
  
-  message(paste("  ✓ Calculated", cv_trends_calculated, "CV_Trend_Short_Term values out of", nrow(current_stats), "fields"))
-  message(paste("    CV_Trend_Short_Term non-NA values:", sum(!is.na(current_stats$CV_Trend_Short_Term))))
+  message(paste("  ✓ Calculated CV_Trend_Short_Term:", cv_trends_calculated, "fields"))
+  message(paste("  ✓ Calculated Four_week_trend:", four_week_trends_calculated, "fields"))
+  message(paste("  ✓ Calculated CV_Trend_Long_Term:", cv_long_term_calculated, "fields"))
  return(current_stats)
 }

@ -1702,9 +1936,13 @@ main <- function() {
  
  # Apply trend calculations (requires both weeks)
  message("\n3. Calculating trend columns...")
-  current_stats <- calculate_kpi_trends(current_stats, prev_stats)
+  current_stats <- calculate_kpi_trends(current_stats, prev_stats, 
+                                        project_dir = project_dir,
+                                        reports_dir = reports_dir,
+                                        current_week = current_week,
+                                        year = year)
  
-  message(paste("  ✓ Added Weekly_ci_change, CV_Trend_Short_Term, nmr_weeks_in_this_phase"))
+  message(paste("  ✓ Added Weekly_ci_change, CV_Trend_Short_Term, Four_week_trend, CV_Trend_Long_Term, nmr_weeks_in_this_phase"))
  
  # ============================================================================
  # Build final output dataframe with all 21 columns
@ -1758,8 +1996,7 @@ main <- function() {
        if_else(is.na(acreages_vec), 0, acreages_vec)
      },
      # Columns 5-6: Already in current_stats (Mean_CI, Weekly_ci_change)
-      # Column 7: Four_week_trend (Phase 3 future)
-      Four_week_trend = NA_character_,
+      # Column 7: Four_week_trend (from current_stats)
      # Column 8: Last_harvest_or_planting_date (dummy for now)
      Last_harvest_or_planting_date = UNIFORM_PLANTING_DATE,
      # Columns 9-10: Already in current_stats (Age_week, Phase)
@ -1788,8 +2025,11 @@ main <- function() {
      # Columns 15-16: Already in current_stats (CI_range, CI_Percentiles)
      # Column 17: Already in current_stats (CV)
      # Column 18: Already in current_stats (CV_Trend_Short_Term)
-      # Column 19: CV_Trend_Long_Term (Phase 3 future)
-      CV_Trend_Long_Term = NA_real_,
+      # Column 19: CV_Trend_Long_Term (from current_stats - raw slope value)
+      # Column 19b: CV_Trend_Long_Term_Category (categorical interpretation of slope)
+      CV_Trend_Long_Term_Category = {
+        sapply(current_stats$CV_Trend_Long_Term, categorize_cv_slope)
+      },
      # Columns 20-21: Already in current_stats (Cloud_pct_clear, Cloud_category)
      .keep = "all"  # Keep all existing columns
    ) %>%
@ -1797,7 +2037,7 @@ main <- function() {
      Field_id, Farm_Section, Field_name, Acreage, Mean_CI, Weekly_ci_change,
      Four_week_trend, Last_harvest_or_planting_date, Age_week, Phase,
      nmr_weeks_in_this_phase, Germination_progress, Imminent_prob, Status_trigger,
-      CI_range, CI_Percentiles, CV, CV_Trend_Short_Term, CV_Trend_Long_Term,
+      CI_range, CI_Percentiles, CV, CV_Trend_Short_Term, CV_Trend_Long_Term, CV_Trend_Long_Term_Category,
      Cloud_pct_clear, Cloud_category
    )