integration lstm 31 and 80 complete - angata kpi file now dynamic

2026-01-18 10:14:08 +01:00 · 2026-01-18 10:14:08 +01:00 · 9f312131d7
parent 1f5add7485
commit 9f312131d7
3 changed files with 166 additions and 70 deletions
--- a/r_app/80_calculate_kpis.R
+++ b/r_app/80_calculate_kpis.R
@ -16,16 +16,15 @@
 # CRITICAL INTEGRATIONS:
 #
 # 1. IMMINENT_PROB FROM HARVEST MODEL (MODEL_307)
-#    [ ] Load script 31 output: {project}_imminent_harvest_week{WW}.csv
+#    [✓] Load script 31 output: {project}_week_{WW}_{YYYY}.csv
 #        Columns: field, imminent_prob, detected_prob, week, year
-#    [ ] LEFT JOIN to field_analysis_df by (field, week, year)
-#    [ ] Replace hardcoded "placeholder data" in Status_trigger calculation
-#    [ ] Update column to show actual harvest probability (0-1 or 0-100%)
+#    [✓] LEFT JOIN to field_analysis_df by field
+#    [✓] Use actual harvest probability data instead of placeholder
 #
-# 2. AGE FROM HARVEST.XLSX (SCRIPTS 22 & 31)
-#    [ ] Scripts 22 & 31 populate harvest.xlsx with planting_date per field
-#    [ ] Load harvest.xlsx instead of using UNIFORM_PLANTING_DATE
-#    [ ] Calculate Age_week = difftime(report_date, planting_date, units="weeks")
+# 2. AGE FROM HARVEST.XLSX (SCRIPTS 22 & 23)
+#    [✓] Load harvest.xlsx with planting_date (season_start)
+#    [✓] Extract planting dates per field
+#    [✓] Calculate Age_week = difftime(report_date, planting_date, units="weeks")
 #
 # COMMAND-LINE USAGE:
 #   Option 1: Rscript 80_calculate_kpis.R 2026-01-14 angata
@ -43,8 +42,9 @@
 # ============================================================================
 # NEXT INTEGRATIONS (See Linear issues for detailed requirements)
 # ============================================================================
-# 1. Load imminent_prob from script 31 (harvest_imminent_weekly.csv)
-# 2. Load planting_date from harvest.xlsx for field-specific age calculation
+# 1. [✓] Load imminent_prob from script 31 (week_WW_YYYY.csv)
+# 2. [✓] Load planting_date from harvest.xlsx for field-specific age calculation
+# 3. [ ] Improve Status_trigger logic to use actual imminent_prob values
 # ============================================================================

 # ============================================================================
@ -86,8 +86,7 @@ CI_PERCENTILE_HIGH <- 0.90
 GERMINATION_CI_THRESHOLD <- 2.0

 # PLANTING DATE & AGE CONFIGURATION
-USE_UNIFORM_AGE <- TRUE
-UNIFORM_PLANTING_DATE <- as.Date("2026-01-01")
+# Load from harvest.xlsx (scripts 22 & 23) - no fallback to uniform dates

 # HISTORICAL DATA LOOKBACK
 WEEKS_FOR_FOUR_WEEK_TREND <- 4
@ -307,6 +306,26 @@ main <- function() {
                                                 auto_generate = allow_auto_gen,
                                                 field_boundaries_sf = field_boundaries_sf)
  
+  # Load harvest.xlsx for planting dates (season_start)
+  message("\nLoading harvest data from harvest.xlsx for planting dates...")
+  harvest_file_path <- file.path(data_dir, "harvest.xlsx")
+  
+  harvesting_data <- tryCatch({
+    if (file.exists(harvest_file_path)) {
+      harvest_raw <- readxl::read_excel(harvest_file_path)
+      harvest_raw$season_start <- as.Date(harvest_raw$season_start)
+      harvest_raw$season_end <- as.Date(harvest_raw$season_end)
+      message(paste("  ✓ Loaded harvest data:", nrow(harvest_raw), "rows"))
+      harvest_raw
+    } else {
+      message(paste("  WARNING: harvest.xlsx not found at", harvest_file_path))
+      NULL
+    }
+  }, error = function(e) {
+    message(paste("  ERROR loading harvest.xlsx:", e$message))
+    NULL
+  })
+  
  planting_dates <- extract_planting_dates(harvesting_data, field_boundaries_sf)
  
  # Validate planting_dates
@ -374,6 +393,28 @@ main <- function() {
  
  message(paste("  ✓ Added Weekly_ci_change, CV_Trend_Short_Term, Four_week_trend, CV_Trend_Long_Term, nmr_weeks_in_this_phase"))
  
+  # Load weekly harvest probabilities from script 31 (if available)
+  message("\n4. Loading harvest probabilities from script 31...")
+  harvest_prob_file <- file.path(reports_dir, "kpis", "field_stats", 
+                                   sprintf("%s_harvest_imminent_week_%02d_%d.csv", project_dir, current_week, year))
+  message(paste("  Looking for:", harvest_prob_file))
+  
+  imminent_prob_data <- tryCatch({
+    if (file.exists(harvest_prob_file)) {
+      prob_df <- readr::read_csv(harvest_prob_file, show_col_types = FALSE)
+      message(paste("  ✓ Loaded harvest probabilities for", nrow(prob_df), "fields"))
+      prob_df %>%
+        select(field, imminent_prob, detected_prob) %>%
+        rename(Field_id = field, Imminent_prob_actual = imminent_prob, Detected_prob = detected_prob)
+    } else {
+      message(paste("  INFO: Harvest probabilities not available (script 31 not run)"))
+      NULL
+    }
+  }, error = function(e) {
+    message(paste("  WARNING: Could not load harvest probabilities:", e$message))
+    NULL
+  })
+  
  # ============================================================================
  # Build final output dataframe with all 21 columns
  # ============================================================================
@ -427,32 +468,91 @@ main <- function() {
      },
      # Columns 5-6: Already in current_stats (Mean_CI, Weekly_ci_change)
      # Column 7: Four_week_trend (from current_stats)
-      # Column 8: Last_harvest_or_planting_date (dummy for now)
-      Last_harvest_or_planting_date = UNIFORM_PLANTING_DATE,
-      # Columns 9-10: Already in current_stats (Age_week, Phase)
-      # Column 11: nmr_weeks_in_this_phase (already calculated)
-      # Column 12: Germination_progress (already calculated)
-      # Column 13: Imminent_prob (placeholder)
-      Imminent_prob = "placeholder data",
-      # Column 14: Status_trigger (need to add)
+      # Column 8: Last_harvest_or_planting_date (from harvest.xlsx - season_start)
+      Last_harvest_or_planting_date = {
+        planting_dates$planting_date[match(Field_id, planting_dates$field_id)]
+      },
+      # Column 9: Age_week (calculated from report date and planting date)
+      Age_week = {
+        sapply(seq_len(nrow(current_stats)), function(idx) {
+          planting_dt <- Last_harvest_or_planting_date[idx]
+          if (is.na(planting_dt)) {
+            return(NA_real_)
+          }
+          round(as.numeric(difftime(end_date, planting_dt, units = "weeks")), 1)
+        })
+      },
+      # Column 10: Phase (recalculate based on updated Age_week)
+      Phase = {
+        sapply(Age_week, function(age) {
+          if (is.na(age)) return(NA_character_)
+          if (age >= 0 & age < 4) return("Germination")
+          if (age >= 4 & age < 17) return("Tillering")
+          if (age >= 17 & age < 39) return("Grand Growth")
+          if (age >= 39) return("Maturation")
+          NA_character_
+        })
+      },
+      # Column 11: nmr_weeks_in_this_phase (already in current_stats from calculate_kpi_trends)
+      # Column 12: Germination_progress (calculated here from CI values)
+      Germination_progress = {
+        sapply(seq_len(nrow(current_stats)), function(idx) {
+          age_w <- Age_week[idx]
+          mean_ci_val <- Mean_CI[idx]
+          
+          # Only relevant for germination phase (0-4 weeks)
+          if (is.na(age_w) || age_w < 0 || age_w >= 4) {
+            return(NA_character_)
+          }
+          
+          # Estimate % of field with CI >= germination threshold
+          # Based on mean CI, estimate germination percentage
+          if (mean_ci_val >= 0.4) {
+            return(">80%")
+          } else if (mean_ci_val >= 0.25) {
+            return("50-80%")
+          } else if (mean_ci_val >= 0.1) {
+            return("20-50%")
+          } else {
+            return("<20%")
+          }
+        })
+      },
+      # Column 13: Imminent_prob (from script 31 or NA if not available)
+      Imminent_prob = {
+        if (!is.null(imminent_prob_data)) {
+          imminent_prob_data$Imminent_prob_actual[match(Field_id, imminent_prob_data$Field_id)]
+        } else {
+          rep(NA_real_, nrow(current_stats))
+        }
+      },
+      # Column 14: Status_trigger (based on harvest probability + growth status)
      Status_trigger = {
        triggers <- sapply(seq_len(nrow(current_stats)), function(idx) {
-          field_id <- current_stats$Field_id[idx]
-          field_idx <- which(field_boundaries_sf$field == field_id)[1]
-          if (is.na(field_idx)) return(NA_character_)
+          imminent_prob <- Imminent_prob[idx]
+          age_w <- Age_week[idx]
+          ci_change <- Weekly_ci_change[idx]
+          phase <- Phase[idx]
          
-          # Reconstruct CI values from Mean_CI for status trigger logic
-          # For now, use simplified approach
-          age_w <- current_stats$Age_week[idx]
-          ci_change <- current_stats$Weekly_ci_change[idx]
+          # Priority 1: Harvest imminent (high probability)
+          if (!is.na(imminent_prob) && imminent_prob > 0.5) {
+            return("harvest_imminent")
+          }
          
-          # Using mean CI as proxy (could be improved with pixel distribution)
-          ci_vals <- rep(current_stats$Mean_CI[idx], 100)
-          get_status_trigger(ci_vals, ci_change, age_w)
+          # Priority 2: Age-based triggers
+          if (!is.na(age_w)) {
+            if (age_w >= 45) return("harvest_ready")
+            if (age_w >= 39) return("maturation_progressing")
+            if (age_w >= 4 & age_w < 39) return("growth_on_track")
+            if (age_w < 4) return("germination_started")
+          }
+          
+          # Fallback
+          NA_character_
        })
        triggers
      },
-      # Columns 15-16: Already in current_stats (CI_range, CI_Percentiles)
+      # Columns 15-16: CI-based columns already in current_stats (CI_range, CI_Percentiles)
      # Column 17: Already in current_stats (CV)
      # Column 18: Already in current_stats (CV_Trend_Short_Term)
      # Column 19: CV_Trend_Long_Term (from current_stats - raw slope value)
@ -464,11 +564,12 @@ main <- function() {
      .keep = "all"  # Keep all existing columns
    ) %>%
    select(
-      Field_id, Farm_Section, Field_name, Acreage, Mean_CI, Weekly_ci_change,
-      Four_week_trend, Last_harvest_or_planting_date, Age_week, Phase,
-      nmr_weeks_in_this_phase, Germination_progress, Imminent_prob, Status_trigger,
-      CI_range, CI_Percentiles, CV, CV_Trend_Short_Term, CV_Trend_Long_Term, CV_Trend_Long_Term_Category,
-      Cloud_pct_clear, Cloud_category
+      all_of(c("Field_id", "Farm_Section", "Field_name", "Acreage", "Mean_CI", "Weekly_ci_change",
+               "Four_week_trend", "Last_harvest_or_planting_date", "Age_week", "Phase",
+               "nmr_weeks_in_this_phase", "Germination_progress", "Imminent_prob", "Status_trigger",
+               "CV", "CV_Trend_Short_Term", "CV_Trend_Long_Term", "CV_Trend_Long_Term_Category",
+               "Cloud_pct_clear", "Cloud_category")),
+      any_of(c("CI_range", "CI_Percentiles"))
    )
  
  message(paste("✓ Built final output with", nrow(field_analysis_df), "fields and 21 columns"))
@ -480,6 +581,7 @@ main <- function() {
    summary_statistics_df,
    project_dir,
    current_week,
+    year,
    reports_dir
  )
  
--- a/r_app/80_report_building_utils.R
+++ b/r_app/80_report_building_utils.R
@ -106,7 +106,7 @@ generate_field_analysis_summary <- function(field_df) {
 # EXPORT FUNCTIONS
 # ============================================================================

-export_field_analysis_excel <- function(field_df, summary_df, project_dir, current_week, reports_dir) {
+export_field_analysis_excel <- function(field_df, summary_df, project_dir, current_week, year, reports_dir) {
  message("Exporting per-field analysis to Excel, CSV, and RDS...")
  
  field_df_rounded <- field_df %>%
@ -120,7 +120,7 @@ export_field_analysis_excel <- function(field_df, summary_df, project_dir, curre
    dir.create(output_subdir, recursive = TRUE)
  }
  
-  excel_filename <- paste0(project_dir, "_field_analysis_week", sprintf("%02d", current_week), ".xlsx")
+  excel_filename <- paste0(project_dir, "_field_analysis_week", sprintf("%02d_%d", current_week, year), ".xlsx")
  excel_path <- file.path(output_subdir, excel_filename)
  excel_path <- normalizePath(excel_path, winslash = "\\", mustWork = FALSE)
  
@ -137,18 +137,19 @@ export_field_analysis_excel <- function(field_df, summary_df, project_dir, curre
    field_analysis_summary = summary_df_rounded,
    metadata = list(
      current_week = current_week,
+      year = year,
      project = project_dir,
      created_at = Sys.time()
    )
  )
  
-  rds_filename <- paste0(project_dir, "_kpi_summary_tables_week", sprintf("%02d", current_week), ".rds")
+  rds_filename <- paste0(project_dir, "_kpi_summary_tables_week", sprintf("%02d_%d", current_week, year), ".rds")
  rds_path <- file.path(reports_dir, "kpis", rds_filename)
  
  saveRDS(kpi_data, rds_path)
  message(paste("✓ Field analysis RDS exported to:", rds_path))
  
-  csv_filename <- paste0(project_dir, "_field_analysis_week", sprintf("%02d", current_week), ".csv")
+  csv_filename <- paste0(project_dir, "_field_analysis_week", sprintf("%02d_%d", current_week, year), ".csv")
  csv_path <- file.path(output_subdir, csv_filename)
  write_csv(field_df_rounded, csv_path)
  message(paste("✓ Field analysis CSV exported to:", csv_path))
--- a/r_app/80_weekly_stats_utils.R
+++ b/r_app/80_weekly_stats_utils.R
@ -335,21 +335,18 @@ get_status_trigger <- function(ci_values, ci_change, age_weeks) {
 }

 extract_planting_dates <- function(harvesting_data, field_boundaries_sf = NULL) {
-  if (USE_UNIFORM_AGE) {
-    message(paste("Using uniform planting date for all fields:", UNIFORM_PLANTING_DATE))
+  # Extract planting dates from harvest.xlsx (season_start column)
+  # Returns: data.frame with columns (field_id, planting_date)
+  
+  if (is.null(harvesting_data) || nrow(harvesting_data) == 0) {
+    message("Warning: No harvesting data available - planting dates will be NA.")
    if (!is.null(field_boundaries_sf)) {
      return(data.frame(
        field_id = field_boundaries_sf$field,
-        date = rep(UNIFORM_PLANTING_DATE, nrow(field_boundaries_sf)),
+        planting_date = rep(as.Date(NA), nrow(field_boundaries_sf)),
        stringsAsFactors = FALSE
      ))
-    } else {
-      return(NULL)
    }
-  }
-  
-  if (is.null(harvesting_data) || nrow(harvesting_data) == 0) {
-    message("Warning: No harvesting data available.")
    return(NULL)
  }
  
@ -362,7 +359,7 @@ extract_planting_dates <- function(harvesting_data, field_boundaries_sf = NULL)
      filter(!is.na(planting_date)) %>%
      as.data.frame()
    
-    message(paste("Extracted planting dates for", nrow(planting_dates), "fields"))
+    message(paste("Extracted planting dates for", nrow(planting_dates), "fields from harvest.xlsx"))
    return(planting_dates)
  }, error = function(e) {
    message(paste("Error extracting planting dates:", e$message))
@ -431,17 +428,8 @@ calculate_field_statistics <- function(field_boundaries_sf, week_num, year,
                     else if (pct_clear >= 99.5) "Clear view"
                     else "Partial coverage"
        
-        age_weeks <- NA_real_
-        if (USE_UNIFORM_AGE) {
-          age_weeks <- as.numeric(difftime(report_date, UNIFORM_PLANTING_DATE, units = "weeks"))
-        }
-        phase <- get_phase_by_age(age_weeks)
-        
-        germination_progress <- NA_character_
-        if (!is.na(age_weeks) && age_weeks >= 0 && age_weeks < 17) {
-          pct_ci_ge_threshold <- sum(ci_vals >= GERMINATION_CI_THRESHOLD) / length(ci_vals) * 100
-          germination_progress <- sprintf("%.1f%%", pct_ci_ge_threshold)
-        }
+        # Age_week and Phase are now calculated in main script using actual planting dates
+        # Germination_progress is calculated in main script after Age_week is known
        
        existing_idx <- which(sapply(results_list, function(x) x$Field_id) == field_id)
        
@ -457,9 +445,6 @@ calculate_field_statistics <- function(field_boundaries_sf, week_num, year,
          CI_Percentiles = ci_percentiles_str,
          Cloud_pct_clear = pct_clear,
          Cloud_category = cloud_cat,
-          Age_week = round(age_weeks, 1),
-          Phase = phase,
-          Germination_progress = germination_progress,
          stringsAsFactors = FALSE
        )
        
@ -536,9 +521,13 @@ calculate_kpi_trends <- function(current_stats, prev_stats = NULL,
    
    for (lookback in 1:4) {
      target_week <- current_week - lookback
-      if (target_week < 1) target_week <- target_week + 52
+      target_year <- year
+      if (target_week < 1) {
+        target_week <- target_week + 52
+        target_year <- target_year - 1
+      }
      
-      rds_filename <- sprintf("%s_field_stats_week%02d.rds", project_dir, target_week)
+      rds_filename <- sprintf("%s_field_stats_week%02d_%d.rds", project_dir, target_week, target_year)
      rds_path <- file.path(reports_dir, "kpis", "field_stats", rds_filename)
      
      if (file.exists(rds_path)) {
@ -556,9 +545,13 @@ calculate_kpi_trends <- function(current_stats, prev_stats = NULL,
    
    for (lookback in 1:8) {
      target_week <- current_week - lookback
-      if (target_week < 1) target_week <- target_week + 52
+      target_year <- year
+      if (target_week < 1) {
+        target_week <- target_week + 52
+        target_year <- target_year - 1
+      }
      
-      rds_filename <- sprintf("%s_field_stats_week%02d.rds", project_dir, target_week)
+      rds_filename <- sprintf("%s_field_stats_week%02d_%d.rds", project_dir, target_week, target_year)
      rds_path <- file.path(reports_dir, "kpis", "field_stats", rds_filename)
      
      if (file.exists(rds_path)) {
@ -693,7 +686,7 @@ calculate_kpi_trends <- function(current_stats, prev_stats = NULL,
 load_or_calculate_weekly_stats <- function(week_num, year, project_dir, field_boundaries_sf, 
                                           mosaic_dir, reports_dir, report_date = Sys.Date()) {
  
-  rds_filename <- sprintf("%s_field_stats_week%02d.rds", project_dir, week_num)
+  rds_filename <- sprintf("%s_field_stats_week%02d_%d.rds", project_dir, week_num, year)
  rds_path <- file.path(reports_dir, "kpis", "field_stats", rds_filename)
  
  if (file.exists(rds_path)) {
@ -713,7 +706,7 @@ load_or_calculate_weekly_stats <- function(week_num, year, project_dir, field_bo
  saveRDS(stats_df, rds_path)
  message(paste("Saved weekly statistics RDS:", basename(rds_path)))
  
-  csv_filename <- sprintf("%s_field_stats_week%02d.csv", project_dir, week_num)
+  csv_filename <- sprintf("%s_field_stats_week%02d_%d.csv", project_dir, week_num, year)
  csv_path <- file.path(output_dir, csv_filename)
  readr::write_csv(stats_df, csv_path)
  message(paste("Saved weekly statistics CSV:", basename(csv_path)))