diff --git a/r_app/80_calculate_kpis.R b/r_app/80_calculate_kpis.R index 9d02fa4..4c657ca 100644 --- a/r_app/80_calculate_kpis.R +++ b/r_app/80_calculate_kpis.R @@ -26,6 +26,55 @@ # source("r_app/80_calculate_kpis.R") # main() +# ============================================================================ +# PENDING WORK - PHASE 4 (Next Sprint) +# ============================================================================ +# +# CRITICAL INTEGRATIONS: +# +# 1. IMMINENT_PROB FROM HARVEST MODEL (MODEL_307) +# [ ] Load script 31 output: {project}_imminent_harvest_week{WW}.csv +# Columns: field, imminent_prob, detected_prob, week, year +# [ ] LEFT JOIN to field_analysis_df by (field, week, year) +# [ ] Replace hardcoded "placeholder data" in Status_trigger calculation +# [ ] Update column to show actual harvest probability (0-1 or 0-100%) +# +# 2. AGE FROM HARVEST.XLSX (SCRIPTS 22 & 31) +# [ ] Scripts 22 & 31 populate harvest.xlsx with planting_date per field +# [ ] Load harvest.xlsx instead of using UNIFORM_PLANTING_DATE +# [ ] Calculate Age_week = difftime(report_date, planting_date, units="weeks") +# [ ] Removes TEST MODE hardcoding and enables field-specific aging +# +# UTILITY FILE REFACTORING (Improve code maintainability): +# +# 3. CREATE r_app/80_weekly_stats_utils.R +# [ ] Extract functions from lines 250-795 (calculation layer): +# - calculate_field_statistics() +# - calculate_kpi_trends() +# - load_or_calculate_weekly_stats() +# - Helper: load_tiles_for_field(), get_tile_ids_for_field() +# - Helper: extract_field_statistics_from_ci() +# [ ] Clean separation: DATA CALCULATION ONLY (no Excel export) +# [ ] Reusable by run_full_pipeline.R and other scripts +# +# 4. CREATE r_app/80_report_building_utils.R +# [ ] Extract functions from lines 1350-2100+ (output/reporting layer): +# - generate_field_analysis_summary() +# - export_field_analysis_excel() +# - calculate_and_export_farm_kpis() +# - Helper: categorize_*, get_*, round_* functions +# - Helper: get_phase_by_age(), get_status_trigger() +# [ ] Clean separation: OUTPUT/FORMATTING ONLY (consumes calculated stats) +# [ ] Reusable for alternative export formats (PDF, HTML, dashboard) +# +# TESTING PLAN: +# [ ] Verify 8-week historical data loads (currently TEST_MODE = 2 weeks only) +# [ ] Confirm Four_week_trend calculates from 1-4 weeks (graceful degradation) +# [ ] Confirm CV_Trend_Long_Term uses full 8-week regression (when available) +# [ ] Load script 31 output and validate imminent_prob population +# +# ============================================================================ + # ============================================================================ # EXCEL OUTPUT SPECIFICATION (21 COLUMNS) # ============================================================================ @@ -114,6 +163,20 @@ FOUR_WEEK_TREND_STRONG_DECLINE_MAX <- -0.5 # CV TREND THRESHOLDS CV_TREND_THRESHOLD_SIGNIFICANT <- 0.05 +# CV_TREND_LONG_TERM (8-WEEK SLOPE) INTERPRETATION THRESHOLDS +# Interpretation: Slope of CV over 8 weeks indicates field uniformity trend +# Negative slope = CV decreasing = field becoming MORE uniform = GOOD +# Positive slope = CV increasing = field becoming MORE patchy = BAD +# Near zero = Homogenous growth (all crops progressing equally) +CV_SLOPE_STRONG_IMPROVEMENT_MIN <- -0.05 # CV decreasing rapidly +CV_SLOPE_IMPROVEMENT_MIN <- -0.02 # Gradual synchronization +CV_SLOPE_IMPROVEMENT_MAX <- -0.005 # Becoming uniform +CV_SLOPE_HOMOGENOUS_MIN <- -0.005 # Stable, uniform growth +CV_SLOPE_HOMOGENOUS_MAX <- 0.005 # No change in uniformity +CV_SLOPE_PATCHINESS_MIN <- 0.005 # Minor divergence +CV_SLOPE_PATCHINESS_MAX <- 0.02 # Growing patchiness +CV_SLOPE_SEVERE_MIN <- 0.02 # Field fragmentation beginning + # CLOUD COVER ROUNDING INTERVALS CLOUD_INTERVALS <- c(0, 50, 60, 70, 80, 90, 100) @@ -404,6 +467,100 @@ calculate_cv_trend <- function(cv_current, cv_previous) { return(round(cv_current - cv_previous, 4)) } +calculate_four_week_trend <- function(mean_ci_values) { + #' Calculate four-week CI trend from available weeks + #' + #' Uses whatever weeks are available (1-4 weeks) to estimate trend + #' Returns difference between current (most recent) and oldest available week + #' + #' @param mean_ci_values vector of Mean_CI values in chronological order (oldest to newest) + #' @return numeric: CI difference (current - oldest), rounded to 2 decimals + + if (is.null(mean_ci_values) || length(mean_ci_values) == 0) { + return(NA_real_) + } + + # Remove NAs + ci_clean <- mean_ci_values[!is.na(mean_ci_values)] + + if (length(ci_clean) < 2) { + # Need at least 2 weeks to calculate trend + return(NA_real_) + } + + # Calculate difference: current - oldest + trend <- ci_clean[length(ci_clean)] - ci_clean[1] + return(round(trend, 2)) +} + +categorize_cv_slope <- function(slope) { + #' Categorize CV slope (8-week regression) into field uniformity interpretation + #' + #' Slope interpretation: + #' Negative slope = CV decreasing = field becoming MORE uniform = GOOD + #' Positive slope = CV increasing = field becoming MORE patchy = BAD + #' Near zero = Homogenous growth (all crops progressing equally) + #' + #' Categories: + #' - "Excellent uniformity": Slope <= -0.02 (CV decreasing, field synchronizing) + #' - "Homogenous growth": -0.02 < slope < 0.005 (stable, uniform growth) + #' - "Minor patchiness": 0.005 <= slope <= 0.02 (CV slowly increasing) + #' - "Severe fragmentation": slope > 0.02 (rapid CV increase, parts diverging) + #' + #' @param slope numeric: CV trend slope per week + #' @return character: Category string + + if (is.na(slope)) { + return(NA_character_) + } + + if (slope <= CV_SLOPE_IMPROVEMENT_MIN) { + return("Excellent uniformity") + } else if (slope < CV_SLOPE_HOMOGENOUS_MIN) { + return("Homogenous growth") + } else if (slope <= CV_SLOPE_HOMOGENOUS_MAX) { + return("Homogenous growth") + } else if (slope <= CV_SLOPE_PATCHINESS_MAX) { + return("Minor patchiness") + } else { + return("Severe fragmentation") + } +} + +calculate_cv_trend_long_term <- function(cv_values) { + #' Calculate 8-week CV trend via linear regression slope + #' + #' Fits linear regression to CV over available weeks (1-8) + #' Returns slope = rate of change per week + #' + #' @param cv_values vector of CV values in chronological order (oldest to newest) + #' @return numeric: Regression slope (CV change per week), rounded to 4 decimals + + if (is.null(cv_values) || length(cv_values) == 0) { + return(NA_real_) + } + + # Remove NAs + cv_clean <- cv_values[!is.na(cv_values)] + + if (length(cv_clean) < 2) { + # Need at least 2 weeks to fit a line + return(NA_real_) + } + + # Create week sequence matching data length + weeks <- seq_along(cv_clean) + + # Fit linear model + tryCatch({ + lm_fit <- lm(cv_clean ~ weeks) + slope <- coef(lm_fit)["weeks"] + return(round(as.numeric(slope), 4)) + }, error = function(e) { + return(NA_real_) + }) +} + # ============================================================================ # HELPER FUNCTIONS # ============================================================================ @@ -498,7 +655,7 @@ load_historical_field_data <- function(project_dir, current_week, reports_dir, n } USE_UNIFORM_AGE <- TRUE -UNIFORM_PLANTING_DATE <- as.Date("2025-01-01") +UNIFORM_PLANTING_DATE <- as.Date("2026-01-01") extract_planting_dates <- function(harvesting_data, field_boundaries_sf = NULL) { if (USE_UNIFORM_AGE) { @@ -688,13 +845,17 @@ calculate_field_statistics <- function(field_boundaries_sf, week_num, year, # CALCULATE KPI TRENDS (Requires previous week RDS) # ============================================================================ -calculate_kpi_trends <- function(current_stats, prev_stats = NULL) { +calculate_kpi_trends <- function(current_stats, prev_stats = NULL, + project_dir = NULL, reports_dir = NULL, + current_week = NULL, year = NULL) { message("Calculating KPI trends from current and previous week data") # Initialize new columns with defaults current_stats$Weekly_ci_change <- NA_real_ current_stats$CV_Trend_Short_Term <- NA_real_ + current_stats$Four_week_trend <- NA_real_ + current_stats$CV_Trend_Long_Term <- NA_real_ current_stats$nmr_weeks_in_this_phase <- 1L # If no previous week data, return with defaults @@ -704,36 +865,18 @@ calculate_kpi_trends <- function(current_stats, prev_stats = NULL) { } message(paste(" prev_stats has", nrow(prev_stats), "rows and", ncol(prev_stats), "columns")) - message(paste(" prev_stats columns:", paste(names(prev_stats), collapse = ", "))) # Build lookup indices for previous week (by Field_id) prev_lookup <- setNames(seq_len(nrow(prev_stats)), prev_stats$Field_id) # Try to load previous week's field_analysis to get nmr_weeks_in_this_phase history prev_field_analysis <- NULL - prev_analysis_csv <- file.path( - reports_dir, "kpis", "field_analysis", - sprintf("%s_field_analysis_week%02d.csv", - paste(strsplit(current_stats$Field_id[1], "")[[1]][1], collapse=""), # Get project from field - as.numeric(format(Sys.Date() - 7, "%V"))) # Approximate previous week - ) - - # Better way: construct the previous week number properly - current_week_num <- as.numeric(format(Sys.Date(), "%V")) - prev_week_num <- current_week_num - 1 - if (prev_week_num < 1) prev_week_num <- 52 - - # This is a bit tricky - we need the project_dir from the main scope - # For now, assume we can infer it or pass it through - # Let's use a simpler approach: look for any field_analysis_week* file that's recent tryCatch({ analysis_dir <- file.path(reports_dir, "kpis", "field_analysis") if (dir.exists(analysis_dir)) { - # Find the most recent field_analysis CSV (should be previous week) analysis_files <- list.files(analysis_dir, pattern = "_field_analysis_week.*\\.csv$", full.names = TRUE) if (length(analysis_files) > 0) { - # Sort by modification time and get the most recent recent_file <- analysis_files[which.max(file.info(analysis_files)$mtime)] prev_field_analysis <- readr::read_csv(recent_file, show_col_types = FALSE, col_select = c(Field_id, nmr_weeks_in_this_phase, Phase)) @@ -747,49 +890,139 @@ calculate_kpi_trends <- function(current_stats, prev_stats = NULL) { message(paste(" Using previous field_analysis to track nmr_weeks_in_this_phase")) } - # For each field in current week, lookup previous values + # ============================================================ + # PHASE 3: Load 4-8 weeks of historical data for trend calculations + # ============================================================ + + historical_4weeks <- list() + historical_8weeks <- list() + + if (!is.null(project_dir) && !is.null(reports_dir) && !is.null(current_week)) { + message(" Loading historical field_stats for 4-week and 8-week trends...") + + # Load up to 4 weeks back for four_week_trend + for (lookback in 1:4) { + target_week <- current_week - lookback + if (target_week < 1) target_week <- target_week + 52 + + rds_filename <- sprintf("%s_field_stats_week%02d.rds", project_dir, target_week) + rds_path <- file.path(reports_dir, "kpis", "field_stats", rds_filename) + + if (file.exists(rds_path)) { + tryCatch({ + stats_data <- readRDS(rds_path) + historical_4weeks[[length(historical_4weeks) + 1]] <- list( + week = target_week, + stats = stats_data + ) + }, error = function(e) { + message(paste(" Warning: Could not load week", target_week, ":", e$message)) + }) + } + } + + # Load up to 8 weeks back for cv_trend_long_term + for (lookback in 1:8) { + target_week <- current_week - lookback + if (target_week < 1) target_week <- target_week + 52 + + rds_filename <- sprintf("%s_field_stats_week%02d.rds", project_dir, target_week) + rds_path <- file.path(reports_dir, "kpis", "field_stats", rds_filename) + + if (file.exists(rds_path)) { + tryCatch({ + stats_data <- readRDS(rds_path) + historical_8weeks[[length(historical_8weeks) + 1]] <- list( + week = target_week, + stats = stats_data + ) + }, error = function(e) { + # Silently skip - we'll work with whatever weeks exist + }) + } + } + + if (length(historical_4weeks) > 0) { + message(paste(" Loaded", length(historical_4weeks), "weeks for 4-week trend")) + } + if (length(historical_8weeks) > 0) { + message(paste(" Loaded", length(historical_8weeks), "weeks for 8-week CV trend")) + } + } + + # For each field in current week, lookup previous values and calculate trends cv_trends_calculated <- 0 + four_week_trends_calculated <- 0 + cv_long_term_calculated <- 0 + for (i in seq_len(nrow(current_stats))) { field_id <- current_stats$Field_id[i] prev_idx <- prev_lookup[field_id] if (!is.na(prev_idx) && prev_idx > 0 && prev_idx <= nrow(prev_stats)) { - # Field exists in previous week - extract row carefully - prev_row <- prev_stats[prev_idx, , drop = FALSE] # Keep as dataframe + prev_row <- prev_stats[prev_idx, , drop = FALSE] - if (nrow(prev_row) == 0) { - # Field not found - skip - next - } - - # Access values from single-row dataframe - prev_mean_ci <- prev_row$Mean_CI[1] - prev_cv <- prev_row$CV[1] - prev_phase <- prev_row$Phase[1] - - # Weekly CI change (current Mean_CI - previous Mean_CI) - if (!is.na(prev_mean_ci) && !is.na(current_stats$Mean_CI[i])) { + # WEEKLY CI CHANGE + prev_ci <- prev_row$Mean_CI[1] + if (!is.na(prev_ci) && !is.na(current_stats$Mean_CI[i])) { current_stats$Weekly_ci_change[i] <- - round(current_stats$Mean_CI[i] - prev_mean_ci, 2) - } - - # CV short-term trend (current CV - previous CV) - # DEBUG: Check first few fields - if (i <= 3) { - message(paste(" Field", field_id, "- CV_prev:", prev_cv, "CV_curr:", current_stats$CV[i])) + round(current_stats$Mean_CI[i] - prev_ci, 2) } + # CV TREND SHORT TERM (2-week comparison) + prev_cv <- prev_row$CV[1] if (!is.na(prev_cv) && !is.na(current_stats$CV[i])) { - trend_val <- round(current_stats$CV[i] - prev_cv, 4) - current_stats$CV_Trend_Short_Term[i] <- trend_val + current_stats$CV_Trend_Short_Term[i] <- + calculate_cv_trend(current_stats$CV[i], prev_cv) cv_trends_calculated <- cv_trends_calculated + 1 + } + + # FOUR-WEEK TREND (if available) + if (length(historical_4weeks) > 0) { + ci_values_4week <- numeric() - if (i <= 3) { - message(paste(" -> CV_Trend =", trend_val)) + # Add oldest available weeks (reverse order to get oldest first) + for (hist_idx in rev(seq_along(historical_4weeks))) { + hist_data <- historical_4weeks[[hist_idx]]$stats + hist_field <- which(hist_data$Field_id == field_id) + if (length(hist_field) > 0 && !is.na(hist_data$Mean_CI[hist_field[1]])) { + ci_values_4week <- c(ci_values_4week, hist_data$Mean_CI[hist_field[1]]) + } + } + + # Add current week CI + ci_values_4week <- c(ci_values_4week, current_stats$Mean_CI[i]) + + if (length(ci_values_4week) >= 2) { + current_stats$Four_week_trend[i] <- calculate_four_week_trend(ci_values_4week) + four_week_trends_calculated <- four_week_trends_calculated + 1 } } - # Weeks in current phase (track phase transitions) + # CV TREND LONG TERM (8-week slope) + if (length(historical_8weeks) > 0) { + cv_values_8week <- numeric() + + # Add oldest available weeks (reverse order to get oldest first) + for (hist_idx in rev(seq_along(historical_8weeks))) { + hist_data <- historical_8weeks[[hist_idx]]$stats + hist_field <- which(hist_data$Field_id == field_id) + if (length(hist_field) > 0 && !is.na(hist_data$CV[hist_field[1]])) { + cv_values_8week <- c(cv_values_8week, hist_data$CV[hist_field[1]]) + } + } + + # Add current week CV + cv_values_8week <- c(cv_values_8week, current_stats$CV[i]) + + if (length(cv_values_8week) >= 2) { + slope <- calculate_cv_trend_long_term(cv_values_8week) + current_stats$CV_Trend_Long_Term[i] <- slope + cv_long_term_calculated <- cv_long_term_calculated + 1 + } + } + + # WEEKS IN CURRENT PHASE (track phase transitions) # Use previous field_analysis if available for proper counter progression if (!is.null(prev_field_analysis) && nrow(prev_field_analysis) > 0) { # Look up this field in previous analysis @@ -810,9 +1043,9 @@ calculate_kpi_trends <- function(current_stats, prev_stats = NULL) { current_stats$nmr_weeks_in_this_phase[i] <- 1L } } - } else if (!is.na(current_stats$Phase[i]) && !is.na(prev_phase)) { + } else if (!is.na(current_stats$Phase[i]) && !is.na(prev_row$Phase[1])) { # Field not in previous analysis, fall back to prev_stats phase comparison - if (current_stats$Phase[i] == prev_phase) { + if (current_stats$Phase[i] == prev_row$Phase[1]) { current_stats$nmr_weeks_in_this_phase[i] <- 2L } else { current_stats$nmr_weeks_in_this_phase[i] <- 1L @@ -820,9 +1053,9 @@ calculate_kpi_trends <- function(current_stats, prev_stats = NULL) { } } else { # No previous field_analysis available - use phase from prev_stats - if (!is.na(current_stats$Phase[i]) && !is.na(prev_phase)) { - if (current_stats$Phase[i] == prev_phase) { - # Same phase - increment counter (start with 2) + if (!is.na(current_stats$Phase[i]) && !is.na(prev_row$Phase[1])) { + if (current_stats$Phase[i] == prev_row$Phase[1]) { + # Same phase - increment counter (start with 2 since prev week was in this phase) current_stats$nmr_weeks_in_this_phase[i] <- 2L } else { # Phase changed - reset to 1 @@ -833,8 +1066,9 @@ calculate_kpi_trends <- function(current_stats, prev_stats = NULL) { } } - message(paste(" ✓ Calculated", cv_trends_calculated, "CV_Trend_Short_Term values out of", nrow(current_stats), "fields")) - message(paste(" CV_Trend_Short_Term non-NA values:", sum(!is.na(current_stats$CV_Trend_Short_Term)))) + message(paste(" ✓ Calculated CV_Trend_Short_Term:", cv_trends_calculated, "fields")) + message(paste(" ✓ Calculated Four_week_trend:", four_week_trends_calculated, "fields")) + message(paste(" ✓ Calculated CV_Trend_Long_Term:", cv_long_term_calculated, "fields")) return(current_stats) } @@ -1702,9 +1936,13 @@ main <- function() { # Apply trend calculations (requires both weeks) message("\n3. Calculating trend columns...") - current_stats <- calculate_kpi_trends(current_stats, prev_stats) + current_stats <- calculate_kpi_trends(current_stats, prev_stats, + project_dir = project_dir, + reports_dir = reports_dir, + current_week = current_week, + year = year) - message(paste(" ✓ Added Weekly_ci_change, CV_Trend_Short_Term, nmr_weeks_in_this_phase")) + message(paste(" ✓ Added Weekly_ci_change, CV_Trend_Short_Term, Four_week_trend, CV_Trend_Long_Term, nmr_weeks_in_this_phase")) # ============================================================================ # Build final output dataframe with all 21 columns @@ -1758,8 +1996,7 @@ main <- function() { if_else(is.na(acreages_vec), 0, acreages_vec) }, # Columns 5-6: Already in current_stats (Mean_CI, Weekly_ci_change) - # Column 7: Four_week_trend (Phase 3 future) - Four_week_trend = NA_character_, + # Column 7: Four_week_trend (from current_stats) # Column 8: Last_harvest_or_planting_date (dummy for now) Last_harvest_or_planting_date = UNIFORM_PLANTING_DATE, # Columns 9-10: Already in current_stats (Age_week, Phase) @@ -1788,8 +2025,11 @@ main <- function() { # Columns 15-16: Already in current_stats (CI_range, CI_Percentiles) # Column 17: Already in current_stats (CV) # Column 18: Already in current_stats (CV_Trend_Short_Term) - # Column 19: CV_Trend_Long_Term (Phase 3 future) - CV_Trend_Long_Term = NA_real_, + # Column 19: CV_Trend_Long_Term (from current_stats - raw slope value) + # Column 19b: CV_Trend_Long_Term_Category (categorical interpretation of slope) + CV_Trend_Long_Term_Category = { + sapply(current_stats$CV_Trend_Long_Term, categorize_cv_slope) + }, # Columns 20-21: Already in current_stats (Cloud_pct_clear, Cloud_category) .keep = "all" # Keep all existing columns ) %>% @@ -1797,7 +2037,7 @@ main <- function() { Field_id, Farm_Section, Field_name, Acreage, Mean_CI, Weekly_ci_change, Four_week_trend, Last_harvest_or_planting_date, Age_week, Phase, nmr_weeks_in_this_phase, Germination_progress, Imminent_prob, Status_trigger, - CI_range, CI_Percentiles, CV, CV_Trend_Short_Term, CV_Trend_Long_Term, + CI_range, CI_Percentiles, CV, CV_Trend_Short_Term, CV_Trend_Long_Term, CV_Trend_Long_Term_Category, Cloud_pct_clear, Cloud_category )