# ============================================================================ # OPERATIONAL HARVEST PREDICTION # Analyze current season growth curves to predict harvest timing # ============================================================================ suppressPackageStartupMessages({ library(readxl) library(dplyr) library(tidyr) library(lubridate) library(terra) library(sf) library(here) library(ggplot2) }) # Set project directory project_dir <- "esa" assign("project_dir", project_dir, envir = .GlobalEnv) source(here("r_app", "parameters_project.R")) # ============================================================================ # STEP 1: LOAD DATA # ============================================================================ cat("=== LOADING DATA ===\n\n") # Load CI time series ci_rds_file <- here("laravel_app/storage/app", project_dir, "Data/extracted_ci/cumulative_vals/All_pivots_Cumulative_CI_quadrant_year_v2.rds") ci_data_raw <- readRDS(ci_rds_file) %>% ungroup() time_series_daily <- ci_data_raw %>% mutate( date = as.Date(Date), week = isoweek(date), year = isoyear(date) ) %>% select( field_id = field, date, week, year, mean_ci = FitData ) %>% filter(!is.na(mean_ci), !is.na(date), !is.na(field_id)) %>% arrange(field_id, date) # Load harvest data harvest_data <- read_excel('laravel_app/storage/app/esa/Data/harvest.xlsx') %>% mutate( season_start = as.Date(season_start), season_end = as.Date(season_end) ) %>% filter(!is.na(season_end)) fields_with_ci <- unique(time_series_daily$field_id) harvest_data_filtered <- harvest_data %>% filter(field %in% fields_with_ci) %>% arrange(field, season_end) cat("Loaded CI data for", length(fields_with_ci), "fields\n") cat("Loaded harvest data for", length(unique(harvest_data_filtered$field)), "fields\n\n") # ============================================================================ # STEP 2: SEGMENT TIME SERIES BY SEASON # ============================================================================ cat("=== SEGMENTING TIME SERIES INTO INDIVIDUAL SEASONS ===\n\n") # For each field, create seasons based on harvest dates # Season starts day after previous harvest, ends at next harvest create_seasons <- function(field_name, ci_ts, harvest_df) { # Get CI data for this field field_ci <- ci_ts %>% filter(field_id == field_name) %>% arrange(date) # Get harvest dates for this field field_harvests <- harvest_df %>% filter(field == field_name) %>% arrange(season_end) %>% mutate(season_id = row_number()) if (nrow(field_harvests) == 0) { return(NULL) } # Create season segments seasons_list <- list() for (i in 1:nrow(field_harvests)) { # Season start: day after previous harvest (or start of data if first season) if (i == 1) { season_start <- min(field_ci$date) } else { season_start <- field_harvests$season_end[i-1] + 1 } # Season end: current harvest date season_end <- field_harvests$season_end[i] # Extract CI data for this season season_ci <- field_ci %>% filter(date >= season_start, date <= season_end) if (nrow(season_ci) > 0) { season_ci$season_id <- i season_ci$season_start_date <- season_start season_ci$season_end_date <- season_end season_ci$days_in_season <- as.numeric(season_end - season_start) season_ci$days_since_start <- as.numeric(season_ci$date - season_start) season_ci$days_until_harvest <- as.numeric(season_end - season_ci$date) seasons_list[[i]] <- season_ci } } # Add current ongoing season (after last harvest) if (nrow(field_harvests) > 0) { last_harvest <- field_harvests$season_end[nrow(field_harvests)] current_season_start <- last_harvest + 1 current_season_ci <- field_ci %>% filter(date >= current_season_start) if (nrow(current_season_ci) > 0) { current_season_ci$season_id <- nrow(field_harvests) + 1 current_season_ci$season_start_date <- current_season_start current_season_ci$season_end_date <- NA # Unknown - this is what we're predicting current_season_ci$days_in_season <- NA current_season_ci$days_since_start <- as.numeric(current_season_ci$date - current_season_start) current_season_ci$days_until_harvest <- NA seasons_list[[length(seasons_list) + 1]] <- current_season_ci } } if (length(seasons_list) > 0) { return(bind_rows(seasons_list)) } else { return(NULL) } } # Create segmented data for all fields all_seasons <- lapply(fields_with_ci, function(field_name) { seasons <- create_seasons(field_name, time_series_daily, harvest_data_filtered) if (!is.null(seasons)) { seasons$field_id <- field_name } return(seasons) }) %>% bind_rows() cat("Created", nrow(all_seasons), "season-segmented observations\n") cat("Total seasons:", length(unique(paste(all_seasons$field_id, all_seasons$season_id))), "\n\n") # Summary by season season_summary <- all_seasons %>% group_by(field_id, season_id) %>% summarise( season_start = min(season_start_date), season_end = max(season_end_date), n_observations = n(), days_duration = max(days_in_season, na.rm = TRUE), max_ci = max(mean_ci, na.rm = TRUE), is_current = all(is.na(season_end_date)), .groups = "drop" ) cat("Season summary:\n") print(head(season_summary, 20)) # ============================================================================ # STEP 3: GROWTH CURVE ANALYSIS PER SEASON # ============================================================================ cat("\n\n=== ANALYZING GROWTH CURVES PER SEASON ===\n\n") # Smoothing function (Savitzky-Golay style moving average) smooth_ci <- function(ci_values, window = 15) { n <- length(ci_values) if (n < window) window <- max(3, n) smoothed <- rep(NA, n) half_window <- floor(window / 2) for (i in 1:n) { start_idx <- max(1, i - half_window) end_idx <- min(n, i + half_window) smoothed[i] <- mean(ci_values[start_idx:end_idx], na.rm = TRUE) } return(smoothed) } # Detect peak and senescence analyze_season_curve <- function(season_df) { if (nrow(season_df) < 20) { return(list( peak_date = NA, peak_ci = NA, peak_days_since_start = NA, senescence_start_date = NA, senescence_rate = NA, current_phase = "insufficient_data" )) } # Smooth the curve season_df$ci_smooth <- smooth_ci(season_df$mean_ci) # Find peak peak_idx <- which.max(season_df$ci_smooth) peak_date <- season_df$date[peak_idx] peak_ci <- season_df$ci_smooth[peak_idx] peak_days <- season_df$days_since_start[peak_idx] # Check if we're past the peak last_date <- max(season_df$date) is_post_peak <- last_date > peak_date # Calculate senescence rate (slope after peak) if (is_post_peak && peak_idx < nrow(season_df) - 5) { post_peak_data <- season_df[(peak_idx):nrow(season_df), ] # Fit linear model to post-peak data lm_post <- lm(ci_smooth ~ days_since_start, data = post_peak_data) senescence_rate <- coef(lm_post)[2] # Slope senescence_start <- peak_date } else { senescence_rate <- NA senescence_start <- NA } # Determine current phase current_ci <- tail(season_df$ci_smooth, 1) if (is.na(current_ci)) { current_phase <- "unknown" } else if (!is_post_peak) { current_phase <- "growing" } else if (current_ci > 2.5) { current_phase <- "post_peak_maturing" } else { current_phase <- "declining_harvest_approaching" } return(list( peak_date = peak_date, peak_ci = peak_ci, peak_days_since_start = peak_days, senescence_start_date = senescence_start, senescence_rate = senescence_rate, current_phase = current_phase, current_ci = current_ci, last_obs_date = last_date )) } # Analyze each season season_analysis <- all_seasons %>% group_by(field_id, season_id) %>% group_modify(~ { analysis <- analyze_season_curve(.x) as.data.frame(analysis) }) %>% ungroup() # Merge with season summary season_results <- season_summary %>% left_join(season_analysis, by = c("field_id", "season_id")) cat("Analyzed", nrow(season_results), "seasons\n\n") # ============================================================================ # STEP 4: HARVEST TIMING PATTERNS (Historical Analysis) # ============================================================================ cat("=== ANALYZING HISTORICAL HARVEST TIMING PATTERNS ===\n\n") # Look at completed seasons only historical_seasons <- season_results %>% filter(!is_current) %>% mutate( days_peak_to_harvest = as.numeric(season_end - peak_date) ) cat("Historical season statistics (completed harvests):\n\n") cat("Average days from peak to harvest:\n") peak_to_harvest_stats <- historical_seasons %>% filter(!is.na(days_peak_to_harvest)) %>% summarise( mean_days = mean(days_peak_to_harvest, na.rm = TRUE), median_days = median(days_peak_to_harvest, na.rm = TRUE), sd_days = sd(days_peak_to_harvest, na.rm = TRUE), min_days = min(days_peak_to_harvest, na.rm = TRUE), max_days = max(days_peak_to_harvest, na.rm = TRUE) ) print(peak_to_harvest_stats) cat("\n\nPeak CI at harvest time:\n") peak_ci_stats <- historical_seasons %>% filter(!is.na(peak_ci)) %>% summarise( mean_peak_ci = mean(peak_ci, na.rm = TRUE), median_peak_ci = median(peak_ci, na.rm = TRUE), sd_peak_ci = sd(peak_ci, na.rm = TRUE) ) print(peak_ci_stats) cat("\n\nSenescence rate (CI decline per day after peak):\n") senescence_stats <- historical_seasons %>% filter(!is.na(senescence_rate), senescence_rate < 0) %>% summarise( mean_rate = mean(senescence_rate, na.rm = TRUE), median_rate = median(senescence_rate, na.rm = TRUE), sd_rate = sd(senescence_rate, na.rm = TRUE) ) print(senescence_stats) # ============================================================================ # STEP 5: CURRENT SEASON PREDICTIONS # ============================================================================ cat("\n\n=== PREDICTING HARVEST FOR CURRENT ONGOING SEASONS ===\n\n") # Get current seasons current_seasons <- season_results %>% filter(is_current) %>% mutate( # Use historical average to predict harvest predicted_harvest_date = peak_date + peak_to_harvest_stats$mean_days, days_until_predicted_harvest = as.numeric(predicted_harvest_date - last_obs_date), weeks_until_predicted_harvest = days_until_predicted_harvest / 7 ) cat("Current ongoing seasons (ready for harvest prediction):\n\n") current_predictions <- current_seasons %>% mutate( days_since_peak = as.numeric(last_obs_date - peak_date) ) %>% select( field_id, season_id, last_harvest = season_start, last_observation = last_obs_date, current_ci, current_phase, peak_date, peak_ci, days_since_peak, predicted_harvest = predicted_harvest_date, weeks_until_harvest = weeks_until_predicted_harvest ) %>% arrange(weeks_until_harvest) print(current_predictions) cat("\n\nHarvest readiness assessment:\n\n") harvest_alerts <- current_predictions %>% mutate( alert = case_when( current_ci < 2.5 & current_phase == "declining_harvest_approaching" ~ "🚨 HARVEST IMMINENT (CI < 2.5)", current_ci < 3.0 & weeks_until_harvest < 2 ~ "⚠️ HARVEST WITHIN 2 WEEKS", weeks_until_harvest < 4 ~ "💡 HARVEST WITHIN 1 MONTH", current_phase == "growing" ~ "✅ STILL GROWING", TRUE ~ "📊 MONITORING" ) ) %>% select(field_id, current_ci, current_phase, predicted_harvest, alert) print(harvest_alerts) # ============================================================================ # STEP 6: VALIDATION OF PREDICTION METHOD # ============================================================================ cat("\n\n=== VALIDATING PREDICTION METHOD ON HISTORICAL DATA ===\n\n") # For each historical season, predict when harvest would occur using only data up to peak validation_results <- historical_seasons %>% filter(!is.na(peak_date), !is.na(season_end)) %>% mutate( predicted_harvest = peak_date + peak_to_harvest_stats$mean_days, actual_harvest = season_end, prediction_error_days = as.numeric(predicted_harvest - actual_harvest), prediction_error_weeks = prediction_error_days / 7 ) cat("Prediction accuracy metrics:\n\n") accuracy_metrics <- validation_results %>% summarise( n_predictions = n(), mean_error_days = mean(abs(prediction_error_days), na.rm = TRUE), median_error_days = median(abs(prediction_error_days), na.rm = TRUE), rmse_days = sqrt(mean(prediction_error_days^2, na.rm = TRUE)), within_2_weeks = sum(abs(prediction_error_weeks) <= 2, na.rm = TRUE), pct_within_2_weeks = 100 * sum(abs(prediction_error_weeks) <= 2, na.rm = TRUE) / n() ) print(accuracy_metrics) cat("\n\nSample predictions vs actual:\n") print(validation_results %>% select(field_id, season_id, peak_date, predicted_harvest, actual_harvest, prediction_error_weeks) %>% head(15)) # ============================================================================ # SUMMARY # ============================================================================ cat("\n\n=== OPERATIONAL HARVEST PREDICTION SUMMARY ===\n\n") cat("METHODOLOGY:\n") cat("1. Segment CI time series by harvest dates (each season = planting to harvest)\n") cat("2. Smooth CI data to identify peak (maturity point)\n") cat("3. Historical pattern: Average", round(peak_to_harvest_stats$mean_days), "days from peak to harvest\n") cat("4. Current season prediction: Peak date +", round(peak_to_harvest_stats$mean_days), "days\n\n") cat("PREDICTION ACCURACY (Historical Validation):\n") cat(" - Mean absolute error:", round(accuracy_metrics$mean_error_days), "days\n") cat(" - RMSE:", round(accuracy_metrics$rmse_days), "days\n") cat(" - Accuracy within 2 weeks:", round(accuracy_metrics$pct_within_2_weeks), "%\n\n") cat("HARVEST TRIGGER (Operational Rule):\n") cat(" - Primary: CI drops below 2.5 while in declining phase\n") cat(" - Secondary: Predicted harvest date approaches (±2 weeks)\n") cat(" - Confirmation: Visual inspection when both conditions met\n\n") cat("FIELDS READY FOR HARVEST NOW:\n") ready_now <- harvest_alerts %>% filter(grepl("IMMINENT|WITHIN 2 WEEKS", alert)) if (nrow(ready_now) > 0) { print(ready_now) } else { cat(" No fields at immediate harvest stage\n") } cat("\n=== ANALYSIS COMPLETE ===\n")