# ============================================================================
# HARVEST WINDOW PREDICTION - FORWARD-LOOKING SYSTEM
# Predict harvest 7-14 days AHEAD for factory logistics planning
# ============================================================================
# Use case: Factory needs advance warning when harvest is imminent
# Strategy: Detect when a field enters the "harvest-ready window" based on
#           sustained low CI, indicating crop maturation is complete
# ============================================================================

suppressPackageStartupMessages({
  library(readxl)
  library(dplyr)
  library(tidyr)
  library(lubridate)
  library(here)
  library(ggplot2)
})

# Set project directory (also published to the global environment so that
# the sourced parameters script can see it even when this file is sourced
# into a local environment)
project_dir <- "esa"
assign("project_dir", project_dir, envir = .GlobalEnv)

# Navigate to project root if in experiments folder.
# NOTE(review): kept for backward compatibility with the sourced parameters
# script, which may rely on the working directory; all paths built in this
# file go through here() and do not depend on it.
if (basename(getwd()) == "harvest_prediction") {
  setwd("../../..")
}

source(here("r_app", "parameters_project.R"))

# ============================================================================
# CONFIGURATION
# ============================================================================

CONFIG <- list(
  # Minimum field age before harvest is possible (8 months)
  min_field_age_days = 240,

  # CI thresholds for maturity assessment
  ci_threshold_low = 2.5,       # At or below this = mature crop
  ci_threshold_very_low = 1.5,  # At or below this = very mature/bare patches

  # Sustained low CI indicates "harvest window"
  sustained_low_days = 5,       # CI at/below threshold for N consecutive obs

  # Advanced warning levels (printed only; not used by the prediction logic)
  warning_early = 14,           # Days ahead for early warning
  warning_imminent = 7,         # Days ahead for imminent warning

  # Minimum days since last harvest (ratoon cycle); currently not referenced
  # by the prediction logic in this file
  min_days_since_harvest = 200,

  # Validation window
  test_window_days = 21         # Test from -21 days before actual harvest
)

cat("=== HARVEST WINDOW PREDICTION CONFIGURATION ===\n\n")
cat("Goal: Predict harvest 7-14 days AHEAD for factory planning\n\n")
cat("Minimum field age:", CONFIG$min_field_age_days, "days (",
    round(CONFIG$min_field_age_days / 30, 1), "months )\n")
cat("CI thresholds: Low =", CONFIG$ci_threshold_low,
    "| Very Low =", CONFIG$ci_threshold_very_low, "\n")
cat("Sustained low CI requirement:", CONFIG$sustained_low_days,
    "consecutive days\n")
cat("Warning levels: Early =", CONFIG$warning_early,
    "days | Imminent =", CONFIG$warning_imminent, "days\n\n")

# ============================================================================
# LOAD DATA
# ============================================================================

cat("=== LOADING DATA ===\n\n")

# Load CI time series
ci_rds_file <- here(
  "laravel_app/storage/app", project_dir,
  "Data/extracted_ci/cumulative_vals/All_pivots_Cumulative_CI_quadrant_year_v2.rds"
)
ci_data_raw <- readRDS(ci_rds_file) %>%
  ungroup()

# One row per field/date with the fitted CI value; incomplete rows dropped.
# NOTE(review): if the source can hold several rows per field and date,
# downstream counts treat each row as one "day" - confirm uniqueness.
time_series_daily <- ci_data_raw %>%
  mutate(date = as.Date(Date)) %>%
  select(field_id = field, date, mean_ci = FitData) %>%
  filter(!is.na(mean_ci), !is.na(date), !is.na(field_id)) %>%
  arrange(field_id, date)

# Load harvest data (same storage root and project as the CI file, so the
# path no longer depends on the working directory or a hard-coded project)
harvest_file <- here("laravel_app/storage/app", project_dir, "Data/harvest.xlsx")
harvest_data <- read_excel(harvest_file) %>%
  mutate(
    season_start = as.Date(season_start),
    season_end = as.Date(season_end)
  ) %>%
  filter(!is.na(season_end))

# Get fields with both CI and harvest data
fields_with_ci <- unique(time_series_daily$field_id)

harvest_data_filtered <- harvest_data %>%
  filter(field %in% fields_with_ci) %>%
  arrange(field, season_end)

cat("Fields with CI data:", length(fields_with_ci), "\n")
cat("Fields with harvest records:",
    length(unique(harvest_data_filtered$field)), "\n")
cat("Total harvest events:", nrow(harvest_data_filtered), "\n\n")

# ============================================================================
# PREDICTION FUNCTION
# ============================================================================

#' Predict whether a field is entering its harvest window on a given date
#'
#' Logic:
#'   1. Require a CI observation on `check_date`.
#'   2. Require field age >= `config$min_field_age_days`. Age is measured
#'      from `last_harvest_date`, or from the earliest CI date in `field_ts`
#'      when no previous harvest is known.
#'   3. Count the consecutive most-recent observations (within the last
#'      30 days, walking backwards from `check_date`) whose CI is at or
#'      below `config$ci_threshold_low`.
#'   4. Grade the result: >= `config$sustained_low_days` gives "imminent"
#'      (mean CI at/below `config$ci_threshold_very_low`) or "likely";
#'      2 or more gives "possible"; otherwise no prediction.
#'
#' @param field_ts Data frame with columns `date` (Date) and `mean_ci`.
#' @param check_date Date on which the prediction is made.
#' @param last_harvest_date Date of the previous harvest, or NA/NULL.
#' @param config List of thresholds; see `CONFIG`.
#' @return A list with `predicted`, `confidence`, `current_ci`,
#'   `mean_ci_sustained`, `consecutive_days_low`, `field_age` and
#'   `harvest_window`. Every return path carries the same fields.
predict_harvest_window <- function(field_ts, check_date, last_harvest_date,
                                   config = CONFIG) {
  # Get current CI
  current_ci <- field_ts %>%
    filter(date == check_date) %>%
    pull(mean_ci)

  if (length(current_ci) == 0) {
    return(list(
      predicted = FALSE,
      confidence = "no_data",
      current_ci = NA,
      mean_ci_sustained = NA_real_,
      consecutive_days_low = 0,
      field_age = NA,
      harvest_window = "unknown"
    ))
  }

  # Calculate field age
  if (is.null(last_harvest_date) || is.na(last_harvest_date)) {
    # First harvest - use earliest CI date as planting proxy
    earliest_date <- min(field_ts$date, na.rm = TRUE)
    field_age <- as.numeric(check_date - earliest_date)
  } else {
    field_age <- as.numeric(check_date - last_harvest_date)
  }

  # Check minimum age requirement
  if (field_age < config$min_field_age_days) {
    return(list(
      predicted = FALSE,
      confidence = "too_young",
      current_ci = current_ci,
      mean_ci_sustained = NA_real_,
      consecutive_days_low = 0,
      field_age = field_age,
      harvest_window = "not_ready"
    ))
  }

  # Observations of the last 30 days, most recent first
  recent_data <- field_ts %>%
    filter(date <= check_date, date >= check_date - 30) %>%
    arrange(desc(date))

  # Length of the leading run of low-CI observations: everything before the
  # first observation that is above the threshold (or missing)
  is_low <- recent_data$mean_ci <= config$ci_threshold_low
  first_not_low <- which(!is_low | is.na(is_low))[1]
  consecutive_days_low <- if (is.na(first_not_low)) {
    as.numeric(length(is_low))
  } else {
    as.numeric(first_not_low - 1)
  }

  # Mean CI over the sustained low period
  mean_ci_sustained <- if (consecutive_days_low > 0) {
    mean(recent_data$mean_ci[seq_len(consecutive_days_low)], na.rm = TRUE)
  } else {
    NA_real_
  }

  # Determine prediction confidence and harvest window
  predicted <- FALSE
  confidence <- "none"
  harvest_window <- "not_ready"

  if (consecutive_days_low >= config$sustained_low_days) {
    predicted <- TRUE
    # Assess severity based on mean CI during sustained period
    if (!is.na(mean_ci_sustained) &&
        mean_ci_sustained <= config$ci_threshold_very_low) {
      confidence <- "imminent"  # Very low CI = harvest within 7 days
      harvest_window <- "7_days"
    } else {
      confidence <- "likely"    # Low CI = harvest within 7-14 days
      harvest_window <- "7_14_days"
    }
  } else if (consecutive_days_low >= 2) {
    # Starting to show maturity signals
    predicted <- TRUE
    confidence <- "possible"
    harvest_window <- "14_21_days"
  }

  list(
    predicted = predicted,
    confidence = confidence,
    current_ci = current_ci,
    mean_ci_sustained = mean_ci_sustained,
    consecutive_days_low = consecutive_days_low,
    field_age = field_age,
    harvest_window = harvest_window
  )
}

# ============================================================================
# VALIDATION FUNCTION
# ============================================================================

#' Back-test the harvest window prediction against recorded harvests
#'
#' For every recorded harvest of `field_id`, runs `predict_harvest_window()`
#' on each day from `CONFIG$test_window_days` before to 14 days after the
#' harvest, prints a per-event report, a timing table and a summary.
#' Reads the script-level `time_series_daily`, `harvest_data_filtered` and
#' `CONFIG` objects.
#'
#' Each harvest event is classified exactly once, by its FIRST detection in
#' the tested window (early / optimal / late / missed), so the summary
#' counts are mutually exclusive and always add up to the event total.
#'
#' @param field_id Field identifier to validate.
#' @param test_field Unused; kept for backward compatibility.
#' @return Invisibly, a list with `all_results`, `detection_timing`,
#'   `optimal_detections` and a one-row `summary` data frame; `NULL` when
#'   the field has no harvest records.
validate_harvest_prediction <- function(field_id, test_field = NULL) {
  # Get field data (`!!` forces the function argument, not a column)
  field_ts <- time_series_daily %>%
    filter(field_id == !!field_id) %>%
    arrange(date)

  field_harvests <- harvest_data_filtered %>%
    filter(field == !!field_id) %>%
    arrange(season_end)

  if (nrow(field_harvests) == 0) {
    cat("No harvest records for field", field_id, "\n")
    return(NULL)
  }

  cat("\n", rep("=", 80), "\n", sep = "")
  cat("Testing field:", field_id, "\n")
  cat("Field has", nrow(field_harvests), "recorded harvest events\n")
  cat(rep("=", 80), "\n\n", sep = "")

  n_events <- nrow(field_harvests)
  all_results <- vector("list", n_events)
  timing_rows <- vector("list", n_events)
  timing_offsets <- c(-21, -14, -7, -3, -1, 0, 1, 3, 7, 14)

  # Test each harvest event
  for (h in seq_len(n_events)) {
    harvest_date <- field_harvests$season_end[h]

    # Get previous harvest for field age calculation
    last_harvest <- if (h == 1) NA else field_harvests$season_end[h - 1]

    # Test dates from -test_window_days to +14 days around harvest
    test_dates_seq <- seq.Date(
      from = harvest_date - CONFIG$test_window_days,
      to = harvest_date + 14,
      by = "1 day"
    )

    # Run prediction for each test date; rows are collected in a
    # preallocated list and bound once
    event_rows <- vector("list", length(test_dates_seq))
    first_detection <- NULL

    for (i in seq_along(test_dates_seq)) {
      test_date <- test_dates_seq[i]
      days_from_harvest <- as.numeric(test_date - harvest_date)

      result <- predict_harvest_window(field_ts, test_date, last_harvest,
                                       CONFIG)

      # Track first detection
      if (result$predicted && is.null(first_detection)) {
        first_detection <- list(
          date = test_date,
          days_before = -days_from_harvest,
          confidence = result$confidence,
          consecutive_days = result$consecutive_days_low,
          mean_ci = result$mean_ci_sustained,
          harvest_window = result$harvest_window
        )
      }

      event_rows[[i]] <- data.frame(
        harvest_event = h,
        harvest_date = harvest_date,
        test_date = test_date,
        days_from_harvest = days_from_harvest,
        predicted = result$predicted,
        confidence = result$confidence,
        current_ci = result$current_ci,
        consecutive_days_low = result$consecutive_days_low,
        field_age = result$field_age,
        harvest_window = result$harvest_window
      )
    }

    event_results <- bind_rows(event_rows)
    all_results[[h]] <- event_results

    # Print harvest event summary
    cat("--- Harvest Event", h, ":", format(harvest_date, "%Y-%m-%d"),
        "---\n")

    if (!is.null(first_detection)) {
      cat("✓ First prediction:", format(first_detection$date, "%Y-%m-%d"),
          "(", first_detection$days_before, "days before harvest )\n")
      cat(" Confidence:", first_detection$confidence, "\n")
      cat(" Harvest window:", first_detection$harvest_window, "\n")
      cat(" Consecutive days low CI:", first_detection$consecutive_days,
          "\n")
      cat(" Mean CI during period:", round(first_detection$mean_ci, 2),
          "\n")

      # Categorize detection timing
      if (first_detection$days_before >= 7 &&
          first_detection$days_before <= 21) {
        cat(" ✓ GOOD: Detected in optimal window (7-21 days ahead)\n")
      } else if (first_detection$days_before > 21) {
        cat(" ⚠️ EARLY: Detected >21 days ahead\n")
      } else if (first_detection$days_before >= 0) {
        cat(" ⚠️ LATE: Detected <7 days ahead\n")
      } else {
        cat(" ✗ MISSED: Detected after harvest\n")
      }
    } else {
      cat("✗ No prediction detected\n")
    }
    cat("\n")

    # Build detection timing matrix
    timing_row <- data.frame(harvest_event = h)
    for (offset in timing_offsets) {
      detected_on_day <- event_results %>%
        filter(days_from_harvest == offset) %>%
        pull(predicted)

      sign_label <- if (offset >= 0) "_plus_" else "_minus_"
      col_name <- paste0("d", sign_label, abs(offset))
      # any() keeps the condition scalar even if a date yields several rows
      timing_row[[col_name]] <-
        if (isTRUE(any(detected_on_day))) "YES" else "NO"
    }
    timing_rows[[h]] <- timing_row
  }

  detection_timing <- bind_rows(timing_rows)

  # Print detection timing table
  cat("\n", rep("=", 80), "\n", sep = "")
  cat("PREDICTION TIMING TABLE\n")
  cat("Columns: Days relative to harvest date\n")
  cat(rep("=", 80), "\n\n", sep = "")
  print(detection_timing, row.names = FALSE)

  # Calculate summary statistics
  all_results_df <- bind_rows(all_results)

  # Earliest positive prediction per harvest event; each event is then
  # assigned to exactly one timing category
  first_hits <- all_results_df %>%
    filter(predicted == TRUE) %>%
    arrange(harvest_event, days_from_harvest) %>%
    group_by(harvest_event) %>%
    slice(1) %>%
    ungroup()

  # Optimal prediction window (7-21 days before)
  optimal_detections <- first_hits %>%
    filter(days_from_harvest >= -21, days_from_harvest <= -7)

  early_detections <- first_hits %>%
    filter(days_from_harvest < -21)

  # Late = first detected less than 7 days ahead, but not after harvest
  late_detections <- first_hits %>%
    filter(days_from_harvest > -7, days_from_harvest <= 0)

  total_harvests <- nrow(field_harvests)

  # Events detected at any time up to and including the harvest date
  detected_total <- all_results_df %>%
    filter(predicted == TRUE, days_from_harvest <= 0) %>%
    distinct(harvest_event) %>%
    nrow()

  # Never detected, or detected only after the harvest
  missed_total <- total_harvests - detected_total

  cat("\n", rep("=", 80), "\n", sep = "")
  cat("VALIDATION SUMMARY\n")
  cat(rep("=", 80), "\n\n", sep = "")

  cat("Total harvest events tested:", total_harvests, "\n\n")

  cat("Predictions in OPTIMAL window (7-21 days ahead):",
      nrow(optimal_detections), "/", total_harvests, "(",
      round(100 * nrow(optimal_detections) / total_harvests, 1), "% )\n")
  cat("Predictions TOO EARLY (>21 days ahead):", nrow(early_detections),
      "\n")
  cat("Predictions TOO LATE (<7 days ahead):", nrow(late_detections), "\n")
  cat("Missed harvests:", missed_total, "\n\n")

  # Overall detection rate
  cat("Overall detection rate (any time before harvest):",
      detected_total, "/", total_harvests, "(",
      round(100 * detected_total / total_harvests, 1), "% )\n\n")

  # Return detailed results
  invisible(list(
    all_results = all_results_df,
    detection_timing = detection_timing,
    optimal_detections = optimal_detections,
    summary = data.frame(
      field = field_id,
      total_harvests = total_harvests,
      optimal_window = nrow(optimal_detections),
      too_early = nrow(early_detections),
      too_late = nrow(late_detections),
      missed = missed_total,
      detection_rate = round(100 * detected_total / total_harvests, 1)
    )
  ))
}

# ============================================================================
# RUN VALIDATION
# ============================================================================

# Test on Field 00110 (from your graphs)
test_field <- "00110"
results <- validate_harvest_prediction(test_field)

cat("\n", rep("=", 80), "\n", sep = "")
cat("INTERPRETATION FOR FACTORY CLIENT\n")
cat(rep("=", 80), "\n\n", sep = "")

cat("This system provides ADVANCE WARNING when harvest is likely imminent:\n\n")

cat(" 📊 HARVEST WINDOW PREDICTIONS:\n")
cat(" - '7_days': Harvest expected within 7 days (IMMINENT)\n")
cat(" - '7_14_days': Harvest expected in 7-14 days (LIKELY)\n")
cat(" - '14_21_days': Harvest possible in 14-21 days (WATCH)\n\n")

# Thresholds are read from CONFIG so this text cannot drift from the logic
cat(" ⚙️ DETECTION LOGIC:\n")
cat(" - CI below", CONFIG$ci_threshold_low, "for",
    CONFIG$sustained_low_days, "consecutive days = crop mature\n")
cat(paste0(" - Very low CI (<", CONFIG$ci_threshold_very_low,
           ") = harvest imminent (7 days)\n"))
cat(paste0(" - Low CI (", CONFIG$ci_threshold_very_low, "-",
           CONFIG$ci_threshold_low, ") = harvest likely (7-14 days)\n\n"))

cat(" 🏭 FACTORY USE CASE:\n")
cat(" - Factory gets 7-21 days advance notice to plan logistics\n")
cat(" - Can schedule processing capacity and transport\n")
cat(" - Avoids surprise harvest deliveries\n\n")

cat("=== ANALYSIS COMPLETE ===\n")