# ============================================================================ # DAILY-SCALE HARVEST DETECTION AND VALIDATION # Real-time detection using CI drop patterns # ============================================================================ suppressPackageStartupMessages({ library(readxl) library(dplyr) library(tidyr) library(lubridate) library(here) library(ggplot2) }) # Set project directory project_dir <- "esa" assign("project_dir", project_dir, envir = .GlobalEnv) # Navigate to project root if in experiments folder if (basename(getwd()) == "harvest_prediction") { setwd("../../..") } source(here("r_app", "parameters_project.R")) # ============================================================================ # CONFIGURATION # ============================================================================ CONFIG <- list( min_field_age_days = 240, # 8 months minimum age ci_threshold = 2.5, # Below this = potential harvest drop_threshold_low = 1.0, # Minimum CI drop to flag drop_threshold_high = 2.0, # Strong CI drop lookback_days_min = 7, # Compare with 7-14 days ago lookback_days_max = 14, confirmation_days = 3, # Days below threshold for confirmation test_window_days = 14 # Test ±14 days around actual harvest ) cat("=== DAILY HARVEST DETECTION CONFIGURATION ===\n\n") cat("Minimum field age:", CONFIG$min_field_age_days, "days (", round(CONFIG$min_field_age_days/30, 1), "months )\n") cat("CI threshold:", CONFIG$ci_threshold, "\n") cat("Drop thresholds:", CONFIG$drop_threshold_low, "and", CONFIG$drop_threshold_high, "\n") cat("Lookback window:", CONFIG$lookback_days_min, "-", CONFIG$lookback_days_max, "days\n") cat("Confirmation window:", CONFIG$confirmation_days, "consecutive days\n\n") # ============================================================================ # LOAD DATA # ============================================================================ cat("=== LOADING DATA ===\n\n") # Load CI time series ci_rds_file <- here("laravel_app/storage/app", project_dir, "Data/extracted_ci/cumulative_vals/All_pivots_Cumulative_CI_quadrant_year_v2.rds") ci_data_raw <- readRDS(ci_rds_file) %>% ungroup() time_series_daily <- ci_data_raw %>% mutate(date = as.Date(Date)) %>% select(field_id = field, date, mean_ci = FitData) %>% filter(!is.na(mean_ci), !is.na(date), !is.na(field_id)) %>% arrange(field_id, date) # Load harvest data harvest_data <- read_excel('laravel_app/storage/app/esa/Data/harvest.xlsx') %>% mutate( season_start = as.Date(season_start), season_end = as.Date(season_end) ) %>% filter(!is.na(season_end)) # Get fields with both CI and harvest data fields_with_ci <- unique(time_series_daily$field_id) harvest_data_filtered <- harvest_data %>% filter(field %in% fields_with_ci) %>% arrange(field, season_end) cat("Fields with CI data:", length(fields_with_ci), "\n") cat("Fields with harvest records:", length(unique(harvest_data_filtered$field)), "\n") cat("Total harvest events:", nrow(harvest_data_filtered), "\n\n") # ============================================================================ # DETECTION FUNCTION # ============================================================================ detect_harvest_on_date <- function(field_ts, check_date, last_harvest_date, config = CONFIG) { # Check if harvest can be detected on a specific date # # Args: # field_ts: Daily CI time series for field # check_date: Date to check for harvest detection # last_harvest_date: Previous harvest date (to calculate field age) # config: Detection parameters # # Returns: # List with detection status and metrics # Get CI value on check date current_ci <- field_ts %>% filter(date == check_date) %>% pull(mean_ci) if (length(current_ci) == 0) { return(list( detected = FALSE, reason = "no_data", current_ci = NA, lookback_ci = NA, ci_drop = NA, field_age = NA, consecutive_days_low = 0 )) } # Check field age field_age <- as.numeric(check_date - last_harvest_date) if (field_age < config$min_field_age_days) { return(list( detected = FALSE, reason = "too_young", current_ci = current_ci, lookback_ci = NA, ci_drop = NA, field_age = field_age, consecutive_days_low = 0 )) } # Get lookback period CI (mean of 7-14 days ago) lookback_start <- check_date - config$lookback_days_max lookback_end <- check_date - config$lookback_days_min lookback_ci <- field_ts %>% filter(date >= lookback_start, date <= lookback_end) %>% summarise(mean_ci = mean(mean_ci, na.rm = TRUE)) %>% pull(mean_ci) if (is.na(lookback_ci) || length(lookback_ci) == 0) { return(list( detected = FALSE, reason = "no_lookback_data", current_ci = current_ci, lookback_ci = NA, ci_drop = NA, field_age = field_age, consecutive_days_low = 0 )) } # Calculate CI drop ci_drop <- lookback_ci - current_ci # Check consecutive days below threshold consecutive_days <- 0 for (i in 0:config$confirmation_days) { test_date <- check_date - i test_ci <- field_ts %>% filter(date == test_date) %>% pull(mean_ci) if (length(test_ci) > 0 && test_ci < config$ci_threshold) { consecutive_days <- consecutive_days + 1 } else { break } } # Detection logic detected <- FALSE confidence <- "none" reason <- "no_trigger" # Check conditions below_threshold <- current_ci < config$ci_threshold strong_drop <- ci_drop >= config$drop_threshold_high moderate_drop <- ci_drop >= config$drop_threshold_low if (below_threshold && strong_drop) { detected <- TRUE reason <- "strong_drop" if (consecutive_days >= 3) { confidence <- "confirmed" } else if (consecutive_days >= 2) { confidence <- "likely" } else { confidence <- "possible" } } else if (below_threshold && moderate_drop) { if (consecutive_days >= 2) { detected <- TRUE reason <- "moderate_drop_confirmed" confidence <- "likely" } else { detected <- TRUE reason <- "moderate_drop" confidence <- "possible" } } else if (below_threshold) { reason <- "below_threshold_no_drop" } else if (strong_drop) { reason <- "strong_drop_above_threshold" } return(list( detected = detected, reason = reason, confidence = confidence, current_ci = current_ci, lookback_ci = lookback_ci, ci_drop = ci_drop, field_age = field_age, consecutive_days_low = consecutive_days, below_threshold = below_threshold, strong_drop = strong_drop )) } # ============================================================================ # VALIDATION: TEST AROUND KNOWN HARVEST DATES # ============================================================================ cat("=== TESTING DETECTION AROUND KNOWN HARVEST DATES ===\n\n") # Start with first field test_field <- fields_with_ci[1] cat("Testing field:", test_field, "\n\n") # Get field's time series and harvests field_ts <- time_series_daily %>% filter(field_id == test_field) field_harvests <- harvest_data_filtered %>% filter(field == test_field) %>% arrange(season_end) cat("Field has", nrow(field_harvests), "recorded harvest events\n") cat("CI observations:", nrow(field_ts), "days from", min(field_ts$date), "to", max(field_ts$date), "\n\n") # Test each harvest event validation_results <- list() for (h in 1:nrow(field_harvests)) { actual_harvest <- field_harvests$season_end[h] # Get previous harvest for field age calculation if (h == 1) { last_harvest <- field_harvests$season_start[h] - 365 # Assume ~1 year before first record } else { last_harvest <- field_harvests$season_end[h-1] } cat("\n--- Harvest Event", h, ": ", as.character(actual_harvest), " ---\n") cat("Field age at harvest:", round(as.numeric(actual_harvest - last_harvest)), "days\n\n") # Test ±14 days around actual harvest test_dates <- seq.Date( from = actual_harvest - CONFIG$test_window_days, to = actual_harvest + CONFIG$test_window_days, by = "day" ) # Run detection for each test date daily_results <- data.frame() for (i in 1:length(test_dates)) { test_date <- test_dates[i] result <- detect_harvest_on_date(field_ts, test_date, last_harvest) daily_results <- rbind(daily_results, data.frame( test_date = test_date, days_from_actual = as.numeric(test_date - actual_harvest), detected = result$detected, confidence = ifelse(is.null(result$confidence) || length(result$confidence) == 0, "none", result$confidence), reason = result$reason, current_ci = result$current_ci, lookback_ci = result$lookback_ci, ci_drop = result$ci_drop, consecutive_days = result$consecutive_days_low, stringsAsFactors = FALSE )) } # Find first detection first_detection <- daily_results %>% filter(detected == TRUE) %>% arrange(test_date) %>% slice(1) if (nrow(first_detection) > 0) { cat("✓ First detection:", as.character(first_detection$test_date), "(", first_detection$days_from_actual, "days from actual harvest )\n") cat(" Confidence:", first_detection$confidence, "\n") cat(" CI drop:", round(first_detection$ci_drop, 2), "\n\n") } else { cat("✗ No detection within test window\n\n") } # Print detailed daily table cat("Day-by-Day Detection Results:\n") cat(sprintf("%-12s | %-15s | %8s | %10s | %10s | %8s | %8s | %15s\n", "Date", "Days from Actual", "Detected", "Confidence", "Drop (1.0)", "Drop (2.0)", "CI", "Reason")) cat(paste(rep("-", 110), collapse = ""), "\n") for (i in 1:nrow(daily_results)) { row <- daily_results[i, ] # Check both thresholds explicitly drop_1 <- ifelse(!is.na(row$ci_drop) && row$ci_drop >= 1.0, "YES", "NO") drop_2 <- ifelse(!is.na(row$ci_drop) && row$ci_drop >= 2.0, "YES", "NO") cat(sprintf("%-12s | %+15d | %8s | %10s | %10s | %10s | %8.2f | %15s\n", as.character(row$test_date), row$days_from_actual, ifelse(row$detected, "YES", "NO"), row$confidence, drop_1, drop_2, ifelse(is.na(row$current_ci), NA, row$current_ci), substr(row$reason, 1, 15))) } cat("\n") # Store results validation_results[[h]] <- list( harvest_id = h, actual_date = actual_harvest, daily_results = daily_results, first_detection = first_detection ) } # ============================================================================ # SUMMARY STATISTICS # ============================================================================ cat("\n\n=== VALIDATION SUMMARY ===\n\n") # Create comprehensive summary showing WHEN detection happens detection_timing_table <- data.frame() for (i in 1:length(validation_results)) { result <- validation_results[[i]] actual_date <- result$actual_date # Get detections in key time windows daily <- result$daily_results # Check specific days day_minus_14 <- daily %>% filter(days_from_actual == -14) %>% pull(detected) day_minus_7 <- daily %>% filter(days_from_actual == -7) %>% pull(detected) day_minus_3 <- daily %>% filter(days_from_actual == -3) %>% pull(detected) day_minus_1 <- daily %>% filter(days_from_actual == -1) %>% pull(detected) day_0 <- daily %>% filter(days_from_actual == 0) %>% pull(detected) day_plus_1 <- daily %>% filter(days_from_actual == 1) %>% pull(detected) day_plus_3 <- daily %>% filter(days_from_actual == 3) %>% pull(detected) day_plus_7 <- daily %>% filter(days_from_actual == 7) %>% pull(detected) day_plus_14 <- daily %>% filter(days_from_actual == 14) %>% pull(detected) # First detection info first_det <- result$first_detection detection_timing_table <- rbind(detection_timing_table, data.frame( harvest_event = i, actual_date = actual_date, detected_minus_14d = ifelse(length(day_minus_14) > 0 && day_minus_14, "YES", "NO"), detected_minus_7d = ifelse(length(day_minus_7) > 0 && day_minus_7, "YES", "NO"), detected_minus_3d = ifelse(length(day_minus_3) > 0 && day_minus_3, "YES", "NO"), detected_minus_1d = ifelse(length(day_minus_1) > 0 && day_minus_1, "YES", "NO"), detected_day_0 = ifelse(length(day_0) > 0 && day_0, "YES", "NO"), detected_plus_1d = ifelse(length(day_plus_1) > 0 && day_plus_1, "YES", "NO"), detected_plus_3d = ifelse(length(day_plus_3) > 0 && day_plus_3, "YES", "NO"), detected_plus_7d = ifelse(length(day_plus_7) > 0 && day_plus_7, "YES", "NO"), detected_plus_14d = ifelse(length(day_plus_14) > 0 && day_plus_14, "YES", "NO"), first_detection_day = ifelse(nrow(first_det) > 0, first_det$days_from_actual, NA), stringsAsFactors = FALSE )) } cat("Detection Timing Table (when does system flag harvest?):\n\n") print(detection_timing_table) cat("\n\nAcceptable Detection Window Analysis:\n") cat("(Harvest day ±1 day is realistic detection window)\n\n") acceptable_window <- detection_timing_table %>% mutate( detected_in_window = detected_minus_1d == "YES" | detected_day_0 == "YES" | detected_plus_1d == "YES" ) cat("Harvests detected within ±1 day of actual:", sum(acceptable_window$detected_in_window), "/", nrow(acceptable_window), "\n") cat("Accuracy within realistic window:", round(100 * mean(acceptable_window$detected_in_window), 1), "%\n\n") cat("False Early Detections (>3 days before harvest):\n") early_false <- sum(detection_timing_table$detected_minus_7d == "YES" | detection_timing_table$detected_minus_14d == "YES", na.rm = TRUE) cat(" Count:", early_false, "\n") cat(" These are likely detecting decline phase, not actual harvest\n\n") # Original summary for comparison detection_summary <- data.frame() for (i in 1:length(validation_results)) { result <- validation_results[[i]] if (nrow(result$first_detection) > 0) { detection_summary <- rbind(detection_summary, data.frame( harvest_event = i, actual_date = result$actual_date, detected_date = result$first_detection$test_date, days_offset = result$first_detection$days_from_actual, confidence = result$first_detection$confidence, ci_drop = result$first_detection$ci_drop )) } else { detection_summary <- rbind(detection_summary, data.frame( harvest_event = i, actual_date = result$actual_date, detected_date = NA, days_offset = NA, confidence = "not_detected", ci_drop = NA )) } } print(detection_summary) cat("\n\nDetection Performance:\n") cat(" Total harvests tested:", nrow(detection_summary), "\n") cat(" Successfully detected:", sum(!is.na(detection_summary$detected_date)), "\n") cat(" Detection rate:", round(100 * sum(!is.na(detection_summary$detected_date)) / nrow(detection_summary), 1), "%\n\n") if (sum(!is.na(detection_summary$days_offset)) > 0) { cat("Detection Timing:\n") cat(" Average days from actual:", round(mean(detection_summary$days_offset, na.rm = TRUE), 1), "\n") cat(" Median days from actual:", round(median(detection_summary$days_offset, na.rm = TRUE), 1), "\n") cat(" Detected before harvest:", sum(detection_summary$days_offset < 0, na.rm = TRUE), "\n") cat(" Detected on harvest day:", sum(detection_summary$days_offset == 0, na.rm = TRUE), "\n") cat(" Detected after harvest:", sum(detection_summary$days_offset > 0, na.rm = TRUE), "\n") } cat("\n=== TEST COMPLETE ===\n") cat("\nNext steps:\n") cat("1. Review detection timing and adjust thresholds if needed\n") cat("2. Expand to all fields in dataset\n") cat("3. Analyze which configuration gives best early detection\n")