# Analyze timing between CI threshold crossings and actual harvest dates # Goal: Determine how soon after CI drops below threshold the harvest actually occurs suppressPackageStartupMessages({ library(readxl) library(dplyr) library(tidyr) library(lubridate) library(here) library(ggplot2) }) # Set project directory project_dir <- "esa" assign("project_dir", project_dir, envir = .GlobalEnv) source(here("r_app", "parameters_project.R")) # Read daily CI data ci_rds_file <- here("laravel_app/storage/app", project_dir, "Data/extracted_ci/cumulative_vals/All_pivots_Cumulative_CI_quadrant_year_v2.rds") ci_data_raw <- readRDS(ci_rds_file) %>% ungroup() time_series_daily <- ci_data_raw %>% mutate(date = as.Date(Date)) %>% select(field_id = field, date, ci = FitData) %>% arrange(field_id, date) # Read actual harvest data harvest_actual <- read_excel('laravel_app/storage/app/esa/Data/harvest.xlsx') %>% mutate( season_start = as.Date(season_start), season_end = as.Date(season_end) ) %>% filter(!is.na(season_end)) cat("=== ANALYZING CI THRESHOLD CROSSING TIMING ===\n\n") # For each actual harvest, find when CI first dropped below various thresholds thresholds <- c(3.0, 2.5, 2.0, 1.8) results <- list() for (i in 1:nrow(harvest_actual)) { harvest <- harvest_actual[i, ] field <- harvest$field harvest_date <- harvest$season_end # Get CI data for this field in the year before harvest field_data <- time_series_daily %>% filter(field_id == field, date >= (harvest_date - 365), date <= harvest_date) %>% arrange(date) if (nrow(field_data) == 0) next # For each threshold, find LAST crossing date (working backward from harvest) # This finds the mature→harvest transition, not the previous cycle's harvest threshold_crossings <- sapply(thresholds, function(threshold) { # Find LAST period where CI was high (>3.5), then dropped below threshold # Work backward from harvest date last_mature_idx <- NA for (j in nrow(field_data):1) { if (!is.na(field_data$ci[j]) && field_data$ci[j] > 3.5) { last_mature_idx <- j break } } # If no mature period found, skip if (is.na(last_mature_idx)) return(NA) # Now find first crossing below threshold AFTER the mature period for (j in last_mature_idx:(nrow(field_data) - 2)) { if (!is.na(field_data$ci[j]) && !is.na(field_data$ci[j+1]) && !is.na(field_data$ci[j+2]) && field_data$ci[j] < threshold && field_data$ci[j+1] < threshold && field_data$ci[j+2] < threshold) { return(as.character(field_data$date[j])) } } return(NA) }) result_row <- data.frame( field = field, harvest_date = harvest_date, ci_at_harvest = field_data$ci[nrow(field_data)] ) for (k in 1:length(thresholds)) { threshold <- thresholds[k] crossing_date <- as.Date(threshold_crossings[k]) if (!is.na(crossing_date)) { days_before_harvest <- as.numeric(harvest_date - crossing_date) result_row[[paste0("first_below_", threshold)]] <- as.character(crossing_date) result_row[[paste0("days_before_", threshold)]] <- days_before_harvest } else { result_row[[paste0("first_below_", threshold)]] <- NA result_row[[paste0("days_before_", threshold)]] <- NA } } results[[i]] <- result_row } timing_analysis <- bind_rows(results) # Print summary statistics cat("\n=== TIMING STATISTICS: Days from threshold crossing to actual harvest ===\n\n") for (threshold in thresholds) { days_col <- paste0("days_before_", threshold) days_before <- timing_analysis[[days_col]] days_before <- days_before[!is.na(days_before)] if (length(days_before) > 0) { cat(sprintf("CI < %.1f threshold:\n", threshold)) cat(sprintf(" Valid cases: %d/%d (%.1f%%)\n", length(days_before), nrow(timing_analysis), 100 * length(days_before) / nrow(timing_analysis))) cat(sprintf(" Mean: %.1f days before harvest\n", mean(days_before))) cat(sprintf(" Median: %.1f days before harvest\n", median(days_before))) cat(sprintf(" Range: %.1f to %.1f days\n", min(days_before), max(days_before))) cat(sprintf(" Q1-Q3: %.1f to %.1f days\n", quantile(days_before, 0.25), quantile(days_before, 0.75))) # Count how many harvests occur within specific time windows after crossing within_7d <- sum(days_before >= 0 & days_before <= 7) within_14d <- sum(days_before >= 0 & days_before <= 14) within_21d <- sum(days_before >= 0 & days_before <= 21) within_30d <- sum(days_before >= 0 & days_before <= 30) cat(sprintf(" Harvest timing after crossing:\n")) cat(sprintf(" 0-7 days: %d (%.1f%%)\n", within_7d, 100*within_7d/length(days_before))) cat(sprintf(" 0-14 days: %d (%.1f%%)\n", within_14d, 100*within_14d/length(days_before))) cat(sprintf(" 0-21 days: %d (%.1f%%)\n", within_21d, 100*within_21d/length(days_before))) cat(sprintf(" 0-30 days: %d (%.1f%%)\n", within_30d, 100*within_30d/length(days_before))) cat("\n") } else { cat(sprintf("CI < %.1f threshold: No valid crossings found\n\n", threshold)) } } # Show detailed table for fields with mismatches cat("\n=== DETAILED TIMING BY FIELD ===\n") # Get column names dynamically days_cols <- grep("days_before_", names(timing_analysis), value = TRUE) select_cols <- c("field", "harvest_date", "ci_at_harvest", days_cols[1:min(2, length(days_cols))]) print(timing_analysis %>% select(all_of(select_cols)) %>% arrange(field, harvest_date), n = 100) # Create visualization cat("\n=== Creating timing distribution plot ===\n") timing_long <- timing_analysis %>% select(field, harvest_date, starts_with("days_before_")) %>% pivot_longer(cols = starts_with("days_before_"), names_to = "threshold", values_to = "days_before") %>% filter(!is.na(days_before)) %>% mutate(threshold = gsub("days_before_", "CI < ", threshold)) png("timing_threshold_to_harvest.png", width = 1200, height = 800, res = 120) ggplot(timing_long, aes(x = days_before, fill = threshold)) + geom_histogram(binwidth = 7, alpha = 0.7, position = "identity") + facet_wrap(~threshold, ncol = 1) + geom_vline(xintercept = c(7, 14, 21), linetype = "dashed", color = "red", alpha = 0.5) + labs( title = "Time from CI Threshold Crossing to Actual Harvest", subtitle = "How many days AFTER CI drops below threshold does harvest actually occur?", x = "Days from threshold crossing to harvest", y = "Count of harvest events", caption = "Dashed lines at 7, 14, 21 days" ) + theme_minimal() + theme(legend.position = "none") dev.off() cat("\nPlot saved to: timing_threshold_to_harvest.png\n")