# Deep analysis of CI patterns ±30 days around actual harvest dates # Goal: Find the exact signature of harvest - decline, bottom, stabilization suppressPackageStartupMessages({ library(readxl) library(dplyr) library(tidyr) library(lubridate) library(here) library(ggplot2) }) project_dir <- "esa" assign("project_dir", project_dir, envir = .GlobalEnv) source(here("r_app", "parameters_project.R")) # Read daily CI data ci_rds_file <- here("laravel_app/storage/app", project_dir, "Data/extracted_ci/cumulative_vals/All_pivots_Cumulative_CI_quadrant_year_v2.rds") ci_data_raw <- readRDS(ci_rds_file) %>% ungroup() daily_ci <- ci_data_raw %>% mutate(date = as.Date(Date)) %>% select(field_id = field, date, ci = FitData) %>% arrange(field_id, date) # Read actual harvest data harvest_actual <- read_excel('laravel_app/storage/app/esa/Data/harvest.xlsx') %>% mutate( season_start = as.Date(season_start), season_end = as.Date(season_end) ) %>% filter(!is.na(season_end)) cat("=== ANALYZING CI PATTERNS AROUND ACTUAL HARVEST DATES ===\n\n") # For each harvest, get ±30 days of data harvest_windows <- list() for (i in 1:nrow(harvest_actual)) { harvest <- harvest_actual[i, ] field <- harvest$field harvest_date <- harvest$season_end # Get CI data ±30 days window_data <- daily_ci %>% filter(field_id == field, date >= (harvest_date - 30), date <= (harvest_date + 30)) %>% arrange(date) %>% mutate( day_offset = as.numeric(date - harvest_date), # Negative = before, positive = after # Calculate daily changes ci_change = ci - lag(ci), ci_change_3d = ci - lag(ci, 3), ci_change_7d = ci - lag(ci, 7), # Calculate acceleration (rate of change of change) ci_acceleration = ci_change - lag(ci_change), # Rolling statistics ci_mean_7d = zoo::rollmean(ci, k = 7, fill = NA, align = "center"), ci_sd_7d = zoo::rollapply(ci, width = 7, FUN = sd, fill = NA, align = "center"), ci_min_7d = zoo::rollapply(ci, width = 7, FUN = min, fill = NA, align = "center"), # Detect stable periods (low variability) is_stable = !is.na(ci_sd_7d) & ci_sd_7d < 0.3 & ci < 2.5 ) if (nrow(window_data) > 0) { window_data$field_id <- field window_data$harvest_date <- harvest_date window_data$harvest_id <- i harvest_windows[[i]] <- window_data } } all_windows <- bind_rows(harvest_windows) cat(sprintf("Analyzed %d harvest events with ±30 day windows\n\n", length(harvest_windows))) # ============================================================================ # ANALYSIS 1: What happens at the exact harvest date? # ============================================================================ cat("=== ANALYSIS 1: CI AT HARVEST DATE (day 0) ===\n") harvest_day_stats <- all_windows %>% filter(day_offset == 0) %>% summarise( count = n(), mean_ci = mean(ci, na.rm = TRUE), median_ci = median(ci, na.rm = TRUE), sd_ci = sd(ci, na.rm = TRUE), min_ci = min(ci, na.rm = TRUE), max_ci = max(ci, na.rm = TRUE), q25 = quantile(ci, 0.25, na.rm = TRUE), q75 = quantile(ci, 0.75, na.rm = TRUE) ) print(harvest_day_stats) # ============================================================================ # ANALYSIS 2: When is the absolute minimum CI? # ============================================================================ cat("\n=== ANALYSIS 2: WHEN DOES CI REACH MINIMUM? ===\n") min_ci_timing <- all_windows %>% group_by(harvest_id, field_id, harvest_date) %>% summarise( min_ci_value = min(ci, na.rm = TRUE), min_ci_day = day_offset[which.min(ci)], .groups = "drop" ) cat(sprintf("\nWhen does MINIMUM CI occur relative to harvest date:\n")) cat(sprintf(" Mean offset: %.1f days (%.1f = before harvest, + = after)\n", mean(min_ci_timing$min_ci_day, na.rm = TRUE), mean(min_ci_timing$min_ci_day, na.rm = TRUE))) cat(sprintf(" Median offset: %.1f days\n", median(min_ci_timing$min_ci_day, na.rm = TRUE))) cat(sprintf(" Range: %.0f to %.0f days\n", min(min_ci_timing$min_ci_day, na.rm = TRUE), max(min_ci_timing$min_ci_day, na.rm = TRUE))) timing_distribution <- min_ci_timing %>% mutate(timing_category = case_when( min_ci_day < -7 ~ "Before harvest (>7d early)", min_ci_day >= -7 & min_ci_day < 0 ~ "Just before harvest (0-7d early)", min_ci_day == 0 ~ "On harvest date", min_ci_day > 0 & min_ci_day <= 7 ~ "Just after harvest (0-7d late)", min_ci_day > 7 ~ "After harvest (>7d late)" )) %>% count(timing_category) print(timing_distribution) # ============================================================================ # ANALYSIS 3: Decline rate before harvest # ============================================================================ cat("\n=== ANALYSIS 3: DECLINE PATTERN BEFORE HARVEST ===\n") decline_stats <- all_windows %>% filter(day_offset >= -30 & day_offset < 0) %>% group_by(week_before = ceiling(abs(day_offset) / 7)) %>% summarise( mean_ci = mean(ci, na.rm = TRUE), mean_daily_change = mean(ci_change, na.rm = TRUE), mean_7d_change = mean(ci_change_7d, na.rm = TRUE), count = n(), .groups = "drop" ) %>% arrange(desc(week_before)) cat("\nCI decline by week before harvest:\n") print(decline_stats) # ============================================================================ # ANALYSIS 4: Stabilization after harvest # ============================================================================ cat("\n=== ANALYSIS 4: WHEN DOES CI STABILIZE (stop declining)? ===\n") stabilization <- all_windows %>% filter(day_offset >= 0 & day_offset <= 30) %>% group_by(day_offset) %>% summarise( mean_ci = mean(ci, na.rm = TRUE), sd_ci = sd(ci, na.rm = TRUE), pct_stable = 100 * mean(is_stable, na.rm = TRUE), mean_daily_change = mean(ci_change, na.rm = TRUE), .groups = "drop" ) cat("\nPost-harvest stabilization (first 14 days):\n") print(stabilization %>% filter(day_offset <= 14)) # Find first day where >50% of fields show stable CI first_stable_day <- stabilization %>% filter(pct_stable > 50) %>% summarise(first_day = min(day_offset, na.rm = TRUE)) cat(sprintf("\n>50%% of fields show stable CI by day +%.0f after harvest\n", first_stable_day$first_day)) # ============================================================================ # ANALYSIS 5: Threshold crossings # ============================================================================ cat("\n=== ANALYSIS 5: THRESHOLD CROSSINGS BEFORE HARVEST ===\n") thresholds <- c(3.0, 2.5, 2.0, 1.8, 1.5) threshold_stats <- lapply(thresholds, function(thresh) { crossings <- all_windows %>% filter(day_offset < 0) %>% group_by(harvest_id) %>% summarise( first_below = min(day_offset[ci < thresh], na.rm = TRUE), .groups = "drop" ) %>% filter(is.finite(first_below)) if (nrow(crossings) > 0) { data.frame( threshold = thresh, n_crossed = nrow(crossings), mean_days_before = mean(abs(crossings$first_below)), median_days_before = median(abs(crossings$first_below)), pct_crossed = 100 * nrow(crossings) / length(unique(all_windows$harvest_id)) ) } else { data.frame(threshold = thresh, n_crossed = 0, mean_days_before = NA, median_days_before = NA, pct_crossed = 0) } }) %>% bind_rows() print(threshold_stats) # ============================================================================ # VISUALIZATION # ============================================================================ cat("\n=== CREATING VISUALIZATION ===\n") # Average CI pattern across all harvests avg_pattern <- all_windows %>% group_by(day_offset) %>% summarise( mean_ci = mean(ci, na.rm = TRUE), median_ci = median(ci, na.rm = TRUE), q25_ci = quantile(ci, 0.25, na.rm = TRUE), q75_ci = quantile(ci, 0.75, na.rm = TRUE), sd_ci = sd(ci, na.rm = TRUE), .groups = "drop" ) png("harvest_ci_pattern_analysis.png", width = 1400, height = 900, res = 120) par(mfrow = c(2, 2)) # Plot 1: Average CI pattern plot(avg_pattern$day_offset, avg_pattern$mean_ci, type = "l", lwd = 2, xlab = "Days from harvest", ylab = "CI", main = "Average CI Pattern Around Harvest", ylim = c(0, max(avg_pattern$q75_ci, na.rm = TRUE))) polygon(c(avg_pattern$day_offset, rev(avg_pattern$day_offset)), c(avg_pattern$q25_ci, rev(avg_pattern$q75_ci)), col = rgb(0, 0, 1, 0.2), border = NA) abline(v = 0, col = "red", lty = 2, lwd = 2) abline(h = c(1.5, 2.0, 2.5), col = c("blue", "orange", "green"), lty = 3) legend("topright", legend = c("Mean CI", "Q25-Q75", "Harvest date", "Thresholds 1.5, 2.0, 2.5"), lwd = c(2, 8, 2, 1), col = c("black", rgb(0,0,1,0.2), "red", "blue")) # Plot 2: Daily change rate avg_change <- all_windows %>% filter(!is.na(ci_change)) %>% group_by(day_offset) %>% summarise(mean_change = mean(ci_change, na.rm = TRUE), .groups = "drop") plot(avg_change$day_offset, avg_change$mean_change, type = "l", lwd = 2, xlab = "Days from harvest", ylab = "Daily CI change", main = "Rate of CI Change") abline(v = 0, col = "red", lty = 2) abline(h = 0, col = "gray", lty = 3) # Plot 3: Minimum CI timing distribution hist(min_ci_timing$min_ci_day, breaks = 20, xlab = "Day offset when minimum CI occurs", main = "When Does CI Reach Minimum?", col = "lightblue") abline(v = 0, col = "red", lwd = 2, lty = 2) abline(v = median(min_ci_timing$min_ci_day, na.rm = TRUE), col = "blue", lwd = 2) # Plot 4: Threshold crossing timing barplot(threshold_stats$median_days_before, names.arg = threshold_stats$threshold, xlab = "CI Threshold", ylab = "Median days before harvest", main = "When Are Thresholds Crossed?", col = "lightgreen") dev.off() cat("\nPlot saved: harvest_ci_pattern_analysis.png\n") # ============================================================================ # RECOMMENDATIONS # ============================================================================ cat("\n=== RECOMMENDATIONS FOR HARVEST DETECTION ===\n\n") # Find best threshold based on timing best_threshold <- threshold_stats %>% filter(median_days_before >= 7 & median_days_before <= 14) %>% arrange(desc(pct_crossed)) if (nrow(best_threshold) > 0) { cat(sprintf("BEST EARLY WARNING THRESHOLD: CI < %.1f\n", best_threshold$threshold[1])) cat(sprintf(" - Crossed %.0f%% of the time\n", best_threshold$pct_crossed[1])) cat(sprintf(" - Median %.1f days before harvest\n", best_threshold$median_days_before[1])) cat(sprintf(" - MESSAGE: 'Harvest expected within 7-14 days'\n\n")) } cat("HARVEST COMPLETION SIGNAL:\n") cat(sprintf(" - Look for stabilization: SD < 0.3 for 7 days\n")) cat(sprintf(" - Typically occurs around day +%.0f after reported harvest\n", first_stable_day$first_day)) cat(sprintf(" - MESSAGE: 'Harvest likely completed in recent days'\n\n")) cat("SHARP DECLINE DETECTION:\n") sharp_decline_threshold <- -0.5 # CI dropping >0.5 per day sharp_declines <- all_windows %>% filter(!is.na(ci_change) & ci_change < sharp_decline_threshold) %>% group_by(day_offset) %>% summarise(count = n(), .groups = "drop") %>% filter(day_offset < 0) %>% arrange(desc(count)) if (nrow(sharp_declines) > 0) { cat(sprintf(" - Sharp drops (>0.5/day) most common at day %.0f before harvest\n", sharp_declines$day_offset[1])) cat(sprintf(" - Can trigger immediate alert: 'Sharp decline detected - investigate field'\n")) }