# Analyze CI drop patterns to distinguish harvest from anomalies # Goal: Identify characteristics of true harvest drops vs single-day noise suppressPackageStartupMessages({ library(readxl) library(dplyr) library(tidyr) library(lubridate) library(here) library(ggplot2) }) project_dir <- "esa" assign("project_dir", project_dir, envir = .GlobalEnv) source(here("r_app", "parameters_project.R")) # Read daily CI data ci_rds_file <- here("laravel_app/storage/app", project_dir, "Data/extracted_ci/cumulative_vals/All_pivots_Cumulative_CI_quadrant_year_v2.rds") ci_data_raw <- readRDS(ci_rds_file) %>% ungroup() time_series_daily <- ci_data_raw %>% mutate(date = as.Date(Date)) %>% select(field_id = field, date, ci = FitData) %>% arrange(field_id, date) %>% group_by(field_id) %>% mutate( # Calculate changes ci_lag1 = lag(ci, 1), ci_lag2 = lag(ci, 2), ci_lead1 = lead(ci, 1), ci_lead2 = lead(ci, 2), ci_lead3 = lead(ci, 3), # Drop magnitude drop_1day = ci_lag1 - ci, drop_2day = ci_lag2 - ci, # Recovery after drop recovery_1day = ci_lead1 - ci, recovery_2day = ci_lead2 - ci, recovery_3day = ci_lead3 - ci, # Is this a single-day anomaly? is_spike_drop = (ci < 2.0 & ci_lag1 > 3.0 & ci_lead1 > 3.0) ) %>% ungroup() # Read actual harvest data harvest_actual <- read_excel('laravel_app/storage/app/esa/Data/harvest.xlsx') %>% mutate( season_start = as.Date(season_start), season_end = as.Date(season_end) ) %>% filter(!is.na(season_end)) cat("=== ANALYZING CI DROP PATTERNS ===\n\n") # Find all instances where CI drops below 2.0 all_drops <- time_series_daily %>% filter(ci < 2.0, ci_lag1 > 2.0) %>% # First day below 2.0 select(field_id, date, ci, ci_lag1, drop_1day, ci_lead1, ci_lead2, ci_lead3, recovery_1day, recovery_2day, recovery_3day) # Classify drops based on what happens next drops_classified <- all_drops %>% mutate( drop_type = case_when( # Spike: drops but recovers to >3.0 within 3 days !is.na(ci_lead1) & ci_lead1 > 3.0 ~ "SPIKE (1-day anomaly)", !is.na(ci_lead2) & ci_lead2 > 3.0 ~ "SPIKE (2-day anomaly)", !is.na(ci_lead3) & ci_lead3 > 3.0 ~ "SPIKE (3-day anomaly)", # Sustained: stays below 2.5 for at least 3 days !is.na(ci_lead1) & !is.na(ci_lead2) & !is.na(ci_lead3) & ci_lead1 < 2.5 & ci_lead2 < 2.5 & ci_lead3 < 2.5 ~ "SUSTAINED (likely harvest)", TRUE ~ "UNCLEAR (insufficient data)" ), sharp_drop = drop_1day > 1.0 # Drop >1 CI point ) cat("=== DROP TYPE DISTRIBUTION ===\n") drop_summary <- drops_classified %>% count(drop_type) %>% mutate(percent = 100 * n / sum(n)) %>% arrange(desc(n)) print(drop_summary) cat("\n=== SHARP DROPS (>1.0 CI point) ===\n") sharp_summary <- drops_classified %>% filter(sharp_drop) %>% count(drop_type) %>% mutate(percent = 100 * n / sum(n)) print(sharp_summary) # Match drops to actual harvests cat("\n=== MATCHING DROPS TO ACTUAL HARVESTS ===\n") drops_with_harvest <- drops_classified %>% left_join( harvest_actual %>% select(field, actual_harvest_date = season_end), by = c("field_id" = "field") ) %>% filter(!is.na(actual_harvest_date)) %>% mutate( days_from_harvest = as.numeric(date - actual_harvest_date), near_harvest = abs(days_from_harvest) <= 14, timing_category = case_when( days_from_harvest >= -7 & days_from_harvest <= 7 ~ "Within 1 week of harvest", days_from_harvest >= -14 & days_from_harvest <= 14 ~ "Within 2 weeks of harvest", days_from_harvest >= -21 & days_from_harvest <= 21 ~ "Within 3 weeks of harvest", TRUE ~ "Far from harvest (>3 weeks)" ) ) cat("\n=== DROP TYPES BY PROXIMITY TO ACTUAL HARVEST ===\n") harvest_proximity_summary <- drops_with_harvest %>% count(drop_type, timing_category) %>% pivot_wider(names_from = timing_category, values_from = n, values_fill = 0) print(harvest_proximity_summary) # Key insight: What % of SUSTAINED drops are near harvest vs SPIKE drops? cat("\n=== KEY INSIGHT: Are sustained drops near harvest? ===\n") sustained_near_harvest <- drops_with_harvest %>% filter(grepl("SUSTAINED", drop_type)) %>% summarise( total = n(), near_harvest = sum(near_harvest), percent_near = 100 * near_harvest / total ) spike_near_harvest <- drops_with_harvest %>% filter(grepl("SPIKE", drop_type)) %>% summarise( total = n(), near_harvest = sum(near_harvest), percent_near = 100 * near_harvest / total ) cat("\nSUSTAINED drops (CI stays low):\n") cat(sprintf(" Total: %d\n", sustained_near_harvest$total)) cat(sprintf(" Near harvest (±14d): %d (%.1f%%)\n", sustained_near_harvest$near_harvest, sustained_near_harvest$percent_near)) cat("\nSPIKE drops (CI recovers quickly):\n") cat(sprintf(" Total: %d\n", spike_near_harvest$total)) cat(sprintf(" Near harvest (±14d): %d (%.1f%%)\n", spike_near_harvest$near_harvest, spike_near_harvest$percent_near)) # Analyze recovery patterns cat("\n=== RECOVERY PATTERNS (how fast does CI bounce back?) ===\n") recovery_stats <- drops_classified %>% filter(!is.na(recovery_3day)) %>% group_by(drop_type) %>% summarise( count = n(), mean_recovery_1d = mean(recovery_1day, na.rm = TRUE), mean_recovery_2d = mean(recovery_2day, na.rm = TRUE), mean_recovery_3d = mean(recovery_3day, na.rm = TRUE), median_recovery_1d = median(recovery_1day, na.rm = TRUE), median_recovery_2d = median(recovery_2day, na.rm = TRUE), median_recovery_3d = median(recovery_3day, na.rm = TRUE) ) print(recovery_stats) # Show examples of each type cat("\n=== EXAMPLES: SPIKE (false alarm) ===\n") print(drops_classified %>% filter(drop_type == "SPIKE (1-day anomaly)") %>% select(field_id, date, ci_lag1, ci, ci_lead1, drop_1day, recovery_1day) %>% head(10), n = 10) cat("\n=== EXAMPLES: SUSTAINED (likely harvest) ===\n") print(drops_classified %>% filter(drop_type == "SUSTAINED (likely harvest)") %>% select(field_id, date, ci_lag1, ci, ci_lead1, ci_lead2, ci_lead3, drop_1day) %>% head(10), n = 10) # Recommendation cat("\n=== RECOMMENDATION ===\n") cat("To avoid false alarms from single-day spikes:\n") cat("1. Require CI to stay below 2.0 for at least 3 consecutive days\n") cat("2. Check that CI doesn't recover above 3.0 within next 3 days\n") cat("3. Sharp drops (>1.0 CI) that sustain are strong harvest signals\n") cat("4. Trade-off: Waiting 3 days for confirmation delays alert by 3 days\n") cat(" - But eliminates false positives from cloud noise\n") cat(" - Harvest still detected 4-11 days before actual event (median 7d)\n")