SmartCane/analyze_drop_patterns.R

# Analyze CI drop patterns to distinguish harvest from anomalies
# Goal: Identify characteristics of true harvest drops vs single-day noise

suppressPackageStartupMessages({
  library(readxl)
  library(dplyr)
  library(tidyr)
  library(lubridate)
  library(here)
  library(ggplot2)
})

project_dir <- "esa"
assign("project_dir", project_dir, envir = .GlobalEnv)
source(here("r_app", "parameters_project.R"))

# Read daily CI data
ci_rds_file <- here("laravel_app/storage/app", project_dir, "Data/extracted_ci/cumulative_vals/All_pivots_Cumulative_CI_quadrant_year_v2.rds")
ci_data_raw <- readRDS(ci_rds_file) %>% ungroup()

time_series_daily <- ci_data_raw %>%
  mutate(date = as.Date(Date)) %>%
  select(field_id = field, date, ci = FitData) %>%
  arrange(field_id, date) %>%
  group_by(field_id) %>%
  mutate(
    # Calculate changes
    ci_lag1 = lag(ci, 1),
    ci_lag2 = lag(ci, 2),
    ci_lead1 = lead(ci, 1),
    ci_lead2 = lead(ci, 2),
    ci_lead3 = lead(ci, 3),

    # Drop magnitude
    drop_1day = ci_lag1 - ci,
    drop_2day = ci_lag2 - ci,

    # Recovery after drop
    recovery_1day = ci_lead1 - ci,
    recovery_2day = ci_lead2 - ci,
    recovery_3day = ci_lead3 - ci,

    # Is this a single-day anomaly?
    is_spike_drop = (ci < 2.0 & ci_lag1 > 3.0 & ci_lead1 > 3.0)
  ) %>%
  ungroup()

# Read actual harvest data
harvest_actual <- read_excel('laravel_app/storage/app/esa/Data/harvest.xlsx') %>%
  mutate(
    season_start = as.Date(season_start),
    season_end = as.Date(season_end)
  ) %>%
  filter(!is.na(season_end))

cat("=== ANALYZING CI DROP PATTERNS ===\n\n")

# Find all instances where CI drops below 2.0
all_drops <- time_series_daily %>%
  filter(ci < 2.0, ci_lag1 > 2.0) %>%  # First day below 2.0
  select(field_id, date, ci, ci_lag1, drop_1day,
         ci_lead1, ci_lead2, ci_lead3,
         recovery_1day, recovery_2day, recovery_3day)

# Classify drops based on what happens next
drops_classified <- all_drops %>%
  mutate(
    drop_type = case_when(
      # Spike: drops but recovers to >3.0 within 3 days
      !is.na(ci_lead1) & ci_lead1 > 3.0 ~ "SPIKE (1-day anomaly)",
      !is.na(ci_lead2) & ci_lead2 > 3.0 ~ "SPIKE (2-day anomaly)",
      !is.na(ci_lead3) & ci_lead3 > 3.0 ~ "SPIKE (3-day anomaly)",

      # Sustained: stays below 2.5 for at least 3 days
      !is.na(ci_lead1) & !is.na(ci_lead2) & !is.na(ci_lead3) &
        ci_lead1 < 2.5 & ci_lead2 < 2.5 & ci_lead3 < 2.5 ~ "SUSTAINED (likely harvest)",

      TRUE ~ "UNCLEAR (insufficient data)"
    ),

    sharp_drop = drop_1day > 1.0  # Drop >1 CI point
  )

cat("=== DROP TYPE DISTRIBUTION ===\n")
drop_summary <- drops_classified %>%
  count(drop_type) %>%
  mutate(percent = 100 * n / sum(n)) %>%
  arrange(desc(n))

print(drop_summary)

cat("\n=== SHARP DROPS (>1.0 CI point) ===\n")
sharp_summary <- drops_classified %>%
  filter(sharp_drop) %>%
  count(drop_type) %>%
  mutate(percent = 100 * n / sum(n))

print(sharp_summary)

# Match drops to actual harvests
cat("\n=== MATCHING DROPS TO ACTUAL HARVESTS ===\n")

drops_with_harvest <- drops_classified %>%
  left_join(
    harvest_actual %>%
      select(field, actual_harvest_date = season_end),
    by = c("field_id" = "field")
  ) %>%
  filter(!is.na(actual_harvest_date)) %>%
  mutate(
    days_from_harvest = as.numeric(date - actual_harvest_date),
    near_harvest = abs(days_from_harvest) <= 14,
    timing_category = case_when(
      days_from_harvest >= -7 & days_from_harvest <= 7 ~ "Within 1 week of harvest",
      days_from_harvest >= -14 & days_from_harvest <= 14 ~ "Within 2 weeks of harvest",
      days_from_harvest >= -21 & days_from_harvest <= 21 ~ "Within 3 weeks of harvest",
      TRUE ~ "Far from harvest (>3 weeks)"
    )
  )

cat("\n=== DROP TYPES BY PROXIMITY TO ACTUAL HARVEST ===\n")
harvest_proximity_summary <- drops_with_harvest %>%
  count(drop_type, timing_category) %>%
  pivot_wider(names_from = timing_category, values_from = n, values_fill = 0)

print(harvest_proximity_summary)

# Key insight: What % of SUSTAINED drops are near harvest vs SPIKE drops?
cat("\n=== KEY INSIGHT: Are sustained drops near harvest? ===\n")
sustained_near_harvest <- drops_with_harvest %>%
  filter(grepl("SUSTAINED", drop_type)) %>%
  summarise(
    total = n(),
    near_harvest = sum(near_harvest),
    percent_near = 100 * near_harvest / total
  )

spike_near_harvest <- drops_with_harvest %>%
  filter(grepl("SPIKE", drop_type)) %>%
  summarise(
    total = n(),
    near_harvest = sum(near_harvest),
    percent_near = 100 * near_harvest / total
  )

cat("\nSUSTAINED drops (CI stays low):\n")
cat(sprintf("  Total: %d\n", sustained_near_harvest$total))
cat(sprintf("  Near harvest (±14d): %d (%.1f%%)\n",
            sustained_near_harvest$near_harvest,
            sustained_near_harvest$percent_near))

cat("\nSPIKE drops (CI recovers quickly):\n")
cat(sprintf("  Total: %d\n", spike_near_harvest$total))
cat(sprintf("  Near harvest (±14d): %d (%.1f%%)\n",
            spike_near_harvest$near_harvest,
            spike_near_harvest$percent_near))

# Analyze recovery patterns
cat("\n=== RECOVERY PATTERNS (how fast does CI bounce back?) ===\n")

recovery_stats <- drops_classified %>%
  filter(!is.na(recovery_3day)) %>%
  group_by(drop_type) %>%
  summarise(
    count = n(),
    mean_recovery_1d = mean(recovery_1day, na.rm = TRUE),
    mean_recovery_2d = mean(recovery_2day, na.rm = TRUE),
    mean_recovery_3d = mean(recovery_3day, na.rm = TRUE),
    median_recovery_1d = median(recovery_1day, na.rm = TRUE),
    median_recovery_2d = median(recovery_2day, na.rm = TRUE),
    median_recovery_3d = median(recovery_3day, na.rm = TRUE)
  )

print(recovery_stats)

# Show examples of each type
cat("\n=== EXAMPLES: SPIKE (false alarm) ===\n")
print(drops_classified %>%
        filter(drop_type == "SPIKE (1-day anomaly)") %>%
        select(field_id, date, ci_lag1, ci, ci_lead1, drop_1day, recovery_1day) %>%
        head(10), n = 10)

cat("\n=== EXAMPLES: SUSTAINED (likely harvest) ===\n")
print(drops_classified %>%
        filter(drop_type == "SUSTAINED (likely harvest)") %>%
        select(field_id, date, ci_lag1, ci, ci_lead1, ci_lead2, ci_lead3, drop_1day) %>%
        head(10), n = 10)

# Recommendation
cat("\n=== RECOMMENDATION ===\n")
cat("To avoid false alarms from single-day spikes:\n")
cat("1. Require CI to stay below 2.0 for at least 3 consecutive days\n")
cat("2. Check that CI doesn't recover above 3.0 within next 3 days\n")
cat("3. Sharp drops (>1.0 CI) that sustain are strong harvest signals\n")
cat("4. Trade-off: Waiting 3 days for confirmation delays alert by 3 days\n")
cat("   - But eliminates false positives from cloud noise\n")
cat("   - Harvest still detected 4-11 days before actual event (median 7d)\n")