SmartCane/analyze_drop_patterns.R
2026-01-06 14:17:37 +01:00

198 lines
6.6 KiB
R

# Analyze CI drop patterns to distinguish harvest from anomalies
# Goal: Identify characteristics of true harvest drops vs single-day noise
suppressPackageStartupMessages({
library(readxl)
library(dplyr)
library(tidyr)
library(lubridate)
library(here)
library(ggplot2)
})
project_dir <- "esa"
assign("project_dir", project_dir, envir = .GlobalEnv)
source(here("r_app", "parameters_project.R"))
# Read daily CI data
ci_rds_file <- here("laravel_app/storage/app", project_dir, "Data/extracted_ci/cumulative_vals/All_pivots_Cumulative_CI_quadrant_year_v2.rds")
ci_data_raw <- readRDS(ci_rds_file) %>% ungroup()
time_series_daily <- ci_data_raw %>%
mutate(date = as.Date(Date)) %>%
select(field_id = field, date, ci = FitData) %>%
arrange(field_id, date) %>%
group_by(field_id) %>%
mutate(
# Calculate changes
ci_lag1 = lag(ci, 1),
ci_lag2 = lag(ci, 2),
ci_lead1 = lead(ci, 1),
ci_lead2 = lead(ci, 2),
ci_lead3 = lead(ci, 3),
# Drop magnitude
drop_1day = ci_lag1 - ci,
drop_2day = ci_lag2 - ci,
# Recovery after drop
recovery_1day = ci_lead1 - ci,
recovery_2day = ci_lead2 - ci,
recovery_3day = ci_lead3 - ci,
# Is this a single-day anomaly?
is_spike_drop = (ci < 2.0 & ci_lag1 > 3.0 & ci_lead1 > 3.0)
) %>%
ungroup()
# Read actual harvest data
harvest_actual <- read_excel('laravel_app/storage/app/esa/Data/harvest.xlsx') %>%
mutate(
season_start = as.Date(season_start),
season_end = as.Date(season_end)
) %>%
filter(!is.na(season_end))
cat("=== ANALYZING CI DROP PATTERNS ===\n\n")
# Find all instances where CI drops below 2.0
all_drops <- time_series_daily %>%
filter(ci < 2.0, ci_lag1 > 2.0) %>% # First day below 2.0
select(field_id, date, ci, ci_lag1, drop_1day,
ci_lead1, ci_lead2, ci_lead3,
recovery_1day, recovery_2day, recovery_3day)
# Classify drops based on what happens next
drops_classified <- all_drops %>%
mutate(
drop_type = case_when(
# Spike: drops but recovers to >3.0 within 3 days
!is.na(ci_lead1) & ci_lead1 > 3.0 ~ "SPIKE (1-day anomaly)",
!is.na(ci_lead2) & ci_lead2 > 3.0 ~ "SPIKE (2-day anomaly)",
!is.na(ci_lead3) & ci_lead3 > 3.0 ~ "SPIKE (3-day anomaly)",
# Sustained: stays below 2.5 for at least 3 days
!is.na(ci_lead1) & !is.na(ci_lead2) & !is.na(ci_lead3) &
ci_lead1 < 2.5 & ci_lead2 < 2.5 & ci_lead3 < 2.5 ~ "SUSTAINED (likely harvest)",
TRUE ~ "UNCLEAR (insufficient data)"
),
sharp_drop = drop_1day > 1.0 # Drop >1 CI point
)
cat("=== DROP TYPE DISTRIBUTION ===\n")
drop_summary <- drops_classified %>%
count(drop_type) %>%
mutate(percent = 100 * n / sum(n)) %>%
arrange(desc(n))
print(drop_summary)
cat("\n=== SHARP DROPS (>1.0 CI point) ===\n")
sharp_summary <- drops_classified %>%
filter(sharp_drop) %>%
count(drop_type) %>%
mutate(percent = 100 * n / sum(n))
print(sharp_summary)
# Match drops to actual harvests
cat("\n=== MATCHING DROPS TO ACTUAL HARVESTS ===\n")
drops_with_harvest <- drops_classified %>%
left_join(
harvest_actual %>%
select(field, actual_harvest_date = season_end),
by = c("field_id" = "field")
) %>%
filter(!is.na(actual_harvest_date)) %>%
mutate(
days_from_harvest = as.numeric(date - actual_harvest_date),
near_harvest = abs(days_from_harvest) <= 14,
timing_category = case_when(
days_from_harvest >= -7 & days_from_harvest <= 7 ~ "Within 1 week of harvest",
days_from_harvest >= -14 & days_from_harvest <= 14 ~ "Within 2 weeks of harvest",
days_from_harvest >= -21 & days_from_harvest <= 21 ~ "Within 3 weeks of harvest",
TRUE ~ "Far from harvest (>3 weeks)"
)
)
cat("\n=== DROP TYPES BY PROXIMITY TO ACTUAL HARVEST ===\n")
harvest_proximity_summary <- drops_with_harvest %>%
count(drop_type, timing_category) %>%
pivot_wider(names_from = timing_category, values_from = n, values_fill = 0)
print(harvest_proximity_summary)
# Key insight: What % of SUSTAINED drops are near harvest vs SPIKE drops?
cat("\n=== KEY INSIGHT: Are sustained drops near harvest? ===\n")
sustained_near_harvest <- drops_with_harvest %>%
filter(grepl("SUSTAINED", drop_type)) %>%
summarise(
total = n(),
near_harvest = sum(near_harvest),
percent_near = 100 * near_harvest / total
)
spike_near_harvest <- drops_with_harvest %>%
filter(grepl("SPIKE", drop_type)) %>%
summarise(
total = n(),
near_harvest = sum(near_harvest),
percent_near = 100 * near_harvest / total
)
cat("\nSUSTAINED drops (CI stays low):\n")
cat(sprintf(" Total: %d\n", sustained_near_harvest$total))
cat(sprintf(" Near harvest (±14d): %d (%.1f%%)\n",
sustained_near_harvest$near_harvest,
sustained_near_harvest$percent_near))
cat("\nSPIKE drops (CI recovers quickly):\n")
cat(sprintf(" Total: %d\n", spike_near_harvest$total))
cat(sprintf(" Near harvest (±14d): %d (%.1f%%)\n",
spike_near_harvest$near_harvest,
spike_near_harvest$percent_near))
# Analyze recovery patterns
cat("\n=== RECOVERY PATTERNS (how fast does CI bounce back?) ===\n")
recovery_stats <- drops_classified %>%
filter(!is.na(recovery_3day)) %>%
group_by(drop_type) %>%
summarise(
count = n(),
mean_recovery_1d = mean(recovery_1day, na.rm = TRUE),
mean_recovery_2d = mean(recovery_2day, na.rm = TRUE),
mean_recovery_3d = mean(recovery_3day, na.rm = TRUE),
median_recovery_1d = median(recovery_1day, na.rm = TRUE),
median_recovery_2d = median(recovery_2day, na.rm = TRUE),
median_recovery_3d = median(recovery_3day, na.rm = TRUE)
)
print(recovery_stats)
# Show examples of each type
cat("\n=== EXAMPLES: SPIKE (false alarm) ===\n")
print(drops_classified %>%
filter(drop_type == "SPIKE (1-day anomaly)") %>%
select(field_id, date, ci_lag1, ci, ci_lead1, drop_1day, recovery_1day) %>%
head(10), n = 10)
cat("\n=== EXAMPLES: SUSTAINED (likely harvest) ===\n")
print(drops_classified %>%
filter(drop_type == "SUSTAINED (likely harvest)") %>%
select(field_id, date, ci_lag1, ci, ci_lead1, ci_lead2, ci_lead3, drop_1day) %>%
head(10), n = 10)
# Recommendation
cat("\n=== RECOMMENDATION ===\n")
cat("To avoid false alarms from single-day spikes:\n")
cat("1. Require CI to stay below 2.0 for at least 3 consecutive days\n")
cat("2. Check that CI doesn't recover above 3.0 within next 3 days\n")
cat("3. Sharp drops (>1.0 CI) that sustain are strong harvest signals\n")
cat("4. Trade-off: Waiting 3 days for confirmation delays alert by 3 days\n")
cat(" - But eliminates false positives from cloud noise\n")
cat(" - Harvest still detected 4-11 days before actual event (median 7d)\n")