198 lines
6.6 KiB
R
198 lines
6.6 KiB
R
# Analyze CI drop patterns to distinguish harvest from anomalies
|
|
# Goal: Identify characteristics of true harvest drops vs single-day noise
|
|
|
|
suppressPackageStartupMessages({
|
|
library(readxl)
|
|
library(dplyr)
|
|
library(tidyr)
|
|
library(lubridate)
|
|
library(here)
|
|
library(ggplot2)
|
|
})
|
|
|
|
project_dir <- "esa"
|
|
assign("project_dir", project_dir, envir = .GlobalEnv)
|
|
source(here("r_app", "parameters_project.R"))
|
|
|
|
# Read daily CI data
|
|
ci_rds_file <- here("laravel_app/storage/app", project_dir, "Data/extracted_ci/cumulative_vals/All_pivots_Cumulative_CI_quadrant_year_v2.rds")
|
|
ci_data_raw <- readRDS(ci_rds_file) %>% ungroup()
|
|
|
|
time_series_daily <- ci_data_raw %>%
|
|
mutate(date = as.Date(Date)) %>%
|
|
select(field_id = field, date, ci = FitData) %>%
|
|
arrange(field_id, date) %>%
|
|
group_by(field_id) %>%
|
|
mutate(
|
|
# Calculate changes
|
|
ci_lag1 = lag(ci, 1),
|
|
ci_lag2 = lag(ci, 2),
|
|
ci_lead1 = lead(ci, 1),
|
|
ci_lead2 = lead(ci, 2),
|
|
ci_lead3 = lead(ci, 3),
|
|
|
|
# Drop magnitude
|
|
drop_1day = ci_lag1 - ci,
|
|
drop_2day = ci_lag2 - ci,
|
|
|
|
# Recovery after drop
|
|
recovery_1day = ci_lead1 - ci,
|
|
recovery_2day = ci_lead2 - ci,
|
|
recovery_3day = ci_lead3 - ci,
|
|
|
|
# Is this a single-day anomaly?
|
|
is_spike_drop = (ci < 2.0 & ci_lag1 > 3.0 & ci_lead1 > 3.0)
|
|
) %>%
|
|
ungroup()
|
|
|
|
# Read actual harvest data
|
|
harvest_actual <- read_excel('laravel_app/storage/app/esa/Data/harvest.xlsx') %>%
|
|
mutate(
|
|
season_start = as.Date(season_start),
|
|
season_end = as.Date(season_end)
|
|
) %>%
|
|
filter(!is.na(season_end))
|
|
|
|
cat("=== ANALYZING CI DROP PATTERNS ===\n\n")
|
|
|
|
# Find all instances where CI drops below 2.0
|
|
all_drops <- time_series_daily %>%
|
|
filter(ci < 2.0, ci_lag1 > 2.0) %>% # First day below 2.0
|
|
select(field_id, date, ci, ci_lag1, drop_1day,
|
|
ci_lead1, ci_lead2, ci_lead3,
|
|
recovery_1day, recovery_2day, recovery_3day)
|
|
|
|
# Classify drops based on what happens next
|
|
drops_classified <- all_drops %>%
|
|
mutate(
|
|
drop_type = case_when(
|
|
# Spike: drops but recovers to >3.0 within 3 days
|
|
!is.na(ci_lead1) & ci_lead1 > 3.0 ~ "SPIKE (1-day anomaly)",
|
|
!is.na(ci_lead2) & ci_lead2 > 3.0 ~ "SPIKE (2-day anomaly)",
|
|
!is.na(ci_lead3) & ci_lead3 > 3.0 ~ "SPIKE (3-day anomaly)",
|
|
|
|
# Sustained: stays below 2.5 for at least 3 days
|
|
!is.na(ci_lead1) & !is.na(ci_lead2) & !is.na(ci_lead3) &
|
|
ci_lead1 < 2.5 & ci_lead2 < 2.5 & ci_lead3 < 2.5 ~ "SUSTAINED (likely harvest)",
|
|
|
|
TRUE ~ "UNCLEAR (insufficient data)"
|
|
),
|
|
|
|
sharp_drop = drop_1day > 1.0 # Drop >1 CI point
|
|
)
|
|
|
|
cat("=== DROP TYPE DISTRIBUTION ===\n")
|
|
drop_summary <- drops_classified %>%
|
|
count(drop_type) %>%
|
|
mutate(percent = 100 * n / sum(n)) %>%
|
|
arrange(desc(n))
|
|
|
|
print(drop_summary)
|
|
|
|
cat("\n=== SHARP DROPS (>1.0 CI point) ===\n")
|
|
sharp_summary <- drops_classified %>%
|
|
filter(sharp_drop) %>%
|
|
count(drop_type) %>%
|
|
mutate(percent = 100 * n / sum(n))
|
|
|
|
print(sharp_summary)
|
|
|
|
# Match drops to actual harvests
|
|
cat("\n=== MATCHING DROPS TO ACTUAL HARVESTS ===\n")
|
|
|
|
drops_with_harvest <- drops_classified %>%
|
|
left_join(
|
|
harvest_actual %>%
|
|
select(field, actual_harvest_date = season_end),
|
|
by = c("field_id" = "field")
|
|
) %>%
|
|
filter(!is.na(actual_harvest_date)) %>%
|
|
mutate(
|
|
days_from_harvest = as.numeric(date - actual_harvest_date),
|
|
near_harvest = abs(days_from_harvest) <= 14,
|
|
timing_category = case_when(
|
|
days_from_harvest >= -7 & days_from_harvest <= 7 ~ "Within 1 week of harvest",
|
|
days_from_harvest >= -14 & days_from_harvest <= 14 ~ "Within 2 weeks of harvest",
|
|
days_from_harvest >= -21 & days_from_harvest <= 21 ~ "Within 3 weeks of harvest",
|
|
TRUE ~ "Far from harvest (>3 weeks)"
|
|
)
|
|
)
|
|
|
|
cat("\n=== DROP TYPES BY PROXIMITY TO ACTUAL HARVEST ===\n")
|
|
harvest_proximity_summary <- drops_with_harvest %>%
|
|
count(drop_type, timing_category) %>%
|
|
pivot_wider(names_from = timing_category, values_from = n, values_fill = 0)
|
|
|
|
print(harvest_proximity_summary)
|
|
|
|
# Key insight: What % of SUSTAINED drops are near harvest vs SPIKE drops?
|
|
cat("\n=== KEY INSIGHT: Are sustained drops near harvest? ===\n")
|
|
sustained_near_harvest <- drops_with_harvest %>%
|
|
filter(grepl("SUSTAINED", drop_type)) %>%
|
|
summarise(
|
|
total = n(),
|
|
near_harvest = sum(near_harvest),
|
|
percent_near = 100 * near_harvest / total
|
|
)
|
|
|
|
spike_near_harvest <- drops_with_harvest %>%
|
|
filter(grepl("SPIKE", drop_type)) %>%
|
|
summarise(
|
|
total = n(),
|
|
near_harvest = sum(near_harvest),
|
|
percent_near = 100 * near_harvest / total
|
|
)
|
|
|
|
cat("\nSUSTAINED drops (CI stays low):\n")
|
|
cat(sprintf(" Total: %d\n", sustained_near_harvest$total))
|
|
cat(sprintf(" Near harvest (±14d): %d (%.1f%%)\n",
|
|
sustained_near_harvest$near_harvest,
|
|
sustained_near_harvest$percent_near))
|
|
|
|
cat("\nSPIKE drops (CI recovers quickly):\n")
|
|
cat(sprintf(" Total: %d\n", spike_near_harvest$total))
|
|
cat(sprintf(" Near harvest (±14d): %d (%.1f%%)\n",
|
|
spike_near_harvest$near_harvest,
|
|
spike_near_harvest$percent_near))
|
|
|
|
# Analyze recovery patterns
|
|
cat("\n=== RECOVERY PATTERNS (how fast does CI bounce back?) ===\n")
|
|
|
|
recovery_stats <- drops_classified %>%
|
|
filter(!is.na(recovery_3day)) %>%
|
|
group_by(drop_type) %>%
|
|
summarise(
|
|
count = n(),
|
|
mean_recovery_1d = mean(recovery_1day, na.rm = TRUE),
|
|
mean_recovery_2d = mean(recovery_2day, na.rm = TRUE),
|
|
mean_recovery_3d = mean(recovery_3day, na.rm = TRUE),
|
|
median_recovery_1d = median(recovery_1day, na.rm = TRUE),
|
|
median_recovery_2d = median(recovery_2day, na.rm = TRUE),
|
|
median_recovery_3d = median(recovery_3day, na.rm = TRUE)
|
|
)
|
|
|
|
print(recovery_stats)
|
|
|
|
# Show examples of each type
|
|
cat("\n=== EXAMPLES: SPIKE (false alarm) ===\n")
|
|
print(drops_classified %>%
|
|
filter(drop_type == "SPIKE (1-day anomaly)") %>%
|
|
select(field_id, date, ci_lag1, ci, ci_lead1, drop_1day, recovery_1day) %>%
|
|
head(10), n = 10)
|
|
|
|
cat("\n=== EXAMPLES: SUSTAINED (likely harvest) ===\n")
|
|
print(drops_classified %>%
|
|
filter(drop_type == "SUSTAINED (likely harvest)") %>%
|
|
select(field_id, date, ci_lag1, ci, ci_lead1, ci_lead2, ci_lead3, drop_1day) %>%
|
|
head(10), n = 10)
|
|
|
|
# Recommendation
|
|
cat("\n=== RECOMMENDATION ===\n")
|
|
cat("To avoid false alarms from single-day spikes:\n")
|
|
cat("1. Require CI to stay below 2.0 for at least 3 consecutive days\n")
|
|
cat("2. Check that CI doesn't recover above 3.0 within next 3 days\n")
|
|
cat("3. Sharp drops (>1.0 CI) that sustain are strong harvest signals\n")
|
|
cat("4. Trade-off: Waiting 3 days for confirmation delays alert by 3 days\n")
|
|
cat(" - But eliminates false positives from cloud noise\n")
|
|
cat(" - Harvest still detected 4-11 days before actual event (median 7d)\n")
|