SmartCane/r_app/experiments/harvest_prediction/old/analyze_harvest_signature.R
2026-01-06 14:17:37 +01:00

325 lines
11 KiB
R

# Deep analysis of CI patterns ±30 days around actual harvest dates
# Goal: Find the exact signature of harvest - decline, bottom, stabilization
suppressPackageStartupMessages({
library(readxl)
library(dplyr)
library(tidyr)
library(lubridate)
library(here)
library(ggplot2)
})
project_dir <- "esa"
assign("project_dir", project_dir, envir = .GlobalEnv)
source(here("r_app", "parameters_project.R"))
# Read daily CI data
ci_rds_file <- here("laravel_app/storage/app", project_dir, "Data/extracted_ci/cumulative_vals/All_pivots_Cumulative_CI_quadrant_year_v2.rds")
ci_data_raw <- readRDS(ci_rds_file) %>% ungroup()
daily_ci <- ci_data_raw %>%
mutate(date = as.Date(Date)) %>%
select(field_id = field, date, ci = FitData) %>%
arrange(field_id, date)
# Read actual harvest data
harvest_actual <- read_excel('laravel_app/storage/app/esa/Data/harvest.xlsx') %>%
mutate(
season_start = as.Date(season_start),
season_end = as.Date(season_end)
) %>%
filter(!is.na(season_end))
cat("=== ANALYZING CI PATTERNS AROUND ACTUAL HARVEST DATES ===\n\n")
# For each harvest, get ±30 days of data
harvest_windows <- list()
for (i in 1:nrow(harvest_actual)) {
harvest <- harvest_actual[i, ]
field <- harvest$field
harvest_date <- harvest$season_end
# Get CI data ±30 days
window_data <- daily_ci %>%
filter(field_id == field,
date >= (harvest_date - 30),
date <= (harvest_date + 30)) %>%
arrange(date) %>%
mutate(
day_offset = as.numeric(date - harvest_date), # Negative = before, positive = after
# Calculate daily changes
ci_change = ci - lag(ci),
ci_change_3d = ci - lag(ci, 3),
ci_change_7d = ci - lag(ci, 7),
# Calculate acceleration (rate of change of change)
ci_acceleration = ci_change - lag(ci_change),
# Rolling statistics
ci_mean_7d = zoo::rollmean(ci, k = 7, fill = NA, align = "center"),
ci_sd_7d = zoo::rollapply(ci, width = 7, FUN = sd, fill = NA, align = "center"),
ci_min_7d = zoo::rollapply(ci, width = 7, FUN = min, fill = NA, align = "center"),
# Detect stable periods (low variability)
is_stable = !is.na(ci_sd_7d) & ci_sd_7d < 0.3 & ci < 2.5
)
if (nrow(window_data) > 0) {
window_data$field_id <- field
window_data$harvest_date <- harvest_date
window_data$harvest_id <- i
harvest_windows[[i]] <- window_data
}
}
all_windows <- bind_rows(harvest_windows)
cat(sprintf("Analyzed %d harvest events with ±30 day windows\n\n", length(harvest_windows)))
# ============================================================================
# ANALYSIS 1: What happens at the exact harvest date?
# ============================================================================
cat("=== ANALYSIS 1: CI AT HARVEST DATE (day 0) ===\n")
harvest_day_stats <- all_windows %>%
filter(day_offset == 0) %>%
summarise(
count = n(),
mean_ci = mean(ci, na.rm = TRUE),
median_ci = median(ci, na.rm = TRUE),
sd_ci = sd(ci, na.rm = TRUE),
min_ci = min(ci, na.rm = TRUE),
max_ci = max(ci, na.rm = TRUE),
q25 = quantile(ci, 0.25, na.rm = TRUE),
q75 = quantile(ci, 0.75, na.rm = TRUE)
)
print(harvest_day_stats)
# ============================================================================
# ANALYSIS 2: When is the absolute minimum CI?
# ============================================================================
cat("\n=== ANALYSIS 2: WHEN DOES CI REACH MINIMUM? ===\n")
min_ci_timing <- all_windows %>%
group_by(harvest_id, field_id, harvest_date) %>%
summarise(
min_ci_value = min(ci, na.rm = TRUE),
min_ci_day = day_offset[which.min(ci)],
.groups = "drop"
)
cat(sprintf("\nWhen does MINIMUM CI occur relative to harvest date:\n"))
cat(sprintf(" Mean offset: %.1f days (%.1f = before harvest, + = after)\n",
mean(min_ci_timing$min_ci_day, na.rm = TRUE),
mean(min_ci_timing$min_ci_day, na.rm = TRUE)))
cat(sprintf(" Median offset: %.1f days\n", median(min_ci_timing$min_ci_day, na.rm = TRUE)))
cat(sprintf(" Range: %.0f to %.0f days\n",
min(min_ci_timing$min_ci_day, na.rm = TRUE),
max(min_ci_timing$min_ci_day, na.rm = TRUE)))
timing_distribution <- min_ci_timing %>%
mutate(timing_category = case_when(
min_ci_day < -7 ~ "Before harvest (>7d early)",
min_ci_day >= -7 & min_ci_day < 0 ~ "Just before harvest (0-7d early)",
min_ci_day == 0 ~ "On harvest date",
min_ci_day > 0 & min_ci_day <= 7 ~ "Just after harvest (0-7d late)",
min_ci_day > 7 ~ "After harvest (>7d late)"
)) %>%
count(timing_category)
print(timing_distribution)
# ============================================================================
# ANALYSIS 3: Decline rate before harvest
# ============================================================================
cat("\n=== ANALYSIS 3: DECLINE PATTERN BEFORE HARVEST ===\n")
decline_stats <- all_windows %>%
filter(day_offset >= -30 & day_offset < 0) %>%
group_by(week_before = ceiling(abs(day_offset) / 7)) %>%
summarise(
mean_ci = mean(ci, na.rm = TRUE),
mean_daily_change = mean(ci_change, na.rm = TRUE),
mean_7d_change = mean(ci_change_7d, na.rm = TRUE),
count = n(),
.groups = "drop"
) %>%
arrange(desc(week_before))
cat("\nCI decline by week before harvest:\n")
print(decline_stats)
# ============================================================================
# ANALYSIS 4: Stabilization after harvest
# ============================================================================
cat("\n=== ANALYSIS 4: WHEN DOES CI STABILIZE (stop declining)? ===\n")
stabilization <- all_windows %>%
filter(day_offset >= 0 & day_offset <= 30) %>%
group_by(day_offset) %>%
summarise(
mean_ci = mean(ci, na.rm = TRUE),
sd_ci = sd(ci, na.rm = TRUE),
pct_stable = 100 * mean(is_stable, na.rm = TRUE),
mean_daily_change = mean(ci_change, na.rm = TRUE),
.groups = "drop"
)
cat("\nPost-harvest stabilization (first 14 days):\n")
print(stabilization %>% filter(day_offset <= 14))
# Find first day where >50% of fields show stable CI
first_stable_day <- stabilization %>%
filter(pct_stable > 50) %>%
summarise(first_day = min(day_offset, na.rm = TRUE))
cat(sprintf("\n>50%% of fields show stable CI by day +%.0f after harvest\n",
first_stable_day$first_day))
# ============================================================================
# ANALYSIS 5: Threshold crossings
# ============================================================================
cat("\n=== ANALYSIS 5: THRESHOLD CROSSINGS BEFORE HARVEST ===\n")
thresholds <- c(3.0, 2.5, 2.0, 1.8, 1.5)
threshold_stats <- lapply(thresholds, function(thresh) {
crossings <- all_windows %>%
filter(day_offset < 0) %>%
group_by(harvest_id) %>%
summarise(
first_below = min(day_offset[ci < thresh], na.rm = TRUE),
.groups = "drop"
) %>%
filter(is.finite(first_below))
if (nrow(crossings) > 0) {
data.frame(
threshold = thresh,
n_crossed = nrow(crossings),
mean_days_before = mean(abs(crossings$first_below)),
median_days_before = median(abs(crossings$first_below)),
pct_crossed = 100 * nrow(crossings) / length(unique(all_windows$harvest_id))
)
} else {
data.frame(threshold = thresh, n_crossed = 0, mean_days_before = NA,
median_days_before = NA, pct_crossed = 0)
}
}) %>% bind_rows()
print(threshold_stats)
# ============================================================================
# VISUALIZATION
# ============================================================================
cat("\n=== CREATING VISUALIZATION ===\n")
# Average CI pattern across all harvests
avg_pattern <- all_windows %>%
group_by(day_offset) %>%
summarise(
mean_ci = mean(ci, na.rm = TRUE),
median_ci = median(ci, na.rm = TRUE),
q25_ci = quantile(ci, 0.25, na.rm = TRUE),
q75_ci = quantile(ci, 0.75, na.rm = TRUE),
sd_ci = sd(ci, na.rm = TRUE),
.groups = "drop"
)
png("harvest_ci_pattern_analysis.png", width = 1400, height = 900, res = 120)
par(mfrow = c(2, 2))
# Plot 1: Average CI pattern
plot(avg_pattern$day_offset, avg_pattern$mean_ci, type = "l", lwd = 2,
xlab = "Days from harvest", ylab = "CI",
main = "Average CI Pattern Around Harvest",
ylim = c(0, max(avg_pattern$q75_ci, na.rm = TRUE)))
polygon(c(avg_pattern$day_offset, rev(avg_pattern$day_offset)),
c(avg_pattern$q25_ci, rev(avg_pattern$q75_ci)),
col = rgb(0, 0, 1, 0.2), border = NA)
abline(v = 0, col = "red", lty = 2, lwd = 2)
abline(h = c(1.5, 2.0, 2.5), col = c("blue", "orange", "green"), lty = 3)
legend("topright", legend = c("Mean CI", "Q25-Q75", "Harvest date", "Thresholds 1.5, 2.0, 2.5"),
lwd = c(2, 8, 2, 1), col = c("black", rgb(0,0,1,0.2), "red", "blue"))
# Plot 2: Daily change rate
avg_change <- all_windows %>%
filter(!is.na(ci_change)) %>%
group_by(day_offset) %>%
summarise(mean_change = mean(ci_change, na.rm = TRUE), .groups = "drop")
plot(avg_change$day_offset, avg_change$mean_change, type = "l", lwd = 2,
xlab = "Days from harvest", ylab = "Daily CI change",
main = "Rate of CI Change")
abline(v = 0, col = "red", lty = 2)
abline(h = 0, col = "gray", lty = 3)
# Plot 3: Minimum CI timing distribution
hist(min_ci_timing$min_ci_day, breaks = 20,
xlab = "Day offset when minimum CI occurs",
main = "When Does CI Reach Minimum?",
col = "lightblue")
abline(v = 0, col = "red", lwd = 2, lty = 2)
abline(v = median(min_ci_timing$min_ci_day, na.rm = TRUE), col = "blue", lwd = 2)
# Plot 4: Threshold crossing timing
barplot(threshold_stats$median_days_before,
names.arg = threshold_stats$threshold,
xlab = "CI Threshold",
ylab = "Median days before harvest",
main = "When Are Thresholds Crossed?",
col = "lightgreen")
dev.off()
cat("\nPlot saved: harvest_ci_pattern_analysis.png\n")
# ============================================================================
# RECOMMENDATIONS
# ============================================================================
cat("\n=== RECOMMENDATIONS FOR HARVEST DETECTION ===\n\n")
# Find best threshold based on timing
best_threshold <- threshold_stats %>%
filter(median_days_before >= 7 & median_days_before <= 14) %>%
arrange(desc(pct_crossed))
if (nrow(best_threshold) > 0) {
cat(sprintf("BEST EARLY WARNING THRESHOLD: CI < %.1f\n", best_threshold$threshold[1]))
cat(sprintf(" - Crossed %.0f%% of the time\n", best_threshold$pct_crossed[1]))
cat(sprintf(" - Median %.1f days before harvest\n", best_threshold$median_days_before[1]))
cat(sprintf(" - MESSAGE: 'Harvest expected within 7-14 days'\n\n"))
}
cat("HARVEST COMPLETION SIGNAL:\n")
cat(sprintf(" - Look for stabilization: SD < 0.3 for 7 days\n"))
cat(sprintf(" - Typically occurs around day +%.0f after reported harvest\n", first_stable_day$first_day))
cat(sprintf(" - MESSAGE: 'Harvest likely completed in recent days'\n\n"))
cat("SHARP DECLINE DETECTION:\n")
sharp_decline_threshold <- -0.5 # CI dropping >0.5 per day
sharp_declines <- all_windows %>%
filter(!is.na(ci_change) & ci_change < sharp_decline_threshold) %>%
group_by(day_offset) %>%
summarise(count = n(), .groups = "drop") %>%
filter(day_offset < 0) %>%
arrange(desc(count))
if (nrow(sharp_declines) > 0) {
cat(sprintf(" - Sharp drops (>0.5/day) most common at day %.0f before harvest\n",
sharp_declines$day_offset[1]))
cat(sprintf(" - Can trigger immediate alert: 'Sharp decline detected - investigate field'\n"))
}