SmartCane/analyze_ci_threshold_timing.R
2026-01-06 14:17:37 +01:00

181 lines
6.7 KiB
R

# Analyze timing between CI threshold crossings and actual harvest dates
# Goal: Determine how soon after CI drops below threshold the harvest actually occurs
suppressPackageStartupMessages({
library(readxl)
library(dplyr)
library(tidyr)
library(lubridate)
library(here)
library(ggplot2)
})
# Set project directory
project_dir <- "esa"
assign("project_dir", project_dir, envir = .GlobalEnv)
source(here("r_app", "parameters_project.R"))
# Read daily CI data
ci_rds_file <- here("laravel_app/storage/app", project_dir, "Data/extracted_ci/cumulative_vals/All_pivots_Cumulative_CI_quadrant_year_v2.rds")
ci_data_raw <- readRDS(ci_rds_file) %>% ungroup()
time_series_daily <- ci_data_raw %>%
mutate(date = as.Date(Date)) %>%
select(field_id = field, date, ci = FitData) %>%
arrange(field_id, date)
# Read actual harvest data
harvest_actual <- read_excel('laravel_app/storage/app/esa/Data/harvest.xlsx') %>%
mutate(
season_start = as.Date(season_start),
season_end = as.Date(season_end)
) %>%
filter(!is.na(season_end))
cat("=== ANALYZING CI THRESHOLD CROSSING TIMING ===\n\n")
# For each actual harvest, find when CI first dropped below various thresholds
thresholds <- c(3.0, 2.5, 2.0, 1.8)
results <- list()
for (i in 1:nrow(harvest_actual)) {
harvest <- harvest_actual[i, ]
field <- harvest$field
harvest_date <- harvest$season_end
# Get CI data for this field in the year before harvest
field_data <- time_series_daily %>%
filter(field_id == field,
date >= (harvest_date - 365),
date <= harvest_date) %>%
arrange(date)
if (nrow(field_data) == 0) next
# For each threshold, find LAST crossing date (working backward from harvest)
# This finds the mature→harvest transition, not the previous cycle's harvest
threshold_crossings <- sapply(thresholds, function(threshold) {
# Find LAST period where CI was high (>3.5), then dropped below threshold
# Work backward from harvest date
last_mature_idx <- NA
for (j in nrow(field_data):1) {
if (!is.na(field_data$ci[j]) && field_data$ci[j] > 3.5) {
last_mature_idx <- j
break
}
}
# If no mature period found, skip
if (is.na(last_mature_idx)) return(NA)
# Now find first crossing below threshold AFTER the mature period
for (j in last_mature_idx:(nrow(field_data) - 2)) {
if (!is.na(field_data$ci[j]) && !is.na(field_data$ci[j+1]) && !is.na(field_data$ci[j+2]) &&
field_data$ci[j] < threshold &&
field_data$ci[j+1] < threshold &&
field_data$ci[j+2] < threshold) {
return(as.character(field_data$date[j]))
}
}
return(NA)
})
result_row <- data.frame(
field = field,
harvest_date = harvest_date,
ci_at_harvest = field_data$ci[nrow(field_data)]
)
for (k in 1:length(thresholds)) {
threshold <- thresholds[k]
crossing_date <- as.Date(threshold_crossings[k])
if (!is.na(crossing_date)) {
days_before_harvest <- as.numeric(harvest_date - crossing_date)
result_row[[paste0("first_below_", threshold)]] <- as.character(crossing_date)
result_row[[paste0("days_before_", threshold)]] <- days_before_harvest
} else {
result_row[[paste0("first_below_", threshold)]] <- NA
result_row[[paste0("days_before_", threshold)]] <- NA
}
}
results[[i]] <- result_row
}
timing_analysis <- bind_rows(results)
# Print summary statistics
cat("\n=== TIMING STATISTICS: Days from threshold crossing to actual harvest ===\n\n")
for (threshold in thresholds) {
days_col <- paste0("days_before_", threshold)
days_before <- timing_analysis[[days_col]]
days_before <- days_before[!is.na(days_before)]
if (length(days_before) > 0) {
cat(sprintf("CI < %.1f threshold:\n", threshold))
cat(sprintf(" Valid cases: %d/%d (%.1f%%)\n",
length(days_before), nrow(timing_analysis),
100 * length(days_before) / nrow(timing_analysis)))
cat(sprintf(" Mean: %.1f days before harvest\n", mean(days_before)))
cat(sprintf(" Median: %.1f days before harvest\n", median(days_before)))
cat(sprintf(" Range: %.1f to %.1f days\n", min(days_before), max(days_before)))
cat(sprintf(" Q1-Q3: %.1f to %.1f days\n", quantile(days_before, 0.25), quantile(days_before, 0.75)))
# Count how many harvests occur within specific time windows after crossing
within_7d <- sum(days_before >= 0 & days_before <= 7)
within_14d <- sum(days_before >= 0 & days_before <= 14)
within_21d <- sum(days_before >= 0 & days_before <= 21)
within_30d <- sum(days_before >= 0 & days_before <= 30)
cat(sprintf(" Harvest timing after crossing:\n"))
cat(sprintf(" 0-7 days: %d (%.1f%%)\n", within_7d, 100*within_7d/length(days_before)))
cat(sprintf(" 0-14 days: %d (%.1f%%)\n", within_14d, 100*within_14d/length(days_before)))
cat(sprintf(" 0-21 days: %d (%.1f%%)\n", within_21d, 100*within_21d/length(days_before)))
cat(sprintf(" 0-30 days: %d (%.1f%%)\n", within_30d, 100*within_30d/length(days_before)))
cat("\n")
} else {
cat(sprintf("CI < %.1f threshold: No valid crossings found\n\n", threshold))
}
}
# Show detailed table for fields with mismatches
cat("\n=== DETAILED TIMING BY FIELD ===\n")
# Get column names dynamically
days_cols <- grep("days_before_", names(timing_analysis), value = TRUE)
select_cols <- c("field", "harvest_date", "ci_at_harvest", days_cols[1:min(2, length(days_cols))])
print(timing_analysis %>%
select(all_of(select_cols)) %>%
arrange(field, harvest_date), n = 100)
# Create visualization
cat("\n=== Creating timing distribution plot ===\n")
timing_long <- timing_analysis %>%
select(field, harvest_date, starts_with("days_before_")) %>%
pivot_longer(cols = starts_with("days_before_"),
names_to = "threshold",
values_to = "days_before") %>%
filter(!is.na(days_before)) %>%
mutate(threshold = gsub("days_before_", "CI < ", threshold))
png("timing_threshold_to_harvest.png", width = 1200, height = 800, res = 120)
ggplot(timing_long, aes(x = days_before, fill = threshold)) +
geom_histogram(binwidth = 7, alpha = 0.7, position = "identity") +
facet_wrap(~threshold, ncol = 1) +
geom_vline(xintercept = c(7, 14, 21), linetype = "dashed", color = "red", alpha = 0.5) +
labs(
title = "Time from CI Threshold Crossing to Actual Harvest",
subtitle = "How many days AFTER CI drops below threshold does harvest actually occur?",
x = "Days from threshold crossing to harvest",
y = "Count of harvest events",
caption = "Dashed lines at 7, 14, 21 days"
) +
theme_minimal() +
theme(legend.position = "none")
dev.off()
cat("\nPlot saved to: timing_threshold_to_harvest.png\n")