181 lines
6.7 KiB
R
181 lines
6.7 KiB
R
# Analyze timing between CI threshold crossings and actual harvest dates
|
|
# Goal: Determine how soon after CI drops below threshold the harvest actually occurs
|
|
suppressPackageStartupMessages({
|
|
library(readxl)
|
|
library(dplyr)
|
|
library(tidyr)
|
|
library(lubridate)
|
|
library(here)
|
|
library(ggplot2)
|
|
})
|
|
|
|
# Set project directory
|
|
project_dir <- "esa"
|
|
assign("project_dir", project_dir, envir = .GlobalEnv)
|
|
source(here("r_app", "parameters_project.R"))
|
|
|
|
# Read daily CI data
|
|
ci_rds_file <- here("laravel_app/storage/app", project_dir, "Data/extracted_ci/cumulative_vals/All_pivots_Cumulative_CI_quadrant_year_v2.rds")
|
|
ci_data_raw <- readRDS(ci_rds_file) %>% ungroup()
|
|
|
|
time_series_daily <- ci_data_raw %>%
|
|
mutate(date = as.Date(Date)) %>%
|
|
select(field_id = field, date, ci = FitData) %>%
|
|
arrange(field_id, date)
|
|
|
|
# Read actual harvest data
|
|
harvest_actual <- read_excel('laravel_app/storage/app/esa/Data/harvest.xlsx') %>%
|
|
mutate(
|
|
season_start = as.Date(season_start),
|
|
season_end = as.Date(season_end)
|
|
) %>%
|
|
filter(!is.na(season_end))
|
|
|
|
cat("=== ANALYZING CI THRESHOLD CROSSING TIMING ===\n\n")
|
|
|
|
# For each actual harvest, find when CI first dropped below various thresholds
|
|
thresholds <- c(3.0, 2.5, 2.0, 1.8)
|
|
|
|
results <- list()
|
|
|
|
for (i in 1:nrow(harvest_actual)) {
|
|
harvest <- harvest_actual[i, ]
|
|
field <- harvest$field
|
|
harvest_date <- harvest$season_end
|
|
|
|
# Get CI data for this field in the year before harvest
|
|
field_data <- time_series_daily %>%
|
|
filter(field_id == field,
|
|
date >= (harvest_date - 365),
|
|
date <= harvest_date) %>%
|
|
arrange(date)
|
|
|
|
if (nrow(field_data) == 0) next
|
|
|
|
# For each threshold, find LAST crossing date (working backward from harvest)
|
|
# This finds the mature→harvest transition, not the previous cycle's harvest
|
|
threshold_crossings <- sapply(thresholds, function(threshold) {
|
|
# Find LAST period where CI was high (>3.5), then dropped below threshold
|
|
# Work backward from harvest date
|
|
last_mature_idx <- NA
|
|
for (j in nrow(field_data):1) {
|
|
if (!is.na(field_data$ci[j]) && field_data$ci[j] > 3.5) {
|
|
last_mature_idx <- j
|
|
break
|
|
}
|
|
}
|
|
|
|
# If no mature period found, skip
|
|
if (is.na(last_mature_idx)) return(NA)
|
|
|
|
# Now find first crossing below threshold AFTER the mature period
|
|
for (j in last_mature_idx:(nrow(field_data) - 2)) {
|
|
if (!is.na(field_data$ci[j]) && !is.na(field_data$ci[j+1]) && !is.na(field_data$ci[j+2]) &&
|
|
field_data$ci[j] < threshold &&
|
|
field_data$ci[j+1] < threshold &&
|
|
field_data$ci[j+2] < threshold) {
|
|
return(as.character(field_data$date[j]))
|
|
}
|
|
}
|
|
return(NA)
|
|
})
|
|
|
|
result_row <- data.frame(
|
|
field = field,
|
|
harvest_date = harvest_date,
|
|
ci_at_harvest = field_data$ci[nrow(field_data)]
|
|
)
|
|
|
|
for (k in 1:length(thresholds)) {
|
|
threshold <- thresholds[k]
|
|
crossing_date <- as.Date(threshold_crossings[k])
|
|
|
|
if (!is.na(crossing_date)) {
|
|
days_before_harvest <- as.numeric(harvest_date - crossing_date)
|
|
result_row[[paste0("first_below_", threshold)]] <- as.character(crossing_date)
|
|
result_row[[paste0("days_before_", threshold)]] <- days_before_harvest
|
|
} else {
|
|
result_row[[paste0("first_below_", threshold)]] <- NA
|
|
result_row[[paste0("days_before_", threshold)]] <- NA
|
|
}
|
|
}
|
|
|
|
results[[i]] <- result_row
|
|
}
|
|
|
|
timing_analysis <- bind_rows(results)
|
|
|
|
# Print summary statistics
|
|
cat("\n=== TIMING STATISTICS: Days from threshold crossing to actual harvest ===\n\n")
|
|
|
|
for (threshold in thresholds) {
|
|
days_col <- paste0("days_before_", threshold)
|
|
days_before <- timing_analysis[[days_col]]
|
|
days_before <- days_before[!is.na(days_before)]
|
|
|
|
if (length(days_before) > 0) {
|
|
cat(sprintf("CI < %.1f threshold:\n", threshold))
|
|
cat(sprintf(" Valid cases: %d/%d (%.1f%%)\n",
|
|
length(days_before), nrow(timing_analysis),
|
|
100 * length(days_before) / nrow(timing_analysis)))
|
|
cat(sprintf(" Mean: %.1f days before harvest\n", mean(days_before)))
|
|
cat(sprintf(" Median: %.1f days before harvest\n", median(days_before)))
|
|
cat(sprintf(" Range: %.1f to %.1f days\n", min(days_before), max(days_before)))
|
|
cat(sprintf(" Q1-Q3: %.1f to %.1f days\n", quantile(days_before, 0.25), quantile(days_before, 0.75)))
|
|
|
|
# Count how many harvests occur within specific time windows after crossing
|
|
within_7d <- sum(days_before >= 0 & days_before <= 7)
|
|
within_14d <- sum(days_before >= 0 & days_before <= 14)
|
|
within_21d <- sum(days_before >= 0 & days_before <= 21)
|
|
within_30d <- sum(days_before >= 0 & days_before <= 30)
|
|
|
|
cat(sprintf(" Harvest timing after crossing:\n"))
|
|
cat(sprintf(" 0-7 days: %d (%.1f%%)\n", within_7d, 100*within_7d/length(days_before)))
|
|
cat(sprintf(" 0-14 days: %d (%.1f%%)\n", within_14d, 100*within_14d/length(days_before)))
|
|
cat(sprintf(" 0-21 days: %d (%.1f%%)\n", within_21d, 100*within_21d/length(days_before)))
|
|
cat(sprintf(" 0-30 days: %d (%.1f%%)\n", within_30d, 100*within_30d/length(days_before)))
|
|
cat("\n")
|
|
} else {
|
|
cat(sprintf("CI < %.1f threshold: No valid crossings found\n\n", threshold))
|
|
}
|
|
}
|
|
|
|
# Show detailed table for fields with mismatches
|
|
cat("\n=== DETAILED TIMING BY FIELD ===\n")
|
|
|
|
# Get column names dynamically
|
|
days_cols <- grep("days_before_", names(timing_analysis), value = TRUE)
|
|
select_cols <- c("field", "harvest_date", "ci_at_harvest", days_cols[1:min(2, length(days_cols))])
|
|
|
|
print(timing_analysis %>%
|
|
select(all_of(select_cols)) %>%
|
|
arrange(field, harvest_date), n = 100)
|
|
|
|
# Create visualization
|
|
cat("\n=== Creating timing distribution plot ===\n")
|
|
timing_long <- timing_analysis %>%
|
|
select(field, harvest_date, starts_with("days_before_")) %>%
|
|
pivot_longer(cols = starts_with("days_before_"),
|
|
names_to = "threshold",
|
|
values_to = "days_before") %>%
|
|
filter(!is.na(days_before)) %>%
|
|
mutate(threshold = gsub("days_before_", "CI < ", threshold))
|
|
|
|
png("timing_threshold_to_harvest.png", width = 1200, height = 800, res = 120)
|
|
ggplot(timing_long, aes(x = days_before, fill = threshold)) +
|
|
geom_histogram(binwidth = 7, alpha = 0.7, position = "identity") +
|
|
facet_wrap(~threshold, ncol = 1) +
|
|
geom_vline(xintercept = c(7, 14, 21), linetype = "dashed", color = "red", alpha = 0.5) +
|
|
labs(
|
|
title = "Time from CI Threshold Crossing to Actual Harvest",
|
|
subtitle = "How many days AFTER CI drops below threshold does harvest actually occur?",
|
|
x = "Days from threshold crossing to harvest",
|
|
y = "Count of harvest events",
|
|
caption = "Dashed lines at 7, 14, 21 days"
|
|
) +
|
|
theme_minimal() +
|
|
theme(legend.position = "none")
|
|
dev.off()
|
|
|
|
cat("\nPlot saved to: timing_threshold_to_harvest.png\n")
|