SmartCane/r_app/experiments/harvest_prediction/old/analyze_harvest_ci.R
2026-01-06 14:17:37 +01:00

341 lines
12 KiB
R

# Analyze CI values around actual harvest dates to tune detection parameters
suppressPackageStartupMessages({
library(readxl)
library(dplyr)
library(tidyr)
library(lubridate)
library(terra)
library(sf)
library(here)
library(ggplot2)
})
# Set project directory
project_dir <- "esa"
assign("project_dir", project_dir, envir = .GlobalEnv)
# Source required files
cat("Loading project configuration...\n")
source(here("r_app", "parameters_project.R"))
# Read pre-extracted DAILY CI data from script 02
ci_rds_file <- here("laravel_app/storage/app", project_dir, "Data/extracted_ci/cumulative_vals/All_pivots_Cumulative_CI_quadrant_year_v2.rds")
cat("Reading pre-extracted daily CI data from:\n")
cat(" ", ci_rds_file, "\n")
if (!file.exists(ci_rds_file)) {
stop("CI data file not found: ", ci_rds_file)
}
ci_data_raw <- readRDS(ci_rds_file) %>% ungroup()
cat("Loaded CI data with", nrow(ci_data_raw), "rows\n\n")
# Transform to daily time series format
cat("Converting to daily time series format...\n")
time_series <- ci_data_raw %>%
mutate(
date = as.Date(Date),
week = isoweek(date),
year = isoyear(date)
) %>%
select(
field_id = field,
date,
week,
year,
mean_ci = FitData
) %>%
filter(!is.na(mean_ci), !is.na(date), !is.na(field_id)) %>%
arrange(field_id, date)
cat("Daily time series ready:", nrow(time_series), "observations\n")
cat("Fields:", n_distinct(time_series$field_id), "\n")
cat("Date range:", as.character(min(time_series$date)), "to", as.character(max(time_series$date)), "\n\n")
# Read actual harvest data
harvest_actual_all <- read_excel('laravel_app/storage/app/esa/Data/harvest.xlsx') %>%
mutate(
season_start = as.Date(season_start),
season_end = as.Date(season_end)
) %>% select(-age, -sub_area, -tonnage_ha, -sub_field)
fields_with_data <- unique(field_boundaries_sf$field)
harvest_actual <- harvest_actual_all %>%
filter(field %in% fields_with_data) %>%
filter(!is.na(season_end)) %>%
mutate(
actual_harvest_week = isoweek(season_end),
actual_harvest_year = isoyear(season_end)
)
cat("Analyzing CI values around actual harvest dates...\n\n")
# For each actual harvest, find the NEAREST date in time series (within ±3 days)
harvest_analysis <- harvest_actual %>%
rowwise() %>%
do({
h_field <- .$field
h_date <- .$season_end
h_week <- .$actual_harvest_week
h_year <- .$actual_harvest_year
# Find nearest date in time series for this field
nearest_match <- time_series %>%
filter(field_id == h_field) %>%
mutate(
date_diff = abs(as.numeric(date - h_date))
) %>%
filter(date_diff <= 3) %>% # Within 3 days
arrange(date_diff) %>%
head(1)
if (nrow(nearest_match) > 0) {
data.frame(
field = h_field,
season_end = h_date,
actual_harvest_week = h_week,
actual_harvest_year = h_year,
matched_date = nearest_match$date,
date_diff = nearest_match$date_diff,
mean_ci = nearest_match$mean_ci,
stringsAsFactors = FALSE
)
} else {
data.frame(
field = h_field,
season_end = h_date,
actual_harvest_week = h_week,
actual_harvest_year = h_year,
matched_date = as.Date(NA),
date_diff = NA,
mean_ci = NA,
stringsAsFactors = FALSE
)
}
}) %>%
ungroup() %>%
mutate(has_ci_data = !is.na(mean_ci))
# Summary statistics
cat("=== CI VALUES AT ACTUAL HARVEST DATES ===\n")
cat("Harvests with CI data:", sum(harvest_analysis$has_ci_data), "/", nrow(harvest_analysis), "\n\n")
ci_at_harvest <- harvest_analysis %>% filter(has_ci_data)
if (nrow(ci_at_harvest) > 0) {
cat("CI Statistics at harvest:\n")
cat(" Min:", round(min(ci_at_harvest$mean_ci, na.rm = TRUE), 2), "\n")
cat(" Max:", round(max(ci_at_harvest$mean_ci, na.rm = TRUE), 2), "\n")
cat(" Mean:", round(mean(ci_at_harvest$mean_ci, na.rm = TRUE), 2), "\n")
cat(" Median:", round(median(ci_at_harvest$mean_ci, na.rm = TRUE), 2), "\n")
cat(" Q25:", round(quantile(ci_at_harvest$mean_ci, 0.25, na.rm = TRUE), 2), "\n")
cat(" Q75:", round(quantile(ci_at_harvest$mean_ci, 0.75, na.rm = TRUE), 2), "\n\n")
cat("Distribution of CI at harvest:\n")
cat(" CI < 1.0:", sum(ci_at_harvest$mean_ci < 1.0, na.rm = TRUE), "\n")
cat(" CI < 1.5:", sum(ci_at_harvest$mean_ci < 1.5, na.rm = TRUE), "\n")
cat(" CI < 2.0:", sum(ci_at_harvest$mean_ci < 2.0, na.rm = TRUE), "\n")
cat(" CI < 2.5:", sum(ci_at_harvest$mean_ci < 2.5, na.rm = TRUE), "\n")
cat(" CI < 3.0:", sum(ci_at_harvest$mean_ci < 3.0, na.rm = TRUE), "\n")
cat(" CI >= 3.0:", sum(ci_at_harvest$mean_ci >= 3.0, na.rm = TRUE), "\n\n")
}
# Look at CI values in DAYS BEFORE and AFTER harvest
cat("\n=== CI TEMPORAL PATTERN AROUND HARVEST (DAILY) ===\n")
cat("Analyzing ±30 days around actual harvest dates...\n\n")
# For each harvest, get CI values in surrounding days
temporal_analysis <- harvest_actual %>%
rowwise() %>%
do({
field_name <- .$field
harvest_date <- .$season_end
# Get CI values for days around harvest
field_ts <- time_series %>%
filter(field_id == field_name,
date >= (harvest_date - 30),
date <= (harvest_date + 30)) %>%
mutate(
days_from_harvest = as.numeric(date - harvest_date),
harvest_date_ref = harvest_date
) %>%
select(field_id, date, days_from_harvest, mean_ci)
field_ts
}) %>%
ungroup()
if (nrow(temporal_analysis) > 0) {
summary_by_offset <- temporal_analysis %>%
group_by(days_from_harvest) %>%
summarise(
n = n(),
mean_ci = mean(mean_ci, na.rm = TRUE),
median_ci = median(mean_ci, na.rm = TRUE),
min_ci = min(mean_ci, na.rm = TRUE),
max_ci = max(mean_ci, na.rm = TRUE),
sd_ci = sd(mean_ci, na.rm = TRUE),
.groups = "drop"
) %>%
arrange(days_from_harvest)
cat("\nDaily CI pattern around harvest (±30 days):\n")
print(summary_by_offset, n = 100)
# Calculate CI drop from pre-harvest to post-harvest
cat("\n=== CI DROP ANALYSIS ===\n")
pre_harvest_ci <- summary_by_offset %>%
filter(days_from_harvest >= -7, days_from_harvest <= -1) %>%
summarise(mean_ci = mean(mean_ci, na.rm = TRUE)) %>%
pull(mean_ci)
harvest_day_ci <- summary_by_offset %>%
filter(days_from_harvest == 0) %>%
pull(mean_ci)
post_harvest_ci <- summary_by_offset %>%
filter(days_from_harvest >= 1, days_from_harvest <= 7) %>%
summarise(mean_ci = mean(mean_ci, na.rm = TRUE)) %>%
pull(mean_ci)
cat("CI 7 days before harvest:", round(pre_harvest_ci, 2), "\n")
cat("CI on harvest day:", round(harvest_day_ci, 2), "\n")
cat("CI 7 days after harvest:", round(post_harvest_ci, 2), "\n")
cat("Drop (pre to harvest day):", round(pre_harvest_ci - harvest_day_ci, 2), "\n")
cat("Drop (harvest day to post):", round(harvest_day_ci - post_harvest_ci, 2), "\n")
cat("Total drop (pre to post):", round(pre_harvest_ci - post_harvest_ci, 2), "\n\n")
# Analyze when CI starts dropping
cat("\n=== WHEN DOES CI DROP START? ===\n")
baseline_ci <- summary_by_offset %>%
filter(days_from_harvest >= -30, days_from_harvest <= -15) %>%
summarise(mean_ci = mean(mean_ci, na.rm = TRUE)) %>%
pull(mean_ci)
cat("Baseline CI (days -30 to -15):", round(baseline_ci, 2), "\n")
# Find when CI first drops significantly below baseline
drop_start <- summary_by_offset %>%
filter(days_from_harvest < 0) %>%
mutate(drop_from_baseline = baseline_ci - mean_ci) %>%
filter(drop_from_baseline > 0.3) %>% # Significant drop
arrange(days_from_harvest) %>%
head(1)
if (nrow(drop_start) > 0) {
cat("First significant drop detected at day:", drop_start$days_from_harvest,
"(CI:", round(drop_start$mean_ci, 2), ", drop:", round(drop_start$drop_from_baseline, 2), ")\n")
}
# Find when CI reaches minimum
min_ci_day <- summary_by_offset %>%
filter(days_from_harvest >= -30, days_from_harvest <= 30) %>%
arrange(mean_ci) %>%
head(1)
cat("Minimum CI reached at day:", min_ci_day$days_from_harvest,
"(CI:", round(min_ci_day$mean_ci, 2), ")\n")
# Find when CI starts recovering
recovery_start <- summary_by_offset %>%
filter(days_from_harvest > 0) %>%
mutate(recovery_from_harvest = mean_ci - harvest_day_ci) %>%
filter(recovery_from_harvest > 0.3) %>% # Significant recovery
arrange(days_from_harvest) %>%
head(1)
if (nrow(recovery_start) > 0) {
cat("Recovery detected at day:", recovery_start$days_from_harvest,
"(CI:", round(recovery_start$mean_ci, 2), ", gain:", round(recovery_start$recovery_from_harvest, 2), ")\n")
}
# Analyze the ENTIRE harvest period (not just a single day)
cat("\n=== MULTI-DAY HARVEST PERIOD ANALYSIS ===\n")
cat("Harvest may span multiple days/weeks. Looking for extended low CI periods...\n\n")
# Count consecutive days below different thresholds
for (threshold in c(1.5, 2.0, 2.5, 3.0)) {
consecutive_low <- temporal_analysis %>%
arrange(field_id, date) %>%
group_by(field_id) %>%
mutate(
is_low = mean_ci < threshold,
day_diff = as.numeric(date - lag(date)),
new_period = is.na(day_diff) | day_diff > 3 | !is_low, # Gap or not low
period_id = cumsum(new_period)
) %>%
filter(is_low) %>%
group_by(field_id, period_id) %>%
summarise(
start_day = min(days_from_harvest),
end_day = max(days_from_harvest),
duration = n(),
mean_ci_period = mean(mean_ci),
.groups = "drop"
) %>%
filter(duration >= 3) # At least 3 consecutive days
if (nrow(consecutive_low) > 0) {
cat("\nConsecutive periods with CI <", threshold, ":\n")
cat(" Number of periods:", nrow(consecutive_low), "\n")
cat(" Average duration:", round(mean(consecutive_low$duration), 1), "days\n")
cat(" Median start day:", round(median(consecutive_low$start_day), 1), "\n")
cat(" Median end day:", round(median(consecutive_low$end_day), 1), "\n")
# Show distribution of when these periods start
periods_before <- sum(consecutive_low$start_day < -7)
periods_during <- sum(consecutive_low$start_day >= -7 & consecutive_low$start_day <= 7)
periods_after <- sum(consecutive_low$start_day > 7)
cat(" Periods starting before harvest (-30 to -7):", periods_before, "\n")
cat(" Periods starting during harvest (-7 to +7):", periods_during, "\n")
cat(" Periods starting after harvest (+7 to +30):", periods_after, "\n")
}
}
cat("\n=== RECOMMENDED THRESHOLDS (DAILY DATA) ===\n")
ci_75th <- quantile(ci_at_harvest$mean_ci, 0.75, na.rm = TRUE)
ci_90th <- quantile(ci_at_harvest$mean_ci, 0.90, na.rm = TRUE)
cat("Based on actual harvest CI values:\n")
cat(" Conservative threshold (captures 75% of harvests): CI <", round(ci_75th, 2), "\n")
cat(" Aggressive threshold (captures 90% of harvests): CI <", round(ci_90th, 2), "\n\n")
# Calculate drop thresholds
if (!is.na(pre_harvest_ci) && !is.na(post_harvest_ci)) {
typical_drop <- pre_harvest_ci - post_harvest_ci
cat("Typical CI drop (7 days before to 7 days after):", round(typical_drop, 2), "\n")
cat("Suggested drop_threshold:", round(typical_drop * 0.5, 2), "(half of typical drop)\n\n")
}
cat("Suggested detection parameters for daily data:\n")
cat(" low_ci_threshold:", round(ci_75th, 1), "(75th percentile of harvest CI)\n")
cat(" drop_threshold:", round((pre_harvest_ci - post_harvest_ci) * 0.5, 1), "(half of typical drop)\n")
cat(" min_low_days: 7-10 (stay below threshold for this many days)\n")
cat(" recovery_threshold:", round(pre_harvest_ci, 1), "(pre-harvest CI level)\n")
}
# Show sample cases where detection failed
cat("\n\n=== SAMPLE HARVEST DATES WITH CI VALUES ===\n")
sample_harvests <- harvest_analysis %>%
filter(has_ci_data) %>%
arrange(mean_ci) %>%
select(field, season_end, actual_harvest_week, actual_harvest_year, mean_ci) %>%
head(15)
cat("15 harvests with LOWEST CI on harvest day:\n")
print(sample_harvests)
sample_high <- harvest_analysis %>%
filter(has_ci_data) %>%
arrange(desc(mean_ci)) %>%
select(field, season_end, actual_harvest_week, actual_harvest_year, mean_ci) %>%
head(10)
cat("\n10 harvests with HIGHEST CI on harvest day:\n")
print(sample_high)