SmartCane/r_app/experiments/harvest_prediction/old/analyze_missed_harvests.R
2026-01-06 14:17:37 +01:00

197 lines
6.5 KiB
R

# Analyze specific MISSED harvests to understand why detection failed
suppressPackageStartupMessages({
library(readxl)
library(dplyr)
library(tidyr)
library(lubridate)
library(terra)
library(sf)
library(here)
})
# Set project directory
project_dir <- "esa"
assign("project_dir", project_dir, envir = .GlobalEnv)
# Source required files
cat("Loading project configuration...\n")
source(here("r_app", "parameters_project.R"))
# Read pre-extracted CI data
ci_rds_file <- here("laravel_app/storage/app", project_dir, "Data/extracted_ci/cumulative_vals/All_pivots_Cumulative_CI_quadrant_year_v2.rds")
ci_data_raw <- readRDS(ci_rds_file) %>% ungroup()
time_series <- ci_data_raw %>%
mutate(
date = as.Date(Date),
week = isoweek(date),
year = isoyear(date)
) %>%
select(
field_id = field,
date,
week,
year,
mean_ci = FitData
) %>%
filter(!is.na(mean_ci), !is.na(date), !is.na(field_id)) %>%
arrange(field_id, date)
# Read actual harvest data
harvest_actual_all <- read_excel('laravel_app/storage/app/esa/Data/harvest.xlsx') %>%
mutate(
season_start = as.Date(season_start),
season_end = as.Date(season_end)
) %>%
filter(!is.na(season_end))
fields_with_data <- unique(field_boundaries_sf$field)
harvest_actual <- harvest_actual_all %>%
filter(field %in% fields_with_data) %>%
filter(!is.na(season_end))
cat("=== ANALYZING MISSED HARVESTS ===\n\n")
# Fields that were missed in detection results (from previous output)
missed_cases <- c("00302", "00F25", "00F28", "00P81", "00P82", "00P83", "00P84", "KHWA", "KHWB", "KHWC", "LOMDA")
# Analyze each missed field's harvests
for (field_name in missed_cases[1:5]) { # Analyze first 5 fields
field_harvests <- harvest_actual %>%
filter(field == field_name) %>%
arrange(season_end)
if (nrow(field_harvests) == 0) next
cat("\n========================================\n")
cat("FIELD:", field_name, "\n")
cat("Total harvests:", nrow(field_harvests), "\n")
cat("========================================\n\n")
# Analyze each harvest for this field
for (i in 1:min(3, nrow(field_harvests))) { # First 3 harvests
harvest_date <- field_harvests$season_end[i]
harvest_week <- isoweek(harvest_date)
harvest_year <- isoyear(harvest_date)
cat("\n--- Harvest", i, "---\n")
cat("Date:", as.character(harvest_date), "(Week", harvest_week, harvest_year, ")\n\n")
# Get CI values around this harvest
harvest_window <- time_series %>%
filter(
field_id == field_name,
date >= (harvest_date - 30),
date <= (harvest_date + 30)
) %>%
mutate(
days_from_harvest = as.numeric(date - harvest_date),
ci_smooth = zoo::rollmean(mean_ci, k = 7, fill = NA, align = "center"),
ci_lag7 = lag(ci_smooth, 7),
ci_drop = ci_lag7 - ci_smooth,
is_low_1.5 = mean_ci < 1.5,
is_low_2.0 = mean_ci < 2.0,
is_low_2.5 = mean_ci < 2.5,
is_drop_0.3 = ci_drop > 0.3,
is_drop_0.5 = ci_drop > 0.5
)
if (nrow(harvest_window) == 0) {
cat(" NO DATA available for this harvest period\n")
next
}
# Summary statistics
cat("CI Summary (±30 days):\n")
cat(" Min CI:", round(min(harvest_window$mean_ci, na.rm = TRUE), 2), "\n")
cat(" Max CI:", round(max(harvest_window$mean_ci, na.rm = TRUE), 2), "\n")
cat(" Mean CI:", round(mean(harvest_window$mean_ci, na.rm = TRUE), 2), "\n")
# CI at/near harvest date
near_harvest <- harvest_window %>%
filter(abs(days_from_harvest) <= 3) %>%
arrange(abs(days_from_harvest))
if (nrow(near_harvest) > 0) {
cat(" CI at harvest date (±3 days):", round(near_harvest$mean_ci[1], 2), "\n")
}
# Find minimum CI and when it occurred
min_ci_row <- harvest_window %>%
filter(mean_ci == min(mean_ci, na.rm = TRUE)) %>%
head(1)
cat(" Minimum CI:", round(min_ci_row$mean_ci, 2), "at day", min_ci_row$days_from_harvest, "\n\n")
# Count days below different thresholds
cat("Days with low CI:\n")
cat(" CI < 1.5:", sum(harvest_window$is_low_1.5, na.rm = TRUE), "days\n")
cat(" CI < 2.0:", sum(harvest_window$is_low_2.0, na.rm = TRUE), "days\n")
cat(" CI < 2.5:", sum(harvest_window$is_low_2.5, na.rm = TRUE), "days\n\n")
# Find longest consecutive period below threshold
for (threshold in c(1.5, 2.0, 2.5)) {
consecutive <- harvest_window %>%
arrange(date) %>%
mutate(
is_low = mean_ci < threshold,
day_diff = as.numeric(date - lag(date)),
new_period = is.na(day_diff) | day_diff > 3 | !is_low,
period_id = cumsum(new_period)
) %>%
filter(is_low) %>%
group_by(period_id) %>%
summarise(
start_day = min(days_from_harvest),
end_day = max(days_from_harvest),
duration = n(),
mean_ci_period = mean(mean_ci),
.groups = "drop"
) %>%
arrange(desc(duration))
if (nrow(consecutive) > 0) {
longest <- consecutive[1, ]
cat("Longest consecutive period (CI <", threshold, "):\n")
cat(" Duration:", longest$duration, "days\n")
cat(" Start day:", longest$start_day, ", End day:", longest$end_day, "\n")
cat(" Mean CI:", round(longest$mean_ci_period, 2), "\n\n")
}
}
# Show when significant drops occurred
drops <- harvest_window %>%
filter(!is.na(ci_drop), ci_drop > 0.3) %>%
arrange(days_from_harvest)
if (nrow(drops) > 0) {
cat("Significant CI drops (>0.3) detected:\n")
cat(" First drop at day:", drops$days_from_harvest[1], "(drop:", round(drops$ci_drop[1], 2), ")\n")
if (nrow(drops) > 1) {
cat(" Total drops detected:", nrow(drops), "\n")
}
cat("\n")
} else {
cat("No significant CI drops (>0.3) detected in this period\n\n")
}
# Show daily data around harvest
cat("Daily CI values (days -7 to +21):\n")
daily_view <- harvest_window %>%
filter(days_from_harvest >= -7, days_from_harvest <= 21) %>%
select(days_from_harvest, date, mean_ci, is_low_2.0) %>%
arrange(days_from_harvest)
print(daily_view, n = 100)
}
}
cat("\n\n=== SUMMARY ===\n")
cat("Key observations:\n")
cat("1. Check if CI actually drops below 2.0 around harvest dates\n")
cat("2. Check when the minimum CI occurs (before, during, or after harvest)\n")
cat("3. Check duration of low CI periods\n")
cat("4. Identify timing offset between reported harvest date and actual low CI period\n")