# Analyze CI values around actual harvest dates to tune detection parameters suppressPackageStartupMessages({ library(readxl) library(dplyr) library(tidyr) library(lubridate) library(terra) library(sf) library(here) library(ggplot2) }) # Set project directory project_dir <- "esa" assign("project_dir", project_dir, envir = .GlobalEnv) # Source required files cat("Loading project configuration...\n") source(here("r_app", "parameters_project.R")) # Read pre-extracted DAILY CI data from script 02 ci_rds_file <- here("laravel_app/storage/app", project_dir, "Data/extracted_ci/cumulative_vals/All_pivots_Cumulative_CI_quadrant_year_v2.rds") cat("Reading pre-extracted daily CI data from:\n") cat(" ", ci_rds_file, "\n") if (!file.exists(ci_rds_file)) { stop("CI data file not found: ", ci_rds_file) } ci_data_raw <- readRDS(ci_rds_file) %>% ungroup() cat("Loaded CI data with", nrow(ci_data_raw), "rows\n\n") # Transform to daily time series format cat("Converting to daily time series format...\n") time_series <- ci_data_raw %>% mutate( date = as.Date(Date), week = isoweek(date), year = isoyear(date) ) %>% select( field_id = field, date, week, year, mean_ci = FitData ) %>% filter(!is.na(mean_ci), !is.na(date), !is.na(field_id)) %>% arrange(field_id, date) cat("Daily time series ready:", nrow(time_series), "observations\n") cat("Fields:", n_distinct(time_series$field_id), "\n") cat("Date range:", as.character(min(time_series$date)), "to", as.character(max(time_series$date)), "\n\n") # Read actual harvest data harvest_actual_all <- read_excel('laravel_app/storage/app/esa/Data/harvest.xlsx') %>% mutate( season_start = as.Date(season_start), season_end = as.Date(season_end) ) %>% select(-age, -sub_area, -tonnage_ha, -sub_field) fields_with_data <- unique(field_boundaries_sf$field) harvest_actual <- harvest_actual_all %>% filter(field %in% fields_with_data) %>% filter(!is.na(season_end)) %>% mutate( actual_harvest_week = isoweek(season_end), actual_harvest_year = isoyear(season_end) ) cat("Analyzing CI values around actual harvest dates...\n\n") # For each actual harvest, find the NEAREST date in time series (within ±3 days) harvest_analysis <- harvest_actual %>% rowwise() %>% do({ h_field <- .$field h_date <- .$season_end h_week <- .$actual_harvest_week h_year <- .$actual_harvest_year # Find nearest date in time series for this field nearest_match <- time_series %>% filter(field_id == h_field) %>% mutate( date_diff = abs(as.numeric(date - h_date)) ) %>% filter(date_diff <= 3) %>% # Within 3 days arrange(date_diff) %>% head(1) if (nrow(nearest_match) > 0) { data.frame( field = h_field, season_end = h_date, actual_harvest_week = h_week, actual_harvest_year = h_year, matched_date = nearest_match$date, date_diff = nearest_match$date_diff, mean_ci = nearest_match$mean_ci, stringsAsFactors = FALSE ) } else { data.frame( field = h_field, season_end = h_date, actual_harvest_week = h_week, actual_harvest_year = h_year, matched_date = as.Date(NA), date_diff = NA, mean_ci = NA, stringsAsFactors = FALSE ) } }) %>% ungroup() %>% mutate(has_ci_data = !is.na(mean_ci)) # Summary statistics cat("=== CI VALUES AT ACTUAL HARVEST DATES ===\n") cat("Harvests with CI data:", sum(harvest_analysis$has_ci_data), "/", nrow(harvest_analysis), "\n\n") ci_at_harvest <- harvest_analysis %>% filter(has_ci_data) if (nrow(ci_at_harvest) > 0) { cat("CI Statistics at harvest:\n") cat(" Min:", round(min(ci_at_harvest$mean_ci, na.rm = TRUE), 2), "\n") cat(" Max:", round(max(ci_at_harvest$mean_ci, na.rm = TRUE), 2), "\n") cat(" Mean:", round(mean(ci_at_harvest$mean_ci, na.rm = TRUE), 2), "\n") cat(" Median:", round(median(ci_at_harvest$mean_ci, na.rm = TRUE), 2), "\n") cat(" Q25:", round(quantile(ci_at_harvest$mean_ci, 0.25, na.rm = TRUE), 2), "\n") cat(" Q75:", round(quantile(ci_at_harvest$mean_ci, 0.75, na.rm = TRUE), 2), "\n\n") cat("Distribution of CI at harvest:\n") cat(" CI < 1.0:", sum(ci_at_harvest$mean_ci < 1.0, na.rm = TRUE), "\n") cat(" CI < 1.5:", sum(ci_at_harvest$mean_ci < 1.5, na.rm = TRUE), "\n") cat(" CI < 2.0:", sum(ci_at_harvest$mean_ci < 2.0, na.rm = TRUE), "\n") cat(" CI < 2.5:", sum(ci_at_harvest$mean_ci < 2.5, na.rm = TRUE), "\n") cat(" CI < 3.0:", sum(ci_at_harvest$mean_ci < 3.0, na.rm = TRUE), "\n") cat(" CI >= 3.0:", sum(ci_at_harvest$mean_ci >= 3.0, na.rm = TRUE), "\n\n") } # Look at CI values in DAYS BEFORE and AFTER harvest cat("\n=== CI TEMPORAL PATTERN AROUND HARVEST (DAILY) ===\n") cat("Analyzing ±30 days around actual harvest dates...\n\n") # For each harvest, get CI values in surrounding days temporal_analysis <- harvest_actual %>% rowwise() %>% do({ field_name <- .$field harvest_date <- .$season_end # Get CI values for days around harvest field_ts <- time_series %>% filter(field_id == field_name, date >= (harvest_date - 30), date <= (harvest_date + 30)) %>% mutate( days_from_harvest = as.numeric(date - harvest_date), harvest_date_ref = harvest_date ) %>% select(field_id, date, days_from_harvest, mean_ci) field_ts }) %>% ungroup() if (nrow(temporal_analysis) > 0) { summary_by_offset <- temporal_analysis %>% group_by(days_from_harvest) %>% summarise( n = n(), mean_ci = mean(mean_ci, na.rm = TRUE), median_ci = median(mean_ci, na.rm = TRUE), min_ci = min(mean_ci, na.rm = TRUE), max_ci = max(mean_ci, na.rm = TRUE), sd_ci = sd(mean_ci, na.rm = TRUE), .groups = "drop" ) %>% arrange(days_from_harvest) cat("\nDaily CI pattern around harvest (±30 days):\n") print(summary_by_offset, n = 100) # Calculate CI drop from pre-harvest to post-harvest cat("\n=== CI DROP ANALYSIS ===\n") pre_harvest_ci <- summary_by_offset %>% filter(days_from_harvest >= -7, days_from_harvest <= -1) %>% summarise(mean_ci = mean(mean_ci, na.rm = TRUE)) %>% pull(mean_ci) harvest_day_ci <- summary_by_offset %>% filter(days_from_harvest == 0) %>% pull(mean_ci) post_harvest_ci <- summary_by_offset %>% filter(days_from_harvest >= 1, days_from_harvest <= 7) %>% summarise(mean_ci = mean(mean_ci, na.rm = TRUE)) %>% pull(mean_ci) cat("CI 7 days before harvest:", round(pre_harvest_ci, 2), "\n") cat("CI on harvest day:", round(harvest_day_ci, 2), "\n") cat("CI 7 days after harvest:", round(post_harvest_ci, 2), "\n") cat("Drop (pre to harvest day):", round(pre_harvest_ci - harvest_day_ci, 2), "\n") cat("Drop (harvest day to post):", round(harvest_day_ci - post_harvest_ci, 2), "\n") cat("Total drop (pre to post):", round(pre_harvest_ci - post_harvest_ci, 2), "\n\n") # Analyze when CI starts dropping cat("\n=== WHEN DOES CI DROP START? ===\n") baseline_ci <- summary_by_offset %>% filter(days_from_harvest >= -30, days_from_harvest <= -15) %>% summarise(mean_ci = mean(mean_ci, na.rm = TRUE)) %>% pull(mean_ci) cat("Baseline CI (days -30 to -15):", round(baseline_ci, 2), "\n") # Find when CI first drops significantly below baseline drop_start <- summary_by_offset %>% filter(days_from_harvest < 0) %>% mutate(drop_from_baseline = baseline_ci - mean_ci) %>% filter(drop_from_baseline > 0.3) %>% # Significant drop arrange(days_from_harvest) %>% head(1) if (nrow(drop_start) > 0) { cat("First significant drop detected at day:", drop_start$days_from_harvest, "(CI:", round(drop_start$mean_ci, 2), ", drop:", round(drop_start$drop_from_baseline, 2), ")\n") } # Find when CI reaches minimum min_ci_day <- summary_by_offset %>% filter(days_from_harvest >= -30, days_from_harvest <= 30) %>% arrange(mean_ci) %>% head(1) cat("Minimum CI reached at day:", min_ci_day$days_from_harvest, "(CI:", round(min_ci_day$mean_ci, 2), ")\n") # Find when CI starts recovering recovery_start <- summary_by_offset %>% filter(days_from_harvest > 0) %>% mutate(recovery_from_harvest = mean_ci - harvest_day_ci) %>% filter(recovery_from_harvest > 0.3) %>% # Significant recovery arrange(days_from_harvest) %>% head(1) if (nrow(recovery_start) > 0) { cat("Recovery detected at day:", recovery_start$days_from_harvest, "(CI:", round(recovery_start$mean_ci, 2), ", gain:", round(recovery_start$recovery_from_harvest, 2), ")\n") } # Analyze the ENTIRE harvest period (not just a single day) cat("\n=== MULTI-DAY HARVEST PERIOD ANALYSIS ===\n") cat("Harvest may span multiple days/weeks. Looking for extended low CI periods...\n\n") # Count consecutive days below different thresholds for (threshold in c(1.5, 2.0, 2.5, 3.0)) { consecutive_low <- temporal_analysis %>% arrange(field_id, date) %>% group_by(field_id) %>% mutate( is_low = mean_ci < threshold, day_diff = as.numeric(date - lag(date)), new_period = is.na(day_diff) | day_diff > 3 | !is_low, # Gap or not low period_id = cumsum(new_period) ) %>% filter(is_low) %>% group_by(field_id, period_id) %>% summarise( start_day = min(days_from_harvest), end_day = max(days_from_harvest), duration = n(), mean_ci_period = mean(mean_ci), .groups = "drop" ) %>% filter(duration >= 3) # At least 3 consecutive days if (nrow(consecutive_low) > 0) { cat("\nConsecutive periods with CI <", threshold, ":\n") cat(" Number of periods:", nrow(consecutive_low), "\n") cat(" Average duration:", round(mean(consecutive_low$duration), 1), "days\n") cat(" Median start day:", round(median(consecutive_low$start_day), 1), "\n") cat(" Median end day:", round(median(consecutive_low$end_day), 1), "\n") # Show distribution of when these periods start periods_before <- sum(consecutive_low$start_day < -7) periods_during <- sum(consecutive_low$start_day >= -7 & consecutive_low$start_day <= 7) periods_after <- sum(consecutive_low$start_day > 7) cat(" Periods starting before harvest (-30 to -7):", periods_before, "\n") cat(" Periods starting during harvest (-7 to +7):", periods_during, "\n") cat(" Periods starting after harvest (+7 to +30):", periods_after, "\n") } } cat("\n=== RECOMMENDED THRESHOLDS (DAILY DATA) ===\n") ci_75th <- quantile(ci_at_harvest$mean_ci, 0.75, na.rm = TRUE) ci_90th <- quantile(ci_at_harvest$mean_ci, 0.90, na.rm = TRUE) cat("Based on actual harvest CI values:\n") cat(" Conservative threshold (captures 75% of harvests): CI <", round(ci_75th, 2), "\n") cat(" Aggressive threshold (captures 90% of harvests): CI <", round(ci_90th, 2), "\n\n") # Calculate drop thresholds if (!is.na(pre_harvest_ci) && !is.na(post_harvest_ci)) { typical_drop <- pre_harvest_ci - post_harvest_ci cat("Typical CI drop (7 days before to 7 days after):", round(typical_drop, 2), "\n") cat("Suggested drop_threshold:", round(typical_drop * 0.5, 2), "(half of typical drop)\n\n") } cat("Suggested detection parameters for daily data:\n") cat(" low_ci_threshold:", round(ci_75th, 1), "(75th percentile of harvest CI)\n") cat(" drop_threshold:", round((pre_harvest_ci - post_harvest_ci) * 0.5, 1), "(half of typical drop)\n") cat(" min_low_days: 7-10 (stay below threshold for this many days)\n") cat(" recovery_threshold:", round(pre_harvest_ci, 1), "(pre-harvest CI level)\n") } # Show sample cases where detection failed cat("\n\n=== SAMPLE HARVEST DATES WITH CI VALUES ===\n") sample_harvests <- harvest_analysis %>% filter(has_ci_data) %>% arrange(mean_ci) %>% select(field, season_end, actual_harvest_week, actual_harvest_year, mean_ci) %>% head(15) cat("15 harvests with LOWEST CI on harvest day:\n") print(sample_harvests) sample_high <- harvest_analysis %>% filter(has_ci_data) %>% arrange(desc(mean_ci)) %>% select(field, season_end, actual_harvest_week, actual_harvest_year, mean_ci) %>% head(10) cat("\n10 harvests with HIGHEST CI on harvest day:\n") print(sample_high)