SmartCane/r_app/experiments/harvest_prediction/old/analyze_harvest_ci.R

# Analyze CI values around actual harvest dates to tune detection parameters
suppressPackageStartupMessages({
  library(readxl)
  library(dplyr)
  library(tidyr)
  library(lubridate)
  library(terra)
  library(sf)
  library(here)
  library(ggplot2)
})

# Set project directory
project_dir <- "esa"
assign("project_dir", project_dir, envir = .GlobalEnv)

# Source required files
cat("Loading project configuration...\n")
source(here("r_app", "parameters_project.R"))

# Read pre-extracted DAILY CI data from script 02
ci_rds_file <- here("laravel_app/storage/app", project_dir, "Data/extracted_ci/cumulative_vals/All_pivots_Cumulative_CI_quadrant_year_v2.rds")

cat("Reading pre-extracted daily CI data from:\n")
cat("  ", ci_rds_file, "\n")

if (!file.exists(ci_rds_file)) {
  stop("CI data file not found: ", ci_rds_file)
}

ci_data_raw <- readRDS(ci_rds_file) %>% ungroup()

cat("Loaded CI data with", nrow(ci_data_raw), "rows\n\n")

# Transform to daily time series format
cat("Converting to daily time series format...\n")
time_series <- ci_data_raw %>%
  mutate(
    date = as.Date(Date),
    week = isoweek(date),
    year = isoyear(date)
  ) %>%
  select(
    field_id = field,
    date,
    week,
    year,
    mean_ci = FitData
  ) %>%
  filter(!is.na(mean_ci), !is.na(date), !is.na(field_id)) %>%
  arrange(field_id, date)

cat("Daily time series ready:", nrow(time_series), "observations\n")
cat("Fields:", n_distinct(time_series$field_id), "\n")
cat("Date range:", as.character(min(time_series$date)), "to", as.character(max(time_series$date)), "\n\n")

# Read actual harvest data
harvest_actual_all <- read_excel('laravel_app/storage/app/esa/Data/harvest.xlsx') %>%
  mutate(
    season_start = as.Date(season_start),
    season_end = as.Date(season_end)
  ) %>% select(-age, -sub_area, -tonnage_ha, -sub_field)

fields_with_data <- unique(field_boundaries_sf$field)

harvest_actual <- harvest_actual_all %>%
  filter(field %in% fields_with_data) %>%
  filter(!is.na(season_end)) %>%
  mutate(
    actual_harvest_week = isoweek(season_end),
    actual_harvest_year = isoyear(season_end)
  )

cat("Analyzing CI values around actual harvest dates...\n\n")

# For each actual harvest, find the NEAREST date in time series (within ±3 days)
harvest_analysis <- harvest_actual %>%
  rowwise() %>%
  do({
    h_field <- .$field
    h_date <- .$season_end
    h_week <- .$actual_harvest_week
    h_year <- .$actual_harvest_year

    # Find nearest date in time series for this field
    nearest_match <- time_series %>%
      filter(field_id == h_field) %>%
      mutate(
        date_diff = abs(as.numeric(date - h_date))
      ) %>%
      filter(date_diff <= 3) %>%  # Within 3 days
      arrange(date_diff) %>%
      head(1)

    if (nrow(nearest_match) > 0) {
      data.frame(
        field = h_field,
        season_end = h_date,
        actual_harvest_week = h_week,
        actual_harvest_year = h_year,
        matched_date = nearest_match$date,
        date_diff = nearest_match$date_diff,
        mean_ci = nearest_match$mean_ci,
        stringsAsFactors = FALSE
      )
    } else {
      data.frame(
        field = h_field,
        season_end = h_date,
        actual_harvest_week = h_week,
        actual_harvest_year = h_year,
        matched_date = as.Date(NA),
        date_diff = NA,
        mean_ci = NA,
        stringsAsFactors = FALSE
      )
    }
  }) %>%
  ungroup() %>%
  mutate(has_ci_data = !is.na(mean_ci))

# Summary statistics
cat("=== CI VALUES AT ACTUAL HARVEST DATES ===\n")
cat("Harvests with CI data:", sum(harvest_analysis$has_ci_data), "/", nrow(harvest_analysis), "\n\n")

ci_at_harvest <- harvest_analysis %>% filter(has_ci_data)

if (nrow(ci_at_harvest) > 0) {
  cat("CI Statistics at harvest:\n")
  cat("  Min:", round(min(ci_at_harvest$mean_ci, na.rm = TRUE), 2), "\n")
  cat("  Max:", round(max(ci_at_harvest$mean_ci, na.rm = TRUE), 2), "\n")
  cat("  Mean:", round(mean(ci_at_harvest$mean_ci, na.rm = TRUE), 2), "\n")
  cat("  Median:", round(median(ci_at_harvest$mean_ci, na.rm = TRUE), 2), "\n")
  cat("  Q25:", round(quantile(ci_at_harvest$mean_ci, 0.25, na.rm = TRUE), 2), "\n")
  cat("  Q75:", round(quantile(ci_at_harvest$mean_ci, 0.75, na.rm = TRUE), 2), "\n\n")

  cat("Distribution of CI at harvest:\n")
  cat("  CI < 1.0:", sum(ci_at_harvest$mean_ci < 1.0, na.rm = TRUE), "\n")
  cat("  CI < 1.5:", sum(ci_at_harvest$mean_ci < 1.5, na.rm = TRUE), "\n")
  cat("  CI < 2.0:", sum(ci_at_harvest$mean_ci < 2.0, na.rm = TRUE), "\n")
  cat("  CI < 2.5:", sum(ci_at_harvest$mean_ci < 2.5, na.rm = TRUE), "\n")
  cat("  CI < 3.0:", sum(ci_at_harvest$mean_ci < 3.0, na.rm = TRUE), "\n")
  cat("  CI >= 3.0:", sum(ci_at_harvest$mean_ci >= 3.0, na.rm = TRUE), "\n\n")
}

# Look at CI values in DAYS BEFORE and AFTER harvest
cat("\n=== CI TEMPORAL PATTERN AROUND HARVEST (DAILY) ===\n")
cat("Analyzing ±30 days around actual harvest dates...\n\n")

# For each harvest, get CI values in surrounding days
temporal_analysis <- harvest_actual %>%
  rowwise() %>%
  do({
    field_name <- .$field
    harvest_date <- .$season_end

    # Get CI values for days around harvest
    field_ts <- time_series %>%
      filter(field_id == field_name,
             date >= (harvest_date - 30),
             date <= (harvest_date + 30)) %>%
      mutate(
        days_from_harvest = as.numeric(date - harvest_date),
        harvest_date_ref = harvest_date
      ) %>%
      select(field_id, date, days_from_harvest, mean_ci)

    field_ts
  }) %>%
  ungroup()

if (nrow(temporal_analysis) > 0) {
  summary_by_offset <- temporal_analysis %>%
    group_by(days_from_harvest) %>%
    summarise(
      n = n(),
      mean_ci = mean(mean_ci, na.rm = TRUE),
      median_ci = median(mean_ci, na.rm = TRUE),
      min_ci = min(mean_ci, na.rm = TRUE),
      max_ci = max(mean_ci, na.rm = TRUE),
      sd_ci = sd(mean_ci, na.rm = TRUE),
      .groups = "drop"
    ) %>%
    arrange(days_from_harvest)

  cat("\nDaily CI pattern around harvest (±30 days):\n")
  print(summary_by_offset, n = 100)

  # Calculate CI drop from pre-harvest to post-harvest
  cat("\n=== CI DROP ANALYSIS ===\n")
  pre_harvest_ci <- summary_by_offset %>%
    filter(days_from_harvest >= -7, days_from_harvest <= -1) %>%
    summarise(mean_ci = mean(mean_ci, na.rm = TRUE)) %>%
    pull(mean_ci)

  harvest_day_ci <- summary_by_offset %>%
    filter(days_from_harvest == 0) %>%
    pull(mean_ci)

  post_harvest_ci <- summary_by_offset %>%
    filter(days_from_harvest >= 1, days_from_harvest <= 7) %>%
    summarise(mean_ci = mean(mean_ci, na.rm = TRUE)) %>%
    pull(mean_ci)

  cat("CI 7 days before harvest:", round(pre_harvest_ci, 2), "\n")
  cat("CI on harvest day:", round(harvest_day_ci, 2), "\n")
  cat("CI 7 days after harvest:", round(post_harvest_ci, 2), "\n")
  cat("Drop (pre to harvest day):", round(pre_harvest_ci - harvest_day_ci, 2), "\n")
  cat("Drop (harvest day to post):", round(harvest_day_ci - post_harvest_ci, 2), "\n")
  cat("Total drop (pre to post):", round(pre_harvest_ci - post_harvest_ci, 2), "\n\n")

  # Analyze when CI starts dropping
  cat("\n=== WHEN DOES CI DROP START? ===\n")
  baseline_ci <- summary_by_offset %>%
    filter(days_from_harvest >= -30, days_from_harvest <= -15) %>%
    summarise(mean_ci = mean(mean_ci, na.rm = TRUE)) %>%
    pull(mean_ci)

  cat("Baseline CI (days -30 to -15):", round(baseline_ci, 2), "\n")

  # Find when CI first drops significantly below baseline
  drop_start <- summary_by_offset %>%
    filter(days_from_harvest < 0) %>%
    mutate(drop_from_baseline = baseline_ci - mean_ci) %>%
    filter(drop_from_baseline > 0.3) %>%  # Significant drop
    arrange(days_from_harvest) %>%
    head(1)

  if (nrow(drop_start) > 0) {
    cat("First significant drop detected at day:", drop_start$days_from_harvest,
        "(CI:", round(drop_start$mean_ci, 2), ", drop:", round(drop_start$drop_from_baseline, 2), ")\n")
  }

  # Find when CI reaches minimum
  min_ci_day <- summary_by_offset %>%
    filter(days_from_harvest >= -30, days_from_harvest <= 30) %>%
    arrange(mean_ci) %>%
    head(1)

  cat("Minimum CI reached at day:", min_ci_day$days_from_harvest,
      "(CI:", round(min_ci_day$mean_ci, 2), ")\n")

  # Find when CI starts recovering
  recovery_start <- summary_by_offset %>%
    filter(days_from_harvest > 0) %>%
    mutate(recovery_from_harvest = mean_ci - harvest_day_ci) %>%
    filter(recovery_from_harvest > 0.3) %>%  # Significant recovery
    arrange(days_from_harvest) %>%
    head(1)

  if (nrow(recovery_start) > 0) {
    cat("Recovery detected at day:", recovery_start$days_from_harvest,
        "(CI:", round(recovery_start$mean_ci, 2), ", gain:", round(recovery_start$recovery_from_harvest, 2), ")\n")
  }

  # Analyze the ENTIRE harvest period (not just a single day)
  cat("\n=== MULTI-DAY HARVEST PERIOD ANALYSIS ===\n")
  cat("Harvest may span multiple days/weeks. Looking for extended low CI periods...\n\n")

  # Count consecutive days below different thresholds
  for (threshold in c(1.5, 2.0, 2.5, 3.0)) {
    consecutive_low <- temporal_analysis %>%
      arrange(field_id, date) %>%
      group_by(field_id) %>%
      mutate(
        is_low = mean_ci < threshold,
        day_diff = as.numeric(date - lag(date)),
        new_period = is.na(day_diff) | day_diff > 3 | !is_low,  # Gap or not low
        period_id = cumsum(new_period)
      ) %>%
      filter(is_low) %>%
      group_by(field_id, period_id) %>%
      summarise(
        start_day = min(days_from_harvest),
        end_day = max(days_from_harvest),
        duration = n(),
        mean_ci_period = mean(mean_ci),
        .groups = "drop"
      ) %>%
      filter(duration >= 3)  # At least 3 consecutive days

    if (nrow(consecutive_low) > 0) {
      cat("\nConsecutive periods with CI <", threshold, ":\n")
      cat("  Number of periods:", nrow(consecutive_low), "\n")
      cat("  Average duration:", round(mean(consecutive_low$duration), 1), "days\n")
      cat("  Median start day:", round(median(consecutive_low$start_day), 1), "\n")
      cat("  Median end day:", round(median(consecutive_low$end_day), 1), "\n")

      # Show distribution of when these periods start
      periods_before <- sum(consecutive_low$start_day < -7)
      periods_during <- sum(consecutive_low$start_day >= -7 & consecutive_low$start_day <= 7)
      periods_after <- sum(consecutive_low$start_day > 7)

      cat("  Periods starting before harvest (-30 to -7):", periods_before, "\n")
      cat("  Periods starting during harvest (-7 to +7):", periods_during, "\n")
      cat("  Periods starting after harvest (+7 to +30):", periods_after, "\n")
    }
  }

  cat("\n=== RECOMMENDED THRESHOLDS (DAILY DATA) ===\n")
  ci_75th <- quantile(ci_at_harvest$mean_ci, 0.75, na.rm = TRUE)
  ci_90th <- quantile(ci_at_harvest$mean_ci, 0.90, na.rm = TRUE)

  cat("Based on actual harvest CI values:\n")
  cat("  Conservative threshold (captures 75% of harvests): CI <", round(ci_75th, 2), "\n")
  cat("  Aggressive threshold (captures 90% of harvests): CI <", round(ci_90th, 2), "\n\n")

  # Calculate drop thresholds
  if (!is.na(pre_harvest_ci) && !is.na(post_harvest_ci)) {
    typical_drop <- pre_harvest_ci - post_harvest_ci
    cat("Typical CI drop (7 days before to 7 days after):", round(typical_drop, 2), "\n")
    cat("Suggested drop_threshold:", round(typical_drop * 0.5, 2), "(half of typical drop)\n\n")
  }

  cat("Suggested detection parameters for daily data:\n")
  cat("  low_ci_threshold:", round(ci_75th, 1), "(75th percentile of harvest CI)\n")
  cat("  drop_threshold:", round((pre_harvest_ci - post_harvest_ci) * 0.5, 1), "(half of typical drop)\n")
  cat("  min_low_days: 7-10 (stay below threshold for this many days)\n")
  cat("  recovery_threshold:", round(pre_harvest_ci, 1), "(pre-harvest CI level)\n")
}

# Show sample cases where detection failed
cat("\n\n=== SAMPLE HARVEST DATES WITH CI VALUES ===\n")
sample_harvests <- harvest_analysis %>%
  filter(has_ci_data) %>%
  arrange(mean_ci) %>%
  select(field, season_end, actual_harvest_week, actual_harvest_year, mean_ci) %>%
  head(15)

cat("15 harvests with LOWEST CI on harvest day:\n")
print(sample_harvests)

sample_high <- harvest_analysis %>%
  filter(has_ci_data) %>%
  arrange(desc(mean_ci)) %>%
  select(field, season_end, actual_harvest_week, actual_harvest_year, mean_ci) %>%
  head(10)

cat("\n10 harvests with HIGHEST CI on harvest day:\n")
print(sample_high)