SmartCane/r_app/experiments/harvest_prediction/old/analyze_missed_harvests.R

# Analyze specific MISSED harvests to understand why detection failed
suppressPackageStartupMessages({
  library(readxl)
  library(dplyr)
  library(tidyr)
  library(lubridate)
  library(terra)
  library(sf)
  library(here)
})

# Set project directory
project_dir <- "esa"
assign("project_dir", project_dir, envir = .GlobalEnv)

# Source required files
cat("Loading project configuration...\n")
source(here("r_app", "parameters_project.R"))

# Read pre-extracted CI data
ci_rds_file <- here("laravel_app/storage/app", project_dir, "Data/extracted_ci/cumulative_vals/All_pivots_Cumulative_CI_quadrant_year_v2.rds")
ci_data_raw <- readRDS(ci_rds_file) %>% ungroup()

time_series <- ci_data_raw %>%
  mutate(
    date = as.Date(Date),
    week = isoweek(date),
    year = isoyear(date)
  ) %>%
  select(
    field_id = field,
    date,
    week,
    year,
    mean_ci = FitData
  ) %>%
  filter(!is.na(mean_ci), !is.na(date), !is.na(field_id)) %>%
  arrange(field_id, date)

# Read actual harvest data
harvest_actual_all <- read_excel('laravel_app/storage/app/esa/Data/harvest.xlsx') %>%
  mutate(
    season_start = as.Date(season_start),
    season_end = as.Date(season_end)
  ) %>%
  filter(!is.na(season_end))

fields_with_data <- unique(field_boundaries_sf$field)

harvest_actual <- harvest_actual_all %>%
  filter(field %in% fields_with_data) %>%
  filter(!is.na(season_end))

cat("=== ANALYZING MISSED HARVESTS ===\n\n")

# Fields that were missed in detection results (from previous output)
missed_cases <- c("00302", "00F25", "00F28", "00P81", "00P82", "00P83", "00P84", "KHWA", "KHWB", "KHWC", "LOMDA")

# Analyze each missed field's harvests
for (field_name in missed_cases[1:5]) {  # Analyze first 5 fields

  field_harvests <- harvest_actual %>%
    filter(field == field_name) %>%
    arrange(season_end)

  if (nrow(field_harvests) == 0) next

  cat("\n========================================\n")
  cat("FIELD:", field_name, "\n")
  cat("Total harvests:", nrow(field_harvests), "\n")
  cat("========================================\n\n")

  # Analyze each harvest for this field
  for (i in 1:min(3, nrow(field_harvests))) {  # First 3 harvests
    harvest_date <- field_harvests$season_end[i]
    harvest_week <- isoweek(harvest_date)
    harvest_year <- isoyear(harvest_date)

    cat("\n--- Harvest", i, "---\n")
    cat("Date:", as.character(harvest_date), "(Week", harvest_week, harvest_year, ")\n\n")

    # Get CI values around this harvest
    harvest_window <- time_series %>%
      filter(
        field_id == field_name,
        date >= (harvest_date - 30),
        date <= (harvest_date + 30)
      ) %>%
      mutate(
        days_from_harvest = as.numeric(date - harvest_date),
        ci_smooth = zoo::rollmean(mean_ci, k = 7, fill = NA, align = "center"),
        ci_lag7 = lag(ci_smooth, 7),
        ci_drop = ci_lag7 - ci_smooth,
        is_low_1.5 = mean_ci < 1.5,
        is_low_2.0 = mean_ci < 2.0,
        is_low_2.5 = mean_ci < 2.5,
        is_drop_0.3 = ci_drop > 0.3,
        is_drop_0.5 = ci_drop > 0.5
      )

    if (nrow(harvest_window) == 0) {
      cat("  NO DATA available for this harvest period\n")
      next
    }

    # Summary statistics
    cat("CI Summary (±30 days):\n")
    cat("  Min CI:", round(min(harvest_window$mean_ci, na.rm = TRUE), 2), "\n")
    cat("  Max CI:", round(max(harvest_window$mean_ci, na.rm = TRUE), 2), "\n")
    cat("  Mean CI:", round(mean(harvest_window$mean_ci, na.rm = TRUE), 2), "\n")

    # CI at/near harvest date
    near_harvest <- harvest_window %>%
      filter(abs(days_from_harvest) <= 3) %>%
      arrange(abs(days_from_harvest))

    if (nrow(near_harvest) > 0) {
      cat("  CI at harvest date (±3 days):", round(near_harvest$mean_ci[1], 2), "\n")
    }

    # Find minimum CI and when it occurred
    min_ci_row <- harvest_window %>%
      filter(mean_ci == min(mean_ci, na.rm = TRUE)) %>%
      head(1)

    cat("  Minimum CI:", round(min_ci_row$mean_ci, 2), "at day", min_ci_row$days_from_harvest, "\n\n")

    # Count days below different thresholds
    cat("Days with low CI:\n")
    cat("  CI < 1.5:", sum(harvest_window$is_low_1.5, na.rm = TRUE), "days\n")
    cat("  CI < 2.0:", sum(harvest_window$is_low_2.0, na.rm = TRUE), "days\n")
    cat("  CI < 2.5:", sum(harvest_window$is_low_2.5, na.rm = TRUE), "days\n\n")

    # Find longest consecutive period below threshold
    for (threshold in c(1.5, 2.0, 2.5)) {
      consecutive <- harvest_window %>%
        arrange(date) %>%
        mutate(
          is_low = mean_ci < threshold,
          day_diff = as.numeric(date - lag(date)),
          new_period = is.na(day_diff) | day_diff > 3 | !is_low,
          period_id = cumsum(new_period)
        ) %>%
        filter(is_low) %>%
        group_by(period_id) %>%
        summarise(
          start_day = min(days_from_harvest),
          end_day = max(days_from_harvest),
          duration = n(),
          mean_ci_period = mean(mean_ci),
          .groups = "drop"
        ) %>%
        arrange(desc(duration))

      if (nrow(consecutive) > 0) {
        longest <- consecutive[1, ]
        cat("Longest consecutive period (CI <", threshold, "):\n")
        cat("  Duration:", longest$duration, "days\n")
        cat("  Start day:", longest$start_day, ", End day:", longest$end_day, "\n")
        cat("  Mean CI:", round(longest$mean_ci_period, 2), "\n\n")
      }
    }

    # Show when significant drops occurred
    drops <- harvest_window %>%
      filter(!is.na(ci_drop), ci_drop > 0.3) %>%
      arrange(days_from_harvest)

    if (nrow(drops) > 0) {
      cat("Significant CI drops (>0.3) detected:\n")
      cat("  First drop at day:", drops$days_from_harvest[1], "(drop:", round(drops$ci_drop[1], 2), ")\n")
      if (nrow(drops) > 1) {
        cat("  Total drops detected:", nrow(drops), "\n")
      }
      cat("\n")
    } else {
      cat("No significant CI drops (>0.3) detected in this period\n\n")
    }

    # Show daily data around harvest
    cat("Daily CI values (days -7 to +21):\n")
    daily_view <- harvest_window %>%
      filter(days_from_harvest >= -7, days_from_harvest <= 21) %>%
      select(days_from_harvest, date, mean_ci, is_low_2.0) %>%
      arrange(days_from_harvest)

    print(daily_view, n = 100)
  }
}

cat("\n\n=== SUMMARY ===\n")
cat("Key observations:\n")
cat("1. Check if CI actually drops below 2.0 around harvest dates\n")
cat("2. Check when the minimum CI occurs (before, during, or after harvest)\n")
cat("3. Check duration of low CI periods\n")
cat("4. Identify timing offset between reported harvest date and actual low CI period\n")