SmartCane/r_app/experiments/harvest_prediction/old/compare_harvest_detection.R

# Compare detected harvest events with actual harvest data
suppressPackageStartupMessages({
  library(readxl)
  library(dplyr)
  library(tidyr)
  library(lubridate)
  library(terra)
  library(sf)
  library(here)
})

# Set project directory
project_dir <- "esa"
assign("project_dir", project_dir, envir = .GlobalEnv)

# Source required files
cat("Loading project configuration and utilities...\n")
source(here("r_app", "parameters_project.R"))

# Load the harvest detection function from 20_generate_kpi_excel.R
source(here("r_app", "20_generate_kpi_excel.R"))

cat("Building time series from all weekly mosaics...\n")
# Get current week/year for context (use most recent available data)
current_date <- Sys.Date()
current_week <- isoweek(current_date)
current_year <- isoyear(current_date)

# Build complete time series from all weekly mosaics
time_series <- build_time_series_from_weekly_mosaics(
  weekly_mosaic_dir = weekly_CI_mosaic,
  field_boundaries_sf = field_boundaries_sf,
  current_week = current_week,
  current_year = current_year
)

# Create week_year column for unique week identification
time_series$week_year <- paste(time_series$year, time_series$week, sep = "-")

# Add date column based on ISO week and year
time_series$date <- parse_date_time(paste(time_series$year, time_series$week, 1, sep = "-"), orders = "Y-W-w")

cat("Found", length(unique(time_series$week_year)), "weeks of data\n")
cat("Date range:", as.character(min(time_series$date, na.rm = TRUE)), "to", as.character(max(time_series$date, na.rm = TRUE)), "\n")

cat("\nDetecting harvest events...\n")
# Detect harvest events from the complete time series
# Key insight from temporal analysis:
# - Harvest week (0): CI = 3.30 (still has some vegetation)
# - Week +1: CI = 2.39 (dropping)
# - Week +2: CI = 1.70 (bare field)
# So we need to detect DROPS in CI, not just low absolute values
#
# Strategy: Look for periods where CI drops BELOW normal and stays low
# - ci_threshold = 2.5: Captures the post-harvest bare period (weeks +1 to +4)
# - consecutive_weeks = 3: Must stay bare for 3+ weeks (filters noise)
# - recovery_threshold = 4.0: Field recovers when CI > 4 (back to normal growth)
# - max_harvest_duration = 6: Harvest + bare period should be < 6 weeks
harvest_detected <- detect_harvest_events(
  time_series_data = time_series,
  ci_threshold = 2.5,           # Bare field threshold (post-harvest)
  consecutive_weeks = 3,         # Must be bare for 3+ weeks
  recovery_threshold = 4.0,      # Recovered = back to normal growth
  max_harvest_duration = 6       # Max 6 weeks bare
) %>%
  mutate(
    # Add detected_date as the end_date (when harvest was complete)
    detected_date = as.Date(end_date)
  )

cat("Detected", nrow(harvest_detected), "harvest events\n\n")

# Read actual harvest data
harvest_actual_all <- read_excel('laravel_app/storage/app/esa/Data/harvest.xlsx') %>%
  mutate(
    season_start = as.Date(season_start),
    season_end = as.Date(season_end)
  )

# Get list of fields from the GeoJSON (fields we have time series data for)
fields_with_data <- unique(field_boundaries_sf$field)

# Filter harvest data to only fields with time series data
harvest_actual <- harvest_actual_all %>%
  filter(field %in% fields_with_data) %>%
  filter(!is.na(season_end)) %>%  # Remove records without harvest date
  mutate(
    # Convert harvest date to week/year format for comparison
    actual_harvest_week = isoweek(season_end),
    actual_harvest_year = isoyear(season_end)
  )

# Display summary of actual harvest data
cat("=== ACTUAL HARVEST DATA SUMMARY ===\n")
cat("Total records in Excel:", nrow(harvest_actual_all), "\n")
cat("Records for fields with time series (with harvest dates):", nrow(harvest_actual), "\n")
cat("Unique fields:", n_distinct(harvest_actual$field), "\n")
cat("Year range:", min(harvest_actual$year, na.rm = TRUE), "to", max(harvest_actual$year, na.rm = TRUE), "\n\n")

cat("=== DETECTED HARVEST EVENTS ===\n")
cat("Total events:", nrow(harvest_detected), "\n")
harvest_by_year <- harvest_detected %>%
  group_by(harvest_year) %>%
  summarise(count = n()) %>%
  arrange(harvest_year)
cat("By year:\n")
print(harvest_by_year)
cat("\n")

cat("=== ACTUAL HARVEST EVENTS ===\n")
cat("Total events:", nrow(harvest_actual), "\n")
actual_by_year <- harvest_actual %>%
  group_by(year) %>%
  summarise(count = n()) %>%
  arrange(year)
cat("By year:\n")
print(actual_by_year)
cat("\n")

# ==============================================================================
# COMPARISON: Match detected with actual by field, week, and year
# ==============================================================================

cat("\n=== MATCHING DETECTED WITH ACTUAL ===\n")

harvest_actual2 <- harvest_actual %>%
      select(field,
             actual_week = actual_harvest_week,
             actual_year = actual_harvest_year)

harvest_detected2 <- harvest_detected %>%
  select(field_id,
         detected_week = harvest_week,
         detected_year = harvest_year,
         duration_weeks,
         mean_ci_during)

# Full outer join to see all detected AND all actual events
# Match by field and year, then compare weeks
comparison_full <- harvest_actual2 %>%
  full_join(
    harvest_detected2,
    by = c("field" = "field_id", "actual_year" = "detected_year")
  ) %>%
  mutate(
    week_difference = ifelse(!is.na(actual_week) & !is.na(detected_week),
                             abs(actual_week - detected_week), NA),
    status = case_when(
      # Both exist - check if weeks are close (within 2 weeks)
      !is.na(actual_week) & !is.na(detected_week) & week_difference <= 2 ~ "✓ MATCHED",
      !is.na(actual_week) & !is.na(detected_week) & week_difference > 2 ~ paste0("⚠ MISMATCH (±", week_difference, "w)"),
      # Only detected exists (false positive)
      is.na(actual_week) & !is.na(detected_week) ~ "⚠ FALSE POSITIVE",
      # Only actual exists (missed)
      !is.na(actual_week) & is.na(detected_week) ~ "✗ MISSED",
      TRUE ~ "Unknown"
    )
  ) %>%
  select(field, actual_year, actual_week, detected_week, week_difference,
         status, duration_weeks, mean_ci_during) %>%
  arrange(field, actual_year, actual_week)

cat("\n=== COMPARISON TABLE ===\n")
print(comparison_full, n = 100)

cat("\n\n=== SUMMARY STATISTICS ===\n")
matched <- sum(comparison_full$status == "✓ MATCHED", na.rm = TRUE)
false_pos <- sum(comparison_full$status == "⚠ FALSE POSITIVE", na.rm = TRUE)
missed <- sum(comparison_full$status == "✗ MISSED", na.rm = TRUE)
mismatch <- sum(grepl("MISMATCH", comparison_full$status), na.rm = TRUE)

cat("Total actual events:", nrow(harvest_actual), "\n")
cat("Total detected events:", nrow(harvest_detected), "\n\n")

cat("✓ MATCHED (±2 weeks):", matched, "\n")
cat("⚠ WEEK MISMATCH (>2 weeks):", mismatch, "\n")
cat("⚠ FALSE POSITIVES:", false_pos, "\n")
cat("✗ MISSED:", missed, "\n\n")

if (nrow(harvest_actual) > 0) {
  cat("Detection rate:", round(100 * (matched + mismatch) / nrow(harvest_actual), 1), "%\n")
  cat("Accuracy (within 2 weeks):", round(100 * matched / nrow(harvest_actual), 1), "%\n")
}