# Compare detected harvest events with actual harvest data suppressPackageStartupMessages({ library(readxl) library(dplyr) library(tidyr) library(lubridate) library(terra) library(sf) library(here) }) # Set project directory project_dir <- "esa" assign("project_dir", project_dir, envir = .GlobalEnv) # Source required files cat("Loading project configuration and utilities...\n") source(here("r_app", "parameters_project.R")) # Load the harvest detection function from 20_generate_kpi_excel.R source(here("r_app", "20_generate_kpi_excel.R")) cat("Building time series from all weekly mosaics...\n") # Get current week/year for context (use most recent available data) current_date <- Sys.Date() current_week <- isoweek(current_date) current_year <- isoyear(current_date) # Build complete time series from all weekly mosaics time_series <- build_time_series_from_weekly_mosaics( weekly_mosaic_dir = weekly_CI_mosaic, field_boundaries_sf = field_boundaries_sf, current_week = current_week, current_year = current_year ) # Create week_year column for unique week identification time_series$week_year <- paste(time_series$year, time_series$week, sep = "-") # Add date column based on ISO week and year time_series$date <- parse_date_time(paste(time_series$year, time_series$week, 1, sep = "-"), orders = "Y-W-w") cat("Found", length(unique(time_series$week_year)), "weeks of data\n") cat("Date range:", as.character(min(time_series$date, na.rm = TRUE)), "to", as.character(max(time_series$date, na.rm = TRUE)), "\n") cat("\nDetecting harvest events...\n") # Detect harvest events from the complete time series # Key insight from temporal analysis: # - Harvest week (0): CI = 3.30 (still has some vegetation) # - Week +1: CI = 2.39 (dropping) # - Week +2: CI = 1.70 (bare field) # So we need to detect DROPS in CI, not just low absolute values # # Strategy: Look for periods where CI drops BELOW normal and stays low # - ci_threshold = 2.5: Captures the post-harvest bare period (weeks +1 to +4) # - consecutive_weeks = 3: Must stay bare for 3+ weeks (filters noise) # - recovery_threshold = 4.0: Field recovers when CI > 4 (back to normal growth) # - max_harvest_duration = 6: Harvest + bare period should be < 6 weeks harvest_detected <- detect_harvest_events( time_series_data = time_series, ci_threshold = 2.5, # Bare field threshold (post-harvest) consecutive_weeks = 3, # Must be bare for 3+ weeks recovery_threshold = 4.0, # Recovered = back to normal growth max_harvest_duration = 6 # Max 6 weeks bare ) %>% mutate( # Add detected_date as the end_date (when harvest was complete) detected_date = as.Date(end_date) ) cat("Detected", nrow(harvest_detected), "harvest events\n\n") # Read actual harvest data harvest_actual_all <- read_excel('laravel_app/storage/app/esa/Data/harvest.xlsx') %>% mutate( season_start = as.Date(season_start), season_end = as.Date(season_end) ) # Get list of fields from the GeoJSON (fields we have time series data for) fields_with_data <- unique(field_boundaries_sf$field) # Filter harvest data to only fields with time series data harvest_actual <- harvest_actual_all %>% filter(field %in% fields_with_data) %>% filter(!is.na(season_end)) %>% # Remove records without harvest date mutate( # Convert harvest date to week/year format for comparison actual_harvest_week = isoweek(season_end), actual_harvest_year = isoyear(season_end) ) # Display summary of actual harvest data cat("=== ACTUAL HARVEST DATA SUMMARY ===\n") cat("Total records in Excel:", nrow(harvest_actual_all), "\n") cat("Records for fields with time series (with harvest dates):", nrow(harvest_actual), "\n") cat("Unique fields:", n_distinct(harvest_actual$field), "\n") cat("Year range:", min(harvest_actual$year, na.rm = TRUE), "to", max(harvest_actual$year, na.rm = TRUE), "\n\n") cat("=== DETECTED HARVEST EVENTS ===\n") cat("Total events:", nrow(harvest_detected), "\n") harvest_by_year <- harvest_detected %>% group_by(harvest_year) %>% summarise(count = n()) %>% arrange(harvest_year) cat("By year:\n") print(harvest_by_year) cat("\n") cat("=== ACTUAL HARVEST EVENTS ===\n") cat("Total events:", nrow(harvest_actual), "\n") actual_by_year <- harvest_actual %>% group_by(year) %>% summarise(count = n()) %>% arrange(year) cat("By year:\n") print(actual_by_year) cat("\n") # ============================================================================== # COMPARISON: Match detected with actual by field, week, and year # ============================================================================== cat("\n=== MATCHING DETECTED WITH ACTUAL ===\n") harvest_actual2 <- harvest_actual %>% select(field, actual_week = actual_harvest_week, actual_year = actual_harvest_year) harvest_detected2 <- harvest_detected %>% select(field_id, detected_week = harvest_week, detected_year = harvest_year, duration_weeks, mean_ci_during) # Full outer join to see all detected AND all actual events # Match by field and year, then compare weeks comparison_full <- harvest_actual2 %>% full_join( harvest_detected2, by = c("field" = "field_id", "actual_year" = "detected_year") ) %>% mutate( week_difference = ifelse(!is.na(actual_week) & !is.na(detected_week), abs(actual_week - detected_week), NA), status = case_when( # Both exist - check if weeks are close (within 2 weeks) !is.na(actual_week) & !is.na(detected_week) & week_difference <= 2 ~ "✓ MATCHED", !is.na(actual_week) & !is.na(detected_week) & week_difference > 2 ~ paste0("⚠ MISMATCH (±", week_difference, "w)"), # Only detected exists (false positive) is.na(actual_week) & !is.na(detected_week) ~ "⚠ FALSE POSITIVE", # Only actual exists (missed) !is.na(actual_week) & is.na(detected_week) ~ "✗ MISSED", TRUE ~ "Unknown" ) ) %>% select(field, actual_year, actual_week, detected_week, week_difference, status, duration_weeks, mean_ci_during) %>% arrange(field, actual_year, actual_week) cat("\n=== COMPARISON TABLE ===\n") print(comparison_full, n = 100) cat("\n\n=== SUMMARY STATISTICS ===\n") matched <- sum(comparison_full$status == "✓ MATCHED", na.rm = TRUE) false_pos <- sum(comparison_full$status == "⚠ FALSE POSITIVE", na.rm = TRUE) missed <- sum(comparison_full$status == "✗ MISSED", na.rm = TRUE) mismatch <- sum(grepl("MISMATCH", comparison_full$status), na.rm = TRUE) cat("Total actual events:", nrow(harvest_actual), "\n") cat("Total detected events:", nrow(harvest_detected), "\n\n") cat("✓ MATCHED (±2 weeks):", matched, "\n") cat("⚠ WEEK MISMATCH (>2 weeks):", mismatch, "\n") cat("⚠ FALSE POSITIVES:", false_pos, "\n") cat("✗ MISSED:", missed, "\n\n") if (nrow(harvest_actual) > 0) { cat("Detection rate:", round(100 * (matched + mismatch) / nrow(harvest_actual), 1), "%\n") cat("Accuracy (within 2 weeks):", round(100 * matched / nrow(harvest_actual), 1), "%\n") }