SmartCane/r_app/experiments/harvest_prediction/old/compare_harvest_detection.R
2026-01-06 14:17:37 +01:00

187 lines
7 KiB
R

# Compare detected harvest events with actual harvest data
suppressPackageStartupMessages({
library(readxl)
library(dplyr)
library(tidyr)
library(lubridate)
library(terra)
library(sf)
library(here)
})
# Set project directory
project_dir <- "esa"
assign("project_dir", project_dir, envir = .GlobalEnv)
# Source required files
cat("Loading project configuration and utilities...\n")
source(here("r_app", "parameters_project.R"))
# Load the harvest detection function from 20_generate_kpi_excel.R
source(here("r_app", "20_generate_kpi_excel.R"))
cat("Building time series from all weekly mosaics...\n")
# Get current week/year for context (use most recent available data)
current_date <- Sys.Date()
current_week <- isoweek(current_date)
current_year <- isoyear(current_date)
# Build complete time series from all weekly mosaics
time_series <- build_time_series_from_weekly_mosaics(
weekly_mosaic_dir = weekly_CI_mosaic,
field_boundaries_sf = field_boundaries_sf,
current_week = current_week,
current_year = current_year
)
# Create week_year column for unique week identification
time_series$week_year <- paste(time_series$year, time_series$week, sep = "-")
# Add date column based on ISO week and year
time_series$date <- parse_date_time(paste(time_series$year, time_series$week, 1, sep = "-"), orders = "Y-W-w")
cat("Found", length(unique(time_series$week_year)), "weeks of data\n")
cat("Date range:", as.character(min(time_series$date, na.rm = TRUE)), "to", as.character(max(time_series$date, na.rm = TRUE)), "\n")
cat("\nDetecting harvest events...\n")
# Detect harvest events from the complete time series
# Key insight from temporal analysis:
# - Harvest week (0): CI = 3.30 (still has some vegetation)
# - Week +1: CI = 2.39 (dropping)
# - Week +2: CI = 1.70 (bare field)
# So we need to detect DROPS in CI, not just low absolute values
#
# Strategy: Look for periods where CI drops BELOW normal and stays low
# - ci_threshold = 2.5: Captures the post-harvest bare period (weeks +1 to +4)
# - consecutive_weeks = 3: Must stay bare for 3+ weeks (filters noise)
# - recovery_threshold = 4.0: Field recovers when CI > 4 (back to normal growth)
# - max_harvest_duration = 6: Harvest + bare period should be < 6 weeks
harvest_detected <- detect_harvest_events(
time_series_data = time_series,
ci_threshold = 2.5, # Bare field threshold (post-harvest)
consecutive_weeks = 3, # Must be bare for 3+ weeks
recovery_threshold = 4.0, # Recovered = back to normal growth
max_harvest_duration = 6 # Max 6 weeks bare
) %>%
mutate(
# Add detected_date as the end_date (when harvest was complete)
detected_date = as.Date(end_date)
)
cat("Detected", nrow(harvest_detected), "harvest events\n\n")
# Read actual harvest data
harvest_actual_all <- read_excel('laravel_app/storage/app/esa/Data/harvest.xlsx') %>%
mutate(
season_start = as.Date(season_start),
season_end = as.Date(season_end)
)
# Get list of fields from the GeoJSON (fields we have time series data for)
fields_with_data <- unique(field_boundaries_sf$field)
# Filter harvest data to only fields with time series data
harvest_actual <- harvest_actual_all %>%
filter(field %in% fields_with_data) %>%
filter(!is.na(season_end)) %>% # Remove records without harvest date
mutate(
# Convert harvest date to week/year format for comparison
actual_harvest_week = isoweek(season_end),
actual_harvest_year = isoyear(season_end)
)
# Display summary of actual harvest data
cat("=== ACTUAL HARVEST DATA SUMMARY ===\n")
cat("Total records in Excel:", nrow(harvest_actual_all), "\n")
cat("Records for fields with time series (with harvest dates):", nrow(harvest_actual), "\n")
cat("Unique fields:", n_distinct(harvest_actual$field), "\n")
cat("Year range:", min(harvest_actual$year, na.rm = TRUE), "to", max(harvest_actual$year, na.rm = TRUE), "\n\n")
cat("=== DETECTED HARVEST EVENTS ===\n")
cat("Total events:", nrow(harvest_detected), "\n")
harvest_by_year <- harvest_detected %>%
group_by(harvest_year) %>%
summarise(count = n()) %>%
arrange(harvest_year)
cat("By year:\n")
print(harvest_by_year)
cat("\n")
cat("=== ACTUAL HARVEST EVENTS ===\n")
cat("Total events:", nrow(harvest_actual), "\n")
actual_by_year <- harvest_actual %>%
group_by(year) %>%
summarise(count = n()) %>%
arrange(year)
cat("By year:\n")
print(actual_by_year)
cat("\n")
# ==============================================================================
# COMPARISON: Match detected with actual by field, week, and year
# ==============================================================================
cat("\n=== MATCHING DETECTED WITH ACTUAL ===\n")
harvest_actual2 <- harvest_actual %>%
select(field,
actual_week = actual_harvest_week,
actual_year = actual_harvest_year)
harvest_detected2 <- harvest_detected %>%
select(field_id,
detected_week = harvest_week,
detected_year = harvest_year,
duration_weeks,
mean_ci_during)
# Full outer join to see all detected AND all actual events
# Match by field and year, then compare weeks
comparison_full <- harvest_actual2 %>%
full_join(
harvest_detected2,
by = c("field" = "field_id", "actual_year" = "detected_year")
) %>%
mutate(
week_difference = ifelse(!is.na(actual_week) & !is.na(detected_week),
abs(actual_week - detected_week), NA),
status = case_when(
# Both exist - check if weeks are close (within 2 weeks)
!is.na(actual_week) & !is.na(detected_week) & week_difference <= 2 ~ "✓ MATCHED",
!is.na(actual_week) & !is.na(detected_week) & week_difference > 2 ~ paste0("⚠ MISMATCH (±", week_difference, "w)"),
# Only detected exists (false positive)
is.na(actual_week) & !is.na(detected_week) ~ "⚠ FALSE POSITIVE",
# Only actual exists (missed)
!is.na(actual_week) & is.na(detected_week) ~ "✗ MISSED",
TRUE ~ "Unknown"
)
) %>%
select(field, actual_year, actual_week, detected_week, week_difference,
status, duration_weeks, mean_ci_during) %>%
arrange(field, actual_year, actual_week)
cat("\n=== COMPARISON TABLE ===\n")
print(comparison_full, n = 100)
cat("\n\n=== SUMMARY STATISTICS ===\n")
matched <- sum(comparison_full$status == "✓ MATCHED", na.rm = TRUE)
false_pos <- sum(comparison_full$status == "⚠ FALSE POSITIVE", na.rm = TRUE)
missed <- sum(comparison_full$status == "✗ MISSED", na.rm = TRUE)
mismatch <- sum(grepl("MISMATCH", comparison_full$status), na.rm = TRUE)
cat("Total actual events:", nrow(harvest_actual), "\n")
cat("Total detected events:", nrow(harvest_detected), "\n\n")
cat("✓ MATCHED (±2 weeks):", matched, "\n")
cat("⚠ WEEK MISMATCH (>2 weeks):", mismatch, "\n")
cat("⚠ FALSE POSITIVES:", false_pos, "\n")
cat("✗ MISSED:", missed, "\n\n")
if (nrow(harvest_actual) > 0) {
cat("Detection rate:", round(100 * (matched + mismatch) / nrow(harvest_actual), 1), "%\n")
cat("Accuracy (within 2 weeks):", round(100 * matched / nrow(harvest_actual), 1), "%\n")
}