187 lines
7 KiB
R
187 lines
7 KiB
R
# Compare detected harvest events with actual harvest data
|
|
suppressPackageStartupMessages({
|
|
library(readxl)
|
|
library(dplyr)
|
|
library(tidyr)
|
|
library(lubridate)
|
|
library(terra)
|
|
library(sf)
|
|
library(here)
|
|
})
|
|
|
|
# Set project directory
|
|
project_dir <- "esa"
|
|
assign("project_dir", project_dir, envir = .GlobalEnv)
|
|
|
|
# Source required files
|
|
cat("Loading project configuration and utilities...\n")
|
|
source(here("r_app", "parameters_project.R"))
|
|
|
|
# Load the harvest detection function from 20_generate_kpi_excel.R
|
|
source(here("r_app", "20_generate_kpi_excel.R"))
|
|
|
|
cat("Building time series from all weekly mosaics...\n")
|
|
# Get current week/year for context (use most recent available data)
|
|
current_date <- Sys.Date()
|
|
current_week <- isoweek(current_date)
|
|
current_year <- isoyear(current_date)
|
|
|
|
# Build complete time series from all weekly mosaics
|
|
time_series <- build_time_series_from_weekly_mosaics(
|
|
weekly_mosaic_dir = weekly_CI_mosaic,
|
|
field_boundaries_sf = field_boundaries_sf,
|
|
current_week = current_week,
|
|
current_year = current_year
|
|
)
|
|
|
|
# Create week_year column for unique week identification
|
|
time_series$week_year <- paste(time_series$year, time_series$week, sep = "-")
|
|
|
|
# Add date column based on ISO week and year
|
|
time_series$date <- parse_date_time(paste(time_series$year, time_series$week, 1, sep = "-"), orders = "Y-W-w")
|
|
|
|
cat("Found", length(unique(time_series$week_year)), "weeks of data\n")
|
|
cat("Date range:", as.character(min(time_series$date, na.rm = TRUE)), "to", as.character(max(time_series$date, na.rm = TRUE)), "\n")
|
|
|
|
cat("\nDetecting harvest events...\n")
|
|
# Detect harvest events from the complete time series
|
|
# Key insight from temporal analysis:
|
|
# - Harvest week (0): CI = 3.30 (still has some vegetation)
|
|
# - Week +1: CI = 2.39 (dropping)
|
|
# - Week +2: CI = 1.70 (bare field)
|
|
# So we need to detect DROPS in CI, not just low absolute values
|
|
#
|
|
# Strategy: Look for periods where CI drops BELOW normal and stays low
|
|
# - ci_threshold = 2.5: Captures the post-harvest bare period (weeks +1 to +4)
|
|
# - consecutive_weeks = 3: Must stay bare for 3+ weeks (filters noise)
|
|
# - recovery_threshold = 4.0: Field recovers when CI > 4 (back to normal growth)
|
|
# - max_harvest_duration = 6: Harvest + bare period should be < 6 weeks
|
|
harvest_detected <- detect_harvest_events(
|
|
time_series_data = time_series,
|
|
ci_threshold = 2.5, # Bare field threshold (post-harvest)
|
|
consecutive_weeks = 3, # Must be bare for 3+ weeks
|
|
recovery_threshold = 4.0, # Recovered = back to normal growth
|
|
max_harvest_duration = 6 # Max 6 weeks bare
|
|
) %>%
|
|
mutate(
|
|
# Add detected_date as the end_date (when harvest was complete)
|
|
detected_date = as.Date(end_date)
|
|
)
|
|
|
|
cat("Detected", nrow(harvest_detected), "harvest events\n\n")
|
|
|
|
# Read actual harvest data
|
|
harvest_actual_all <- read_excel('laravel_app/storage/app/esa/Data/harvest.xlsx') %>%
|
|
mutate(
|
|
season_start = as.Date(season_start),
|
|
season_end = as.Date(season_end)
|
|
)
|
|
|
|
# Get list of fields from the GeoJSON (fields we have time series data for)
|
|
fields_with_data <- unique(field_boundaries_sf$field)
|
|
|
|
# Filter harvest data to only fields with time series data
|
|
harvest_actual <- harvest_actual_all %>%
|
|
filter(field %in% fields_with_data) %>%
|
|
filter(!is.na(season_end)) %>% # Remove records without harvest date
|
|
mutate(
|
|
# Convert harvest date to week/year format for comparison
|
|
actual_harvest_week = isoweek(season_end),
|
|
actual_harvest_year = isoyear(season_end)
|
|
)
|
|
|
|
# Display summary of actual harvest data
|
|
cat("=== ACTUAL HARVEST DATA SUMMARY ===\n")
|
|
cat("Total records in Excel:", nrow(harvest_actual_all), "\n")
|
|
cat("Records for fields with time series (with harvest dates):", nrow(harvest_actual), "\n")
|
|
cat("Unique fields:", n_distinct(harvest_actual$field), "\n")
|
|
cat("Year range:", min(harvest_actual$year, na.rm = TRUE), "to", max(harvest_actual$year, na.rm = TRUE), "\n\n")
|
|
|
|
cat("=== DETECTED HARVEST EVENTS ===\n")
|
|
cat("Total events:", nrow(harvest_detected), "\n")
|
|
harvest_by_year <- harvest_detected %>%
|
|
group_by(harvest_year) %>%
|
|
summarise(count = n()) %>%
|
|
arrange(harvest_year)
|
|
cat("By year:\n")
|
|
print(harvest_by_year)
|
|
cat("\n")
|
|
|
|
cat("=== ACTUAL HARVEST EVENTS ===\n")
|
|
cat("Total events:", nrow(harvest_actual), "\n")
|
|
actual_by_year <- harvest_actual %>%
|
|
group_by(year) %>%
|
|
summarise(count = n()) %>%
|
|
arrange(year)
|
|
cat("By year:\n")
|
|
print(actual_by_year)
|
|
cat("\n")
|
|
|
|
# ==============================================================================
|
|
# COMPARISON: Match detected with actual by field, week, and year
|
|
# ==============================================================================
|
|
|
|
cat("\n=== MATCHING DETECTED WITH ACTUAL ===\n")
|
|
|
|
harvest_actual2 <- harvest_actual %>%
|
|
select(field,
|
|
actual_week = actual_harvest_week,
|
|
actual_year = actual_harvest_year)
|
|
|
|
harvest_detected2 <- harvest_detected %>%
|
|
select(field_id,
|
|
detected_week = harvest_week,
|
|
detected_year = harvest_year,
|
|
duration_weeks,
|
|
mean_ci_during)
|
|
|
|
# Full outer join to see all detected AND all actual events
|
|
# Match by field and year, then compare weeks
|
|
comparison_full <- harvest_actual2 %>%
|
|
full_join(
|
|
harvest_detected2,
|
|
by = c("field" = "field_id", "actual_year" = "detected_year")
|
|
) %>%
|
|
mutate(
|
|
week_difference = ifelse(!is.na(actual_week) & !is.na(detected_week),
|
|
abs(actual_week - detected_week), NA),
|
|
status = case_when(
|
|
# Both exist - check if weeks are close (within 2 weeks)
|
|
!is.na(actual_week) & !is.na(detected_week) & week_difference <= 2 ~ "✓ MATCHED",
|
|
!is.na(actual_week) & !is.na(detected_week) & week_difference > 2 ~ paste0("⚠ MISMATCH (±", week_difference, "w)"),
|
|
# Only detected exists (false positive)
|
|
is.na(actual_week) & !is.na(detected_week) ~ "⚠ FALSE POSITIVE",
|
|
# Only actual exists (missed)
|
|
!is.na(actual_week) & is.na(detected_week) ~ "✗ MISSED",
|
|
TRUE ~ "Unknown"
|
|
)
|
|
) %>%
|
|
select(field, actual_year, actual_week, detected_week, week_difference,
|
|
status, duration_weeks, mean_ci_during) %>%
|
|
arrange(field, actual_year, actual_week)
|
|
|
|
cat("\n=== COMPARISON TABLE ===\n")
|
|
print(comparison_full, n = 100)
|
|
|
|
cat("\n\n=== SUMMARY STATISTICS ===\n")
|
|
matched <- sum(comparison_full$status == "✓ MATCHED", na.rm = TRUE)
|
|
false_pos <- sum(comparison_full$status == "⚠ FALSE POSITIVE", na.rm = TRUE)
|
|
missed <- sum(comparison_full$status == "✗ MISSED", na.rm = TRUE)
|
|
mismatch <- sum(grepl("MISMATCH", comparison_full$status), na.rm = TRUE)
|
|
|
|
cat("Total actual events:", nrow(harvest_actual), "\n")
|
|
cat("Total detected events:", nrow(harvest_detected), "\n\n")
|
|
|
|
cat("✓ MATCHED (±2 weeks):", matched, "\n")
|
|
cat("⚠ WEEK MISMATCH (>2 weeks):", mismatch, "\n")
|
|
cat("⚠ FALSE POSITIVES:", false_pos, "\n")
|
|
cat("✗ MISSED:", missed, "\n\n")
|
|
|
|
if (nrow(harvest_actual) > 0) {
|
|
cat("Detection rate:", round(100 * (matched + mismatch) / nrow(harvest_actual), 1), "%\n")
|
|
cat("Accuracy (within 2 weeks):", round(100 * matched / nrow(harvest_actual), 1), "%\n")
|
|
}
|
|
|
|
|
|
|