# State-based harvest detection considering crop lifecycle # Detects: GROWING → MATURING → DECLINING → HARVEST → RECOVERING suppressPackageStartupMessages({ library(readxl) library(dplyr) library(tidyr) library(lubridate) library(terra) library(sf) library(here) }) # Set project directory project_dir <- "esa" assign("project_dir", project_dir, envir = .GlobalEnv) source(here("r_app", "parameters_project.R")) # Read pre-extracted CI data ci_rds_file <- here("laravel_app/storage/app", project_dir, "Data/extracted_ci/cumulative_vals/All_pivots_Cumulative_CI_quadrant_year_v2.rds") ci_data_raw <- readRDS(ci_rds_file) %>% ungroup() time_series_daily <- ci_data_raw %>% mutate( date = as.Date(Date), week = isoweek(date), year = isoyear(date) ) %>% select( field_id = field, date, week, year, mean_ci = FitData ) %>% filter(!is.na(mean_ci), !is.na(date), !is.na(field_id)) %>% arrange(field_id, date) cat("Loaded", nrow(time_series_daily), "daily observations\n\n") # ============================================================================== # STATE-BASED HARVEST DETECTION # ============================================================================== detect_harvest_stateful <- function(daily_ts, field_name, mature_ci = 3.5, # CI > this = mature crop harvest_ci = 2.5, # CI < this = harvest phase mature_window = 30, # Days to confirm mature state decline_rate = -0.02, # CI/day decline rate to detect pre-harvest harvest_min_days = 14, # Minimum days below harvest_ci (increased to delay detection) recovery_threshold = 3.0) { # CI rising above this = recovery field_ts <- daily_ts %>% filter(field_id == field_name) %>% arrange(date) %>% mutate( # Smoothing: 7-day rolling median to reduce noise ci_smooth = zoo::rollmedian(mean_ci, k = 7, fill = NA, align = "center"), ci_smooth = ifelse(is.na(ci_smooth), mean_ci, ci_smooth), # Trend: 14-day rolling slope (CI change rate) ci_trend = (ci_smooth - lag(ci_smooth, 14)) / 14, # Rolling statistics for context ci_mean_60d = zoo::rollmean(ci_smooth, k = 60, fill = NA, align = "right"), ci_max_60d = zoo::rollmax(ci_smooth, k = 60, fill = NA, align = "right") ) if (nrow(field_ts) < 100) { return(tibble( field_id = character(), harvest_date = as.Date(character()), harvest_week = numeric(), harvest_year = numeric(), state = character(), ci_at_harvest = numeric() )) } # State machine: track crop lifecycle states field_ts <- field_ts %>% mutate( # Define states based on CI level and trend is_mature = ci_smooth > mature_ci & ci_mean_60d > mature_ci, is_declining = ci_trend < decline_rate & !is.na(ci_trend), is_harvest = ci_smooth < harvest_ci, is_recovering = ci_smooth > recovery_threshold & ci_trend > 0.01 ) # Detect harvest events: MATURE phase → CI drops below threshold → declare harvest harvests <- tibble() i <- mature_window + 1 last_harvest_date <- as.Date("1900-01-01") consecutive_low_days <- 0 potential_harvest_start <- NA while (i <= nrow(field_ts)) { current_date <- field_ts$date[i] days_since_last_harvest <- as.numeric(current_date - last_harvest_date) # Only look for new harvest if enough time has passed (min 6 months) if (days_since_last_harvest > 180) { # Check if currently in low CI period if (field_ts$is_harvest[i]) { if (consecutive_low_days == 0) { # Start of new low period - check if came from mature state recent_was_mature <- any(field_ts$is_mature[(max(1,i-60)):(i-1)], na.rm = TRUE) if (recent_was_mature) { potential_harvest_start <- current_date consecutive_low_days <- 1 } } else { consecutive_low_days <- consecutive_low_days + 1 } # Declare harvest after consecutive low days threshold met if (consecutive_low_days == harvest_min_days) { harvests <- bind_rows(harvests, tibble( field_id = field_name, harvest_date = potential_harvest_start, harvest_week = isoweek(potential_harvest_start), harvest_year = isoyear(potential_harvest_start), state = "APPROACHING", # Stage 1: CI declining, harvest approaching alert_message = "⚠️ Field CI declining - harvest expected in 2-4 weeks", ci_at_harvest = field_ts$ci_smooth[field_ts$date == potential_harvest_start], low_days = consecutive_low_days )) last_harvest_date <- potential_harvest_start } } else { # CI rose above threshold - reset counter consecutive_low_days <- 0 potential_harvest_start <- NA } } i <- i + 1 } # ============================================================================ # STAGE 2: Detect harvest completion (CI stabilized at low level) # ============================================================================ # For each detected "APPROACHING" harvest, check if we can upgrade to "COMPLETED" if (nrow(harvests) > 0) { for (h in 1:nrow(harvests)) { if (harvests$state[h] == "APPROACHING") { approach_date <- harvests$harvest_date[h] # Look 7-21 days after approach detection for stabilization stable_window <- field_ts %>% filter(date >= approach_date + 7, date <= approach_date + 21) if (nrow(stable_window) >= 7) { # Calculate stability: low CI with low variability stable_window <- stable_window %>% mutate( ci_sd_7d = zoo::rollapply(ci_smooth, width = 7, FUN = sd, fill = NA, align = "center") ) # Check if CI is stable (SD < 0.3) and low (< 2.0) for at least 7 days stable_days <- stable_window %>% filter(!is.na(ci_sd_7d), ci_sd_7d < 0.3, ci_smooth < 2.0) %>% nrow() if (stable_days >= 7) { # Upgrade to COMPLETED harvests$state[h] <- "COMPLETED" harvests$alert_message[h] <- "✓ Harvest likely completed in recent days - CI stable at low level" } } } } } # Remove the low_days column before returning to match expected schema harvests <- harvests %>% select(-low_days, -alert_message) return(harvests) } cat("Running state-based harvest detection...\n") all_harvests <- lapply(unique(time_series_daily$field_id), function(field_name) { detect_harvest_stateful(daily_ts = time_series_daily, field_name) }) %>% bind_rows() cat("Detected", nrow(all_harvests), "harvest events\n") cat(" APPROACHING (CI declining):", sum(all_harvests$state == "APPROACHING"), "\n") cat(" COMPLETED (CI stable low):", sum(all_harvests$state == "COMPLETED"), "\n\n") # ============================================================================== # COMPARE WITH ACTUAL HARVEST DATA # ============================================================================== harvest_actual_all <- read_excel('laravel_app/storage/app/esa/Data/harvest.xlsx') %>% mutate( season_start = as.Date(season_start), season_end = as.Date(season_end) ) %>% filter(!is.na(season_end)) fields_with_data <- unique(field_boundaries_sf$field) harvest_actual <- harvest_actual_all %>% filter(field %in% fields_with_data) %>% filter(!is.na(season_end)) %>% mutate( actual_harvest_week = isoweek(season_end), actual_harvest_year = isoyear(season_end) ) cat("=== COMPARISON: STATE-BASED DETECTION vs ACTUAL ===\n\n") harvest_actual2 <- harvest_actual %>% select(field, actual_week = actual_harvest_week, actual_year = actual_harvest_year) harvest_detected2 <- all_harvests %>% select(field_id, detected_week = harvest_week, detected_year = harvest_year, state, ci_at_harvest) comparison <- harvest_actual2 %>% full_join( harvest_detected2, by = c("field" = "field_id", "actual_year" = "detected_year") ) %>% mutate( week_difference_signed = ifelse(!is.na(actual_week) & !is.na(detected_week), detected_week - actual_week, NA), # Negative = detected early week_difference = abs(week_difference_signed), status = case_when( !is.na(actual_week) & !is.na(detected_week) & week_difference <= 2 ~ "✓ MATCHED", !is.na(actual_week) & !is.na(detected_week) & week_difference > 2 ~ paste0("⚠ MISMATCH (", ifelse(week_difference_signed < 0, week_difference_signed, paste0("+", week_difference_signed)), "w)"), is.na(actual_week) & !is.na(detected_week) ~ "⚠ FALSE POSITIVE", !is.na(actual_week) & is.na(detected_week) ~ "✗ MISSED", TRUE ~ "Unknown" ) ) %>% select(field, actual_year, actual_week, detected_week, week_diff = week_difference_signed, status, state, ci_at_harvest) %>% filter(!is.na(actual_week)) %>% # Only compare against actual recorded harvests arrange(field, actual_year) cat("Filtered to only fields with recorded harvest dates\n") cat("(Removed rows where actual_week = NA)\n\n") print(comparison, n = 100) cat("\n\n=== SUMMARY STATISTICS (FILTERED DATA ONLY) ===\n") matched <- sum(comparison$status == "✓ MATCHED", na.rm = TRUE) false_pos <- sum(comparison$status == "⚠ FALSE POSITIVE", na.rm = TRUE) missed <- sum(comparison$status == "✗ MISSED", na.rm = TRUE) mismatch <- sum(grepl("MISMATCH", comparison$status), na.rm = TRUE) cat("Total actual harvest events (with records):", nrow(harvest_actual), "\n") cat("Total rows in filtered comparison:", nrow(comparison), "\n\n") cat("✓ MATCHED (±2 weeks):", matched, "\n") cat("⚠ WEEK MISMATCH (>2 weeks):", mismatch, "\n") cat("⚠ FALSE POSITIVES:", false_pos, "\n") cat("✗ MISSED:", missed, "\n\n") if (nrow(harvest_actual) > 0) { cat("Detection rate:", round(100 * (matched + mismatch) / nrow(harvest_actual), 1), "%\n") cat("Accuracy (within 2 weeks):", round(100 * matched / nrow(harvest_actual), 1), "%\n") } cat("\n\nDetection approach: STATE-BASED\n") cat("States: MATURE (CI>3.5) → DECLINING (slope<-0.02) → HARVEST (CI<2.5) → RECOVERY (CI rising)\n") cat("Natural duplicate prevention: Must be 6+ months since last harvest to enter new cycle\n") cat("Confirmation: Only counts as harvest if followed by recovery (CI rising)\n")