SmartCane/r_app/experiments/harvest_prediction/detect_harvest_stateful.R
2026-01-06 14:17:37 +01:00

273 lines
10 KiB
R

# State-based harvest detection considering crop lifecycle
# Detects: GROWING → MATURING → DECLINING → HARVEST → RECOVERING
suppressPackageStartupMessages({
library(readxl)
library(dplyr)
library(tidyr)
library(lubridate)
library(terra)
library(sf)
library(here)
})
# Set project directory
project_dir <- "esa"
assign("project_dir", project_dir, envir = .GlobalEnv)
source(here("r_app", "parameters_project.R"))
# Read pre-extracted CI data
ci_rds_file <- here("laravel_app/storage/app", project_dir, "Data/extracted_ci/cumulative_vals/All_pivots_Cumulative_CI_quadrant_year_v2.rds")
ci_data_raw <- readRDS(ci_rds_file) %>% ungroup()
time_series_daily <- ci_data_raw %>%
mutate(
date = as.Date(Date),
week = isoweek(date),
year = isoyear(date)
) %>%
select(
field_id = field,
date,
week,
year,
mean_ci = FitData
) %>%
filter(!is.na(mean_ci), !is.na(date), !is.na(field_id)) %>%
arrange(field_id, date)
cat("Loaded", nrow(time_series_daily), "daily observations\n\n")
# ==============================================================================
# STATE-BASED HARVEST DETECTION
# ==============================================================================
detect_harvest_stateful <- function(daily_ts, field_name,
mature_ci = 3.5, # CI > this = mature crop
harvest_ci = 2.5, # CI < this = harvest phase
mature_window = 30, # Days to confirm mature state
decline_rate = -0.02, # CI/day decline rate to detect pre-harvest
harvest_min_days = 14, # Minimum days below harvest_ci (increased to delay detection)
recovery_threshold = 3.0) { # CI rising above this = recovery
field_ts <- daily_ts %>%
filter(field_id == field_name) %>%
arrange(date) %>%
mutate(
# Smoothing: 7-day rolling median to reduce noise
ci_smooth = zoo::rollmedian(mean_ci, k = 7, fill = NA, align = "center"),
ci_smooth = ifelse(is.na(ci_smooth), mean_ci, ci_smooth),
# Trend: 14-day rolling slope (CI change rate)
ci_trend = (ci_smooth - lag(ci_smooth, 14)) / 14,
# Rolling statistics for context
ci_mean_60d = zoo::rollmean(ci_smooth, k = 60, fill = NA, align = "right"),
ci_max_60d = zoo::rollmax(ci_smooth, k = 60, fill = NA, align = "right")
)
if (nrow(field_ts) < 100) {
return(tibble(
field_id = character(),
harvest_date = as.Date(character()),
harvest_week = numeric(),
harvest_year = numeric(),
state = character(),
ci_at_harvest = numeric()
))
}
# State machine: track crop lifecycle states
field_ts <- field_ts %>%
mutate(
# Define states based on CI level and trend
is_mature = ci_smooth > mature_ci & ci_mean_60d > mature_ci,
is_declining = ci_trend < decline_rate & !is.na(ci_trend),
is_harvest = ci_smooth < harvest_ci,
is_recovering = ci_smooth > recovery_threshold & ci_trend > 0.01
)
# Detect harvest events: MATURE phase → CI drops below threshold → declare harvest
harvests <- tibble()
i <- mature_window + 1
last_harvest_date <- as.Date("1900-01-01")
consecutive_low_days <- 0
potential_harvest_start <- NA
while (i <= nrow(field_ts)) {
current_date <- field_ts$date[i]
days_since_last_harvest <- as.numeric(current_date - last_harvest_date)
# Only look for new harvest if enough time has passed (min 6 months)
if (days_since_last_harvest > 180) {
# Check if currently in low CI period
if (field_ts$is_harvest[i]) {
if (consecutive_low_days == 0) {
# Start of new low period - check if came from mature state
recent_was_mature <- any(field_ts$is_mature[(max(1,i-60)):(i-1)], na.rm = TRUE)
if (recent_was_mature) {
potential_harvest_start <- current_date
consecutive_low_days <- 1
}
} else {
consecutive_low_days <- consecutive_low_days + 1
}
# Declare harvest after consecutive low days threshold met
if (consecutive_low_days == harvest_min_days) {
harvests <- bind_rows(harvests, tibble(
field_id = field_name,
harvest_date = potential_harvest_start,
harvest_week = isoweek(potential_harvest_start),
harvest_year = isoyear(potential_harvest_start),
state = "APPROACHING", # Stage 1: CI declining, harvest approaching
alert_message = "⚠️ Field CI declining - harvest expected in 2-4 weeks",
ci_at_harvest = field_ts$ci_smooth[field_ts$date == potential_harvest_start],
low_days = consecutive_low_days
))
last_harvest_date <- potential_harvest_start
}
} else {
# CI rose above threshold - reset counter
consecutive_low_days <- 0
potential_harvest_start <- NA
}
}
i <- i + 1
}
# ============================================================================
# STAGE 2: Detect harvest completion (CI stabilized at low level)
# ============================================================================
# For each detected "APPROACHING" harvest, check if we can upgrade to "COMPLETED"
if (nrow(harvests) > 0) {
for (h in 1:nrow(harvests)) {
if (harvests$state[h] == "APPROACHING") {
approach_date <- harvests$harvest_date[h]
# Look 7-21 days after approach detection for stabilization
stable_window <- field_ts %>%
filter(date >= approach_date + 7, date <= approach_date + 21)
if (nrow(stable_window) >= 7) {
# Calculate stability: low CI with low variability
stable_window <- stable_window %>%
mutate(
ci_sd_7d = zoo::rollapply(ci_smooth, width = 7, FUN = sd, fill = NA, align = "center")
)
# Check if CI is stable (SD < 0.3) and low (< 2.0) for at least 7 days
stable_days <- stable_window %>%
filter(!is.na(ci_sd_7d), ci_sd_7d < 0.3, ci_smooth < 2.0) %>%
nrow()
if (stable_days >= 7) {
# Upgrade to COMPLETED
harvests$state[h] <- "COMPLETED"
harvests$alert_message[h] <- "✓ Harvest likely completed in recent days - CI stable at low level"
}
}
}
}
}
# Remove the low_days column before returning to match expected schema
harvests <- harvests %>% select(-low_days, -alert_message)
return(harvests)
}
cat("Running state-based harvest detection...\n")
all_harvests <- lapply(unique(time_series_daily$field_id), function(field_name) {
detect_harvest_stateful(daily_ts = time_series_daily, field_name)
}) %>% bind_rows()
cat("Detected", nrow(all_harvests), "harvest events\n")
cat(" APPROACHING (CI declining):", sum(all_harvests$state == "APPROACHING"), "\n")
cat(" COMPLETED (CI stable low):", sum(all_harvests$state == "COMPLETED"), "\n\n")
# ==============================================================================
# COMPARE WITH ACTUAL HARVEST DATA
# ==============================================================================
harvest_actual_all <- read_excel('laravel_app/storage/app/esa/Data/harvest.xlsx') %>%
mutate(
season_start = as.Date(season_start),
season_end = as.Date(season_end)
) %>%
filter(!is.na(season_end))
fields_with_data <- unique(field_boundaries_sf$field)
harvest_actual <- harvest_actual_all %>%
filter(field %in% fields_with_data) %>%
filter(!is.na(season_end)) %>%
mutate(
actual_harvest_week = isoweek(season_end),
actual_harvest_year = isoyear(season_end)
)
cat("=== COMPARISON: STATE-BASED DETECTION vs ACTUAL ===\n\n")
harvest_actual2 <- harvest_actual %>%
select(field, actual_week = actual_harvest_week, actual_year = actual_harvest_year)
harvest_detected2 <- all_harvests %>%
select(field_id, detected_week = harvest_week, detected_year = harvest_year,
state, ci_at_harvest)
comparison <- harvest_actual2 %>%
full_join(
harvest_detected2,
by = c("field" = "field_id", "actual_year" = "detected_year")
) %>%
mutate(
week_difference_signed = ifelse(!is.na(actual_week) & !is.na(detected_week),
detected_week - actual_week, NA), # Negative = detected early
week_difference = abs(week_difference_signed),
status = case_when(
!is.na(actual_week) & !is.na(detected_week) & week_difference <= 2 ~ "✓ MATCHED",
!is.na(actual_week) & !is.na(detected_week) & week_difference > 2 ~ paste0("⚠ MISMATCH (", ifelse(week_difference_signed < 0, week_difference_signed, paste0("+", week_difference_signed)), "w)"),
is.na(actual_week) & !is.na(detected_week) ~ "⚠ FALSE POSITIVE",
!is.na(actual_week) & is.na(detected_week) ~ "✗ MISSED",
TRUE ~ "Unknown"
)
) %>%
select(field, actual_year, actual_week, detected_week, week_diff = week_difference_signed,
status, state, ci_at_harvest) %>%
filter(!is.na(actual_week)) %>% # Only compare against actual recorded harvests
arrange(field, actual_year)
cat("Filtered to only fields with recorded harvest dates\n")
cat("(Removed rows where actual_week = NA)\n\n")
print(comparison, n = 100)
cat("\n\n=== SUMMARY STATISTICS (FILTERED DATA ONLY) ===\n")
matched <- sum(comparison$status == "✓ MATCHED", na.rm = TRUE)
false_pos <- sum(comparison$status == "⚠ FALSE POSITIVE", na.rm = TRUE)
missed <- sum(comparison$status == "✗ MISSED", na.rm = TRUE)
mismatch <- sum(grepl("MISMATCH", comparison$status), na.rm = TRUE)
cat("Total actual harvest events (with records):", nrow(harvest_actual), "\n")
cat("Total rows in filtered comparison:", nrow(comparison), "\n\n")
cat("✓ MATCHED (±2 weeks):", matched, "\n")
cat("⚠ WEEK MISMATCH (>2 weeks):", mismatch, "\n")
cat("⚠ FALSE POSITIVES:", false_pos, "\n")
cat("✗ MISSED:", missed, "\n\n")
if (nrow(harvest_actual) > 0) {
cat("Detection rate:", round(100 * (matched + mismatch) / nrow(harvest_actual), 1), "%\n")
cat("Accuracy (within 2 weeks):", round(100 * matched / nrow(harvest_actual), 1), "%\n")
}
cat("\n\nDetection approach: STATE-BASED\n")
cat("States: MATURE (CI>3.5) → DECLINING (slope<-0.02) → HARVEST (CI<2.5) → RECOVERY (CI rising)\n")
cat("Natural duplicate prevention: Must be 6+ months since last harvest to enter new cycle\n")
cat("Confirmation: Only counts as harvest if followed by recovery (CI rising)\n")