SmartCane/r_app/experiments/harvest_prediction/old/detect_harvest_daily.R
2026-01-06 14:17:37 +01:00

448 lines
16 KiB
R

# ============================================================================
# DAILY-SCALE HARVEST DETECTION AND VALIDATION
# Real-time detection using CI drop patterns
# ============================================================================
suppressPackageStartupMessages({
library(readxl)
library(dplyr)
library(tidyr)
library(lubridate)
library(here)
library(ggplot2)
})
# Set project directory
project_dir <- "esa"
assign("project_dir", project_dir, envir = .GlobalEnv)
# Navigate to project root if in experiments folder
if (basename(getwd()) == "harvest_prediction") {
setwd("../../..")
}
source(here("r_app", "parameters_project.R"))
# ============================================================================
# CONFIGURATION
# ============================================================================
CONFIG <- list(
min_field_age_days = 240, # 8 months minimum age
ci_threshold = 2.5, # Below this = potential harvest
drop_threshold_low = 1.0, # Minimum CI drop to flag
drop_threshold_high = 2.0, # Strong CI drop
lookback_days_min = 7, # Compare with 7-14 days ago
lookback_days_max = 14,
confirmation_days = 3, # Days below threshold for confirmation
test_window_days = 14 # Test ±14 days around actual harvest
)
cat("=== DAILY HARVEST DETECTION CONFIGURATION ===\n\n")
cat("Minimum field age:", CONFIG$min_field_age_days, "days (", round(CONFIG$min_field_age_days/30, 1), "months )\n")
cat("CI threshold:", CONFIG$ci_threshold, "\n")
cat("Drop thresholds:", CONFIG$drop_threshold_low, "and", CONFIG$drop_threshold_high, "\n")
cat("Lookback window:", CONFIG$lookback_days_min, "-", CONFIG$lookback_days_max, "days\n")
cat("Confirmation window:", CONFIG$confirmation_days, "consecutive days\n\n")
# ============================================================================
# LOAD DATA
# ============================================================================
cat("=== LOADING DATA ===\n\n")
# Load CI time series
ci_rds_file <- here("laravel_app/storage/app", project_dir, "Data/extracted_ci/cumulative_vals/All_pivots_Cumulative_CI_quadrant_year_v2.rds")
ci_data_raw <- readRDS(ci_rds_file) %>% ungroup()
time_series_daily <- ci_data_raw %>%
mutate(date = as.Date(Date)) %>%
select(field_id = field, date, mean_ci = FitData) %>%
filter(!is.na(mean_ci), !is.na(date), !is.na(field_id)) %>%
arrange(field_id, date)
# Load harvest data
harvest_data <- read_excel('laravel_app/storage/app/esa/Data/harvest.xlsx') %>%
mutate(
season_start = as.Date(season_start),
season_end = as.Date(season_end)
) %>%
filter(!is.na(season_end))
# Get fields with both CI and harvest data
fields_with_ci <- unique(time_series_daily$field_id)
harvest_data_filtered <- harvest_data %>%
filter(field %in% fields_with_ci) %>%
arrange(field, season_end)
cat("Fields with CI data:", length(fields_with_ci), "\n")
cat("Fields with harvest records:", length(unique(harvest_data_filtered$field)), "\n")
cat("Total harvest events:", nrow(harvest_data_filtered), "\n\n")
# ============================================================================
# DETECTION FUNCTION
# ============================================================================
detect_harvest_on_date <- function(field_ts, check_date, last_harvest_date, config = CONFIG) {
# Check if harvest can be detected on a specific date
#
# Args:
# field_ts: Daily CI time series for field
# check_date: Date to check for harvest detection
# last_harvest_date: Previous harvest date (to calculate field age)
# config: Detection parameters
#
# Returns:
# List with detection status and metrics
# Get CI value on check date
current_ci <- field_ts %>%
filter(date == check_date) %>%
pull(mean_ci)
if (length(current_ci) == 0) {
return(list(
detected = FALSE,
reason = "no_data",
current_ci = NA,
lookback_ci = NA,
ci_drop = NA,
field_age = NA,
consecutive_days_low = 0
))
}
# Check field age
field_age <- as.numeric(check_date - last_harvest_date)
if (field_age < config$min_field_age_days) {
return(list(
detected = FALSE,
reason = "too_young",
current_ci = current_ci,
lookback_ci = NA,
ci_drop = NA,
field_age = field_age,
consecutive_days_low = 0
))
}
# Get lookback period CI (mean of 7-14 days ago)
lookback_start <- check_date - config$lookback_days_max
lookback_end <- check_date - config$lookback_days_min
lookback_ci <- field_ts %>%
filter(date >= lookback_start, date <= lookback_end) %>%
summarise(mean_ci = mean(mean_ci, na.rm = TRUE)) %>%
pull(mean_ci)
if (is.na(lookback_ci) || length(lookback_ci) == 0) {
return(list(
detected = FALSE,
reason = "no_lookback_data",
current_ci = current_ci,
lookback_ci = NA,
ci_drop = NA,
field_age = field_age,
consecutive_days_low = 0
))
}
# Calculate CI drop
ci_drop <- lookback_ci - current_ci
# Check consecutive days below threshold
consecutive_days <- 0
for (i in 0:config$confirmation_days) {
test_date <- check_date - i
test_ci <- field_ts %>%
filter(date == test_date) %>%
pull(mean_ci)
if (length(test_ci) > 0 && test_ci < config$ci_threshold) {
consecutive_days <- consecutive_days + 1
} else {
break
}
}
# Detection logic
detected <- FALSE
confidence <- "none"
reason <- "no_trigger"
# Check conditions
below_threshold <- current_ci < config$ci_threshold
strong_drop <- ci_drop >= config$drop_threshold_high
moderate_drop <- ci_drop >= config$drop_threshold_low
if (below_threshold && strong_drop) {
detected <- TRUE
reason <- "strong_drop"
if (consecutive_days >= 3) {
confidence <- "confirmed"
} else if (consecutive_days >= 2) {
confidence <- "likely"
} else {
confidence <- "possible"
}
} else if (below_threshold && moderate_drop) {
if (consecutive_days >= 2) {
detected <- TRUE
reason <- "moderate_drop_confirmed"
confidence <- "likely"
} else {
detected <- TRUE
reason <- "moderate_drop"
confidence <- "possible"
}
} else if (below_threshold) {
reason <- "below_threshold_no_drop"
} else if (strong_drop) {
reason <- "strong_drop_above_threshold"
}
return(list(
detected = detected,
reason = reason,
confidence = confidence,
current_ci = current_ci,
lookback_ci = lookback_ci,
ci_drop = ci_drop,
field_age = field_age,
consecutive_days_low = consecutive_days,
below_threshold = below_threshold,
strong_drop = strong_drop
))
}
# ============================================================================
# VALIDATION: TEST AROUND KNOWN HARVEST DATES
# ============================================================================
cat("=== TESTING DETECTION AROUND KNOWN HARVEST DATES ===\n\n")
# Start with first field
test_field <- fields_with_ci[1]
cat("Testing field:", test_field, "\n\n")
# Get field's time series and harvests
field_ts <- time_series_daily %>%
filter(field_id == test_field)
field_harvests <- harvest_data_filtered %>%
filter(field == test_field) %>%
arrange(season_end)
cat("Field has", nrow(field_harvests), "recorded harvest events\n")
cat("CI observations:", nrow(field_ts), "days from", min(field_ts$date), "to", max(field_ts$date), "\n\n")
# Test each harvest event
validation_results <- list()
for (h in 1:nrow(field_harvests)) {
actual_harvest <- field_harvests$season_end[h]
# Get previous harvest for field age calculation
if (h == 1) {
last_harvest <- field_harvests$season_start[h] - 365 # Assume ~1 year before first record
} else {
last_harvest <- field_harvests$season_end[h-1]
}
cat("\n--- Harvest Event", h, ": ", as.character(actual_harvest), " ---\n")
cat("Field age at harvest:", round(as.numeric(actual_harvest - last_harvest)), "days\n\n")
# Test ±14 days around actual harvest
test_dates <- seq.Date(
from = actual_harvest - CONFIG$test_window_days,
to = actual_harvest + CONFIG$test_window_days,
by = "day"
)
# Run detection for each test date
daily_results <- data.frame()
for (i in 1:length(test_dates)) {
test_date <- test_dates[i]
result <- detect_harvest_on_date(field_ts, test_date, last_harvest)
daily_results <- rbind(daily_results, data.frame(
test_date = test_date,
days_from_actual = as.numeric(test_date - actual_harvest),
detected = result$detected,
confidence = ifelse(is.null(result$confidence) || length(result$confidence) == 0, "none", result$confidence),
reason = result$reason,
current_ci = result$current_ci,
lookback_ci = result$lookback_ci,
ci_drop = result$ci_drop,
consecutive_days = result$consecutive_days_low,
stringsAsFactors = FALSE
))
}
# Find first detection
first_detection <- daily_results %>%
filter(detected == TRUE) %>%
arrange(test_date) %>%
slice(1)
if (nrow(first_detection) > 0) {
cat("✓ First detection:", as.character(first_detection$test_date),
"(", first_detection$days_from_actual, "days from actual harvest )\n")
cat(" Confidence:", first_detection$confidence, "\n")
cat(" CI drop:", round(first_detection$ci_drop, 2), "\n\n")
} else {
cat("✗ No detection within test window\n\n")
}
# Print detailed daily table
cat("Day-by-Day Detection Results:\n")
cat(sprintf("%-12s | %-15s | %8s | %10s | %10s | %8s | %8s | %15s\n",
"Date", "Days from Actual", "Detected", "Confidence", "Drop (1.0)", "Drop (2.0)", "CI", "Reason"))
cat(paste(rep("-", 110), collapse = ""), "\n")
for (i in 1:nrow(daily_results)) {
row <- daily_results[i, ]
# Check both thresholds explicitly
drop_1 <- ifelse(!is.na(row$ci_drop) && row$ci_drop >= 1.0, "YES", "NO")
drop_2 <- ifelse(!is.na(row$ci_drop) && row$ci_drop >= 2.0, "YES", "NO")
cat(sprintf("%-12s | %+15d | %8s | %10s | %10s | %10s | %8.2f | %15s\n",
as.character(row$test_date),
row$days_from_actual,
ifelse(row$detected, "YES", "NO"),
row$confidence,
drop_1,
drop_2,
ifelse(is.na(row$current_ci), NA, row$current_ci),
substr(row$reason, 1, 15)))
}
cat("\n")
# Store results
validation_results[[h]] <- list(
harvest_id = h,
actual_date = actual_harvest,
daily_results = daily_results,
first_detection = first_detection
)
}
# ============================================================================
# SUMMARY STATISTICS
# ============================================================================
cat("\n\n=== VALIDATION SUMMARY ===\n\n")
# Create comprehensive summary showing WHEN detection happens
detection_timing_table <- data.frame()
for (i in 1:length(validation_results)) {
result <- validation_results[[i]]
actual_date <- result$actual_date
# Get detections in key time windows
daily <- result$daily_results
# Check specific days
day_minus_14 <- daily %>% filter(days_from_actual == -14) %>% pull(detected)
day_minus_7 <- daily %>% filter(days_from_actual == -7) %>% pull(detected)
day_minus_3 <- daily %>% filter(days_from_actual == -3) %>% pull(detected)
day_minus_1 <- daily %>% filter(days_from_actual == -1) %>% pull(detected)
day_0 <- daily %>% filter(days_from_actual == 0) %>% pull(detected)
day_plus_1 <- daily %>% filter(days_from_actual == 1) %>% pull(detected)
day_plus_3 <- daily %>% filter(days_from_actual == 3) %>% pull(detected)
day_plus_7 <- daily %>% filter(days_from_actual == 7) %>% pull(detected)
day_plus_14 <- daily %>% filter(days_from_actual == 14) %>% pull(detected)
# First detection info
first_det <- result$first_detection
detection_timing_table <- rbind(detection_timing_table, data.frame(
harvest_event = i,
actual_date = actual_date,
detected_minus_14d = ifelse(length(day_minus_14) > 0 && day_minus_14, "YES", "NO"),
detected_minus_7d = ifelse(length(day_minus_7) > 0 && day_minus_7, "YES", "NO"),
detected_minus_3d = ifelse(length(day_minus_3) > 0 && day_minus_3, "YES", "NO"),
detected_minus_1d = ifelse(length(day_minus_1) > 0 && day_minus_1, "YES", "NO"),
detected_day_0 = ifelse(length(day_0) > 0 && day_0, "YES", "NO"),
detected_plus_1d = ifelse(length(day_plus_1) > 0 && day_plus_1, "YES", "NO"),
detected_plus_3d = ifelse(length(day_plus_3) > 0 && day_plus_3, "YES", "NO"),
detected_plus_7d = ifelse(length(day_plus_7) > 0 && day_plus_7, "YES", "NO"),
detected_plus_14d = ifelse(length(day_plus_14) > 0 && day_plus_14, "YES", "NO"),
first_detection_day = ifelse(nrow(first_det) > 0, first_det$days_from_actual, NA),
stringsAsFactors = FALSE
))
}
cat("Detection Timing Table (when does system flag harvest?):\n\n")
print(detection_timing_table)
cat("\n\nAcceptable Detection Window Analysis:\n")
cat("(Harvest day ±1 day is realistic detection window)\n\n")
acceptable_window <- detection_timing_table %>%
mutate(
detected_in_window = detected_minus_1d == "YES" | detected_day_0 == "YES" | detected_plus_1d == "YES"
)
cat("Harvests detected within ±1 day of actual:", sum(acceptable_window$detected_in_window), "/", nrow(acceptable_window), "\n")
cat("Accuracy within realistic window:", round(100 * mean(acceptable_window$detected_in_window), 1), "%\n\n")
cat("False Early Detections (>3 days before harvest):\n")
early_false <- sum(detection_timing_table$detected_minus_7d == "YES" |
detection_timing_table$detected_minus_14d == "YES", na.rm = TRUE)
cat(" Count:", early_false, "\n")
cat(" These are likely detecting decline phase, not actual harvest\n\n")
# Original summary for comparison
detection_summary <- data.frame()
for (i in 1:length(validation_results)) {
result <- validation_results[[i]]
if (nrow(result$first_detection) > 0) {
detection_summary <- rbind(detection_summary, data.frame(
harvest_event = i,
actual_date = result$actual_date,
detected_date = result$first_detection$test_date,
days_offset = result$first_detection$days_from_actual,
confidence = result$first_detection$confidence,
ci_drop = result$first_detection$ci_drop
))
} else {
detection_summary <- rbind(detection_summary, data.frame(
harvest_event = i,
actual_date = result$actual_date,
detected_date = NA,
days_offset = NA,
confidence = "not_detected",
ci_drop = NA
))
}
}
print(detection_summary)
cat("\n\nDetection Performance:\n")
cat(" Total harvests tested:", nrow(detection_summary), "\n")
cat(" Successfully detected:", sum(!is.na(detection_summary$detected_date)), "\n")
cat(" Detection rate:", round(100 * sum(!is.na(detection_summary$detected_date)) / nrow(detection_summary), 1), "%\n\n")
if (sum(!is.na(detection_summary$days_offset)) > 0) {
cat("Detection Timing:\n")
cat(" Average days from actual:", round(mean(detection_summary$days_offset, na.rm = TRUE), 1), "\n")
cat(" Median days from actual:", round(median(detection_summary$days_offset, na.rm = TRUE), 1), "\n")
cat(" Detected before harvest:", sum(detection_summary$days_offset < 0, na.rm = TRUE), "\n")
cat(" Detected on harvest day:", sum(detection_summary$days_offset == 0, na.rm = TRUE), "\n")
cat(" Detected after harvest:", sum(detection_summary$days_offset > 0, na.rm = TRUE), "\n")
}
cat("\n=== TEST COMPLETE ===\n")
cat("\nNext steps:\n")
cat("1. Review detection timing and adjust thresholds if needed\n")
cat("2. Expand to all fields in dataset\n")
cat("3. Analyze which configuration gives best early detection\n")