448 lines
16 KiB
R
448 lines
16 KiB
R
# ============================================================================
|
|
# DAILY-SCALE HARVEST DETECTION AND VALIDATION
|
|
# Real-time detection using CI drop patterns
|
|
# ============================================================================
|
|
|
|
suppressPackageStartupMessages({
|
|
library(readxl)
|
|
library(dplyr)
|
|
library(tidyr)
|
|
library(lubridate)
|
|
library(here)
|
|
library(ggplot2)
|
|
})
|
|
|
|
# Set project directory
|
|
project_dir <- "esa"
|
|
assign("project_dir", project_dir, envir = .GlobalEnv)
|
|
|
|
# Navigate to project root if in experiments folder
|
|
if (basename(getwd()) == "harvest_prediction") {
|
|
setwd("../../..")
|
|
}
|
|
|
|
source(here("r_app", "parameters_project.R"))
|
|
|
|
# ============================================================================
|
|
# CONFIGURATION
|
|
# ============================================================================
|
|
|
|
CONFIG <- list(
|
|
min_field_age_days = 240, # 8 months minimum age
|
|
ci_threshold = 2.5, # Below this = potential harvest
|
|
drop_threshold_low = 1.0, # Minimum CI drop to flag
|
|
drop_threshold_high = 2.0, # Strong CI drop
|
|
lookback_days_min = 7, # Compare with 7-14 days ago
|
|
lookback_days_max = 14,
|
|
confirmation_days = 3, # Days below threshold for confirmation
|
|
test_window_days = 14 # Test ±14 days around actual harvest
|
|
)
|
|
|
|
cat("=== DAILY HARVEST DETECTION CONFIGURATION ===\n\n")
|
|
cat("Minimum field age:", CONFIG$min_field_age_days, "days (", round(CONFIG$min_field_age_days/30, 1), "months )\n")
|
|
cat("CI threshold:", CONFIG$ci_threshold, "\n")
|
|
cat("Drop thresholds:", CONFIG$drop_threshold_low, "and", CONFIG$drop_threshold_high, "\n")
|
|
cat("Lookback window:", CONFIG$lookback_days_min, "-", CONFIG$lookback_days_max, "days\n")
|
|
cat("Confirmation window:", CONFIG$confirmation_days, "consecutive days\n\n")
|
|
|
|
# ============================================================================
|
|
# LOAD DATA
|
|
# ============================================================================
|
|
|
|
cat("=== LOADING DATA ===\n\n")
|
|
|
|
# Load CI time series
|
|
ci_rds_file <- here("laravel_app/storage/app", project_dir, "Data/extracted_ci/cumulative_vals/All_pivots_Cumulative_CI_quadrant_year_v2.rds")
|
|
ci_data_raw <- readRDS(ci_rds_file) %>% ungroup()
|
|
|
|
time_series_daily <- ci_data_raw %>%
|
|
mutate(date = as.Date(Date)) %>%
|
|
select(field_id = field, date, mean_ci = FitData) %>%
|
|
filter(!is.na(mean_ci), !is.na(date), !is.na(field_id)) %>%
|
|
arrange(field_id, date)
|
|
|
|
# Load harvest data
|
|
harvest_data <- read_excel('laravel_app/storage/app/esa/Data/harvest.xlsx') %>%
|
|
mutate(
|
|
season_start = as.Date(season_start),
|
|
season_end = as.Date(season_end)
|
|
) %>%
|
|
filter(!is.na(season_end))
|
|
|
|
# Get fields with both CI and harvest data
|
|
fields_with_ci <- unique(time_series_daily$field_id)
|
|
harvest_data_filtered <- harvest_data %>%
|
|
filter(field %in% fields_with_ci) %>%
|
|
arrange(field, season_end)
|
|
|
|
cat("Fields with CI data:", length(fields_with_ci), "\n")
|
|
cat("Fields with harvest records:", length(unique(harvest_data_filtered$field)), "\n")
|
|
cat("Total harvest events:", nrow(harvest_data_filtered), "\n\n")
|
|
|
|
# ============================================================================
|
|
# DETECTION FUNCTION
|
|
# ============================================================================
|
|
|
|
detect_harvest_on_date <- function(field_ts, check_date, last_harvest_date, config = CONFIG) {
|
|
# Check if harvest can be detected on a specific date
|
|
#
|
|
# Args:
|
|
# field_ts: Daily CI time series for field
|
|
# check_date: Date to check for harvest detection
|
|
# last_harvest_date: Previous harvest date (to calculate field age)
|
|
# config: Detection parameters
|
|
#
|
|
# Returns:
|
|
# List with detection status and metrics
|
|
|
|
# Get CI value on check date
|
|
current_ci <- field_ts %>%
|
|
filter(date == check_date) %>%
|
|
pull(mean_ci)
|
|
|
|
if (length(current_ci) == 0) {
|
|
return(list(
|
|
detected = FALSE,
|
|
reason = "no_data",
|
|
current_ci = NA,
|
|
lookback_ci = NA,
|
|
ci_drop = NA,
|
|
field_age = NA,
|
|
consecutive_days_low = 0
|
|
))
|
|
}
|
|
|
|
# Check field age
|
|
field_age <- as.numeric(check_date - last_harvest_date)
|
|
|
|
if (field_age < config$min_field_age_days) {
|
|
return(list(
|
|
detected = FALSE,
|
|
reason = "too_young",
|
|
current_ci = current_ci,
|
|
lookback_ci = NA,
|
|
ci_drop = NA,
|
|
field_age = field_age,
|
|
consecutive_days_low = 0
|
|
))
|
|
}
|
|
|
|
# Get lookback period CI (mean of 7-14 days ago)
|
|
lookback_start <- check_date - config$lookback_days_max
|
|
lookback_end <- check_date - config$lookback_days_min
|
|
|
|
lookback_ci <- field_ts %>%
|
|
filter(date >= lookback_start, date <= lookback_end) %>%
|
|
summarise(mean_ci = mean(mean_ci, na.rm = TRUE)) %>%
|
|
pull(mean_ci)
|
|
|
|
if (is.na(lookback_ci) || length(lookback_ci) == 0) {
|
|
return(list(
|
|
detected = FALSE,
|
|
reason = "no_lookback_data",
|
|
current_ci = current_ci,
|
|
lookback_ci = NA,
|
|
ci_drop = NA,
|
|
field_age = field_age,
|
|
consecutive_days_low = 0
|
|
))
|
|
}
|
|
|
|
# Calculate CI drop
|
|
ci_drop <- lookback_ci - current_ci
|
|
|
|
# Check consecutive days below threshold
|
|
consecutive_days <- 0
|
|
for (i in 0:config$confirmation_days) {
|
|
test_date <- check_date - i
|
|
test_ci <- field_ts %>%
|
|
filter(date == test_date) %>%
|
|
pull(mean_ci)
|
|
|
|
if (length(test_ci) > 0 && test_ci < config$ci_threshold) {
|
|
consecutive_days <- consecutive_days + 1
|
|
} else {
|
|
break
|
|
}
|
|
}
|
|
|
|
# Detection logic
|
|
detected <- FALSE
|
|
confidence <- "none"
|
|
reason <- "no_trigger"
|
|
|
|
# Check conditions
|
|
below_threshold <- current_ci < config$ci_threshold
|
|
strong_drop <- ci_drop >= config$drop_threshold_high
|
|
moderate_drop <- ci_drop >= config$drop_threshold_low
|
|
|
|
if (below_threshold && strong_drop) {
|
|
detected <- TRUE
|
|
reason <- "strong_drop"
|
|
if (consecutive_days >= 3) {
|
|
confidence <- "confirmed"
|
|
} else if (consecutive_days >= 2) {
|
|
confidence <- "likely"
|
|
} else {
|
|
confidence <- "possible"
|
|
}
|
|
} else if (below_threshold && moderate_drop) {
|
|
if (consecutive_days >= 2) {
|
|
detected <- TRUE
|
|
reason <- "moderate_drop_confirmed"
|
|
confidence <- "likely"
|
|
} else {
|
|
detected <- TRUE
|
|
reason <- "moderate_drop"
|
|
confidence <- "possible"
|
|
}
|
|
} else if (below_threshold) {
|
|
reason <- "below_threshold_no_drop"
|
|
} else if (strong_drop) {
|
|
reason <- "strong_drop_above_threshold"
|
|
}
|
|
|
|
return(list(
|
|
detected = detected,
|
|
reason = reason,
|
|
confidence = confidence,
|
|
current_ci = current_ci,
|
|
lookback_ci = lookback_ci,
|
|
ci_drop = ci_drop,
|
|
field_age = field_age,
|
|
consecutive_days_low = consecutive_days,
|
|
below_threshold = below_threshold,
|
|
strong_drop = strong_drop
|
|
))
|
|
}
|
|
|
|
# ============================================================================
|
|
# VALIDATION: TEST AROUND KNOWN HARVEST DATES
|
|
# ============================================================================
|
|
|
|
cat("=== TESTING DETECTION AROUND KNOWN HARVEST DATES ===\n\n")
|
|
|
|
# Start with first field
|
|
test_field <- fields_with_ci[1]
|
|
cat("Testing field:", test_field, "\n\n")
|
|
|
|
# Get field's time series and harvests
|
|
field_ts <- time_series_daily %>%
|
|
filter(field_id == test_field)
|
|
|
|
field_harvests <- harvest_data_filtered %>%
|
|
filter(field == test_field) %>%
|
|
arrange(season_end)
|
|
|
|
cat("Field has", nrow(field_harvests), "recorded harvest events\n")
|
|
cat("CI observations:", nrow(field_ts), "days from", min(field_ts$date), "to", max(field_ts$date), "\n\n")
|
|
|
|
# Test each harvest event
|
|
validation_results <- list()
|
|
|
|
for (h in 1:nrow(field_harvests)) {
|
|
actual_harvest <- field_harvests$season_end[h]
|
|
|
|
# Get previous harvest for field age calculation
|
|
if (h == 1) {
|
|
last_harvest <- field_harvests$season_start[h] - 365 # Assume ~1 year before first record
|
|
} else {
|
|
last_harvest <- field_harvests$season_end[h-1]
|
|
}
|
|
|
|
cat("\n--- Harvest Event", h, ": ", as.character(actual_harvest), " ---\n")
|
|
cat("Field age at harvest:", round(as.numeric(actual_harvest - last_harvest)), "days\n\n")
|
|
|
|
# Test ±14 days around actual harvest
|
|
test_dates <- seq.Date(
|
|
from = actual_harvest - CONFIG$test_window_days,
|
|
to = actual_harvest + CONFIG$test_window_days,
|
|
by = "day"
|
|
)
|
|
|
|
# Run detection for each test date
|
|
daily_results <- data.frame()
|
|
|
|
for (i in 1:length(test_dates)) {
|
|
test_date <- test_dates[i]
|
|
result <- detect_harvest_on_date(field_ts, test_date, last_harvest)
|
|
|
|
daily_results <- rbind(daily_results, data.frame(
|
|
test_date = test_date,
|
|
days_from_actual = as.numeric(test_date - actual_harvest),
|
|
detected = result$detected,
|
|
confidence = ifelse(is.null(result$confidence) || length(result$confidence) == 0, "none", result$confidence),
|
|
reason = result$reason,
|
|
current_ci = result$current_ci,
|
|
lookback_ci = result$lookback_ci,
|
|
ci_drop = result$ci_drop,
|
|
consecutive_days = result$consecutive_days_low,
|
|
stringsAsFactors = FALSE
|
|
))
|
|
}
|
|
|
|
# Find first detection
|
|
first_detection <- daily_results %>%
|
|
filter(detected == TRUE) %>%
|
|
arrange(test_date) %>%
|
|
slice(1)
|
|
|
|
if (nrow(first_detection) > 0) {
|
|
cat("✓ First detection:", as.character(first_detection$test_date),
|
|
"(", first_detection$days_from_actual, "days from actual harvest )\n")
|
|
cat(" Confidence:", first_detection$confidence, "\n")
|
|
cat(" CI drop:", round(first_detection$ci_drop, 2), "\n\n")
|
|
} else {
|
|
cat("✗ No detection within test window\n\n")
|
|
}
|
|
|
|
# Print detailed daily table
|
|
cat("Day-by-Day Detection Results:\n")
|
|
cat(sprintf("%-12s | %-15s | %8s | %10s | %10s | %8s | %8s | %15s\n",
|
|
"Date", "Days from Actual", "Detected", "Confidence", "Drop (1.0)", "Drop (2.0)", "CI", "Reason"))
|
|
cat(paste(rep("-", 110), collapse = ""), "\n")
|
|
|
|
for (i in 1:nrow(daily_results)) {
|
|
row <- daily_results[i, ]
|
|
|
|
# Check both thresholds explicitly
|
|
drop_1 <- ifelse(!is.na(row$ci_drop) && row$ci_drop >= 1.0, "YES", "NO")
|
|
drop_2 <- ifelse(!is.na(row$ci_drop) && row$ci_drop >= 2.0, "YES", "NO")
|
|
|
|
cat(sprintf("%-12s | %+15d | %8s | %10s | %10s | %10s | %8.2f | %15s\n",
|
|
as.character(row$test_date),
|
|
row$days_from_actual,
|
|
ifelse(row$detected, "YES", "NO"),
|
|
row$confidence,
|
|
drop_1,
|
|
drop_2,
|
|
ifelse(is.na(row$current_ci), NA, row$current_ci),
|
|
substr(row$reason, 1, 15)))
|
|
}
|
|
cat("\n")
|
|
|
|
# Store results
|
|
validation_results[[h]] <- list(
|
|
harvest_id = h,
|
|
actual_date = actual_harvest,
|
|
daily_results = daily_results,
|
|
first_detection = first_detection
|
|
)
|
|
}
|
|
|
|
# ============================================================================
|
|
# SUMMARY STATISTICS
|
|
# ============================================================================
|
|
|
|
cat("\n\n=== VALIDATION SUMMARY ===\n\n")
|
|
|
|
# Create comprehensive summary showing WHEN detection happens
|
|
detection_timing_table <- data.frame()
|
|
|
|
for (i in 1:length(validation_results)) {
|
|
result <- validation_results[[i]]
|
|
actual_date <- result$actual_date
|
|
|
|
# Get detections in key time windows
|
|
daily <- result$daily_results
|
|
|
|
# Check specific days
|
|
day_minus_14 <- daily %>% filter(days_from_actual == -14) %>% pull(detected)
|
|
day_minus_7 <- daily %>% filter(days_from_actual == -7) %>% pull(detected)
|
|
day_minus_3 <- daily %>% filter(days_from_actual == -3) %>% pull(detected)
|
|
day_minus_1 <- daily %>% filter(days_from_actual == -1) %>% pull(detected)
|
|
day_0 <- daily %>% filter(days_from_actual == 0) %>% pull(detected)
|
|
day_plus_1 <- daily %>% filter(days_from_actual == 1) %>% pull(detected)
|
|
day_plus_3 <- daily %>% filter(days_from_actual == 3) %>% pull(detected)
|
|
day_plus_7 <- daily %>% filter(days_from_actual == 7) %>% pull(detected)
|
|
day_plus_14 <- daily %>% filter(days_from_actual == 14) %>% pull(detected)
|
|
|
|
# First detection info
|
|
first_det <- result$first_detection
|
|
|
|
detection_timing_table <- rbind(detection_timing_table, data.frame(
|
|
harvest_event = i,
|
|
actual_date = actual_date,
|
|
detected_minus_14d = ifelse(length(day_minus_14) > 0 && day_minus_14, "YES", "NO"),
|
|
detected_minus_7d = ifelse(length(day_minus_7) > 0 && day_minus_7, "YES", "NO"),
|
|
detected_minus_3d = ifelse(length(day_minus_3) > 0 && day_minus_3, "YES", "NO"),
|
|
detected_minus_1d = ifelse(length(day_minus_1) > 0 && day_minus_1, "YES", "NO"),
|
|
detected_day_0 = ifelse(length(day_0) > 0 && day_0, "YES", "NO"),
|
|
detected_plus_1d = ifelse(length(day_plus_1) > 0 && day_plus_1, "YES", "NO"),
|
|
detected_plus_3d = ifelse(length(day_plus_3) > 0 && day_plus_3, "YES", "NO"),
|
|
detected_plus_7d = ifelse(length(day_plus_7) > 0 && day_plus_7, "YES", "NO"),
|
|
detected_plus_14d = ifelse(length(day_plus_14) > 0 && day_plus_14, "YES", "NO"),
|
|
first_detection_day = ifelse(nrow(first_det) > 0, first_det$days_from_actual, NA),
|
|
stringsAsFactors = FALSE
|
|
))
|
|
}
|
|
|
|
cat("Detection Timing Table (when does system flag harvest?):\n\n")
|
|
print(detection_timing_table)
|
|
|
|
cat("\n\nAcceptable Detection Window Analysis:\n")
|
|
cat("(Harvest day ±1 day is realistic detection window)\n\n")
|
|
|
|
acceptable_window <- detection_timing_table %>%
|
|
mutate(
|
|
detected_in_window = detected_minus_1d == "YES" | detected_day_0 == "YES" | detected_plus_1d == "YES"
|
|
)
|
|
|
|
cat("Harvests detected within ±1 day of actual:", sum(acceptable_window$detected_in_window), "/", nrow(acceptable_window), "\n")
|
|
cat("Accuracy within realistic window:", round(100 * mean(acceptable_window$detected_in_window), 1), "%\n\n")
|
|
|
|
cat("False Early Detections (>3 days before harvest):\n")
|
|
early_false <- sum(detection_timing_table$detected_minus_7d == "YES" |
|
|
detection_timing_table$detected_minus_14d == "YES", na.rm = TRUE)
|
|
cat(" Count:", early_false, "\n")
|
|
cat(" These are likely detecting decline phase, not actual harvest\n\n")
|
|
|
|
# Original summary for comparison
|
|
detection_summary <- data.frame()
|
|
|
|
for (i in 1:length(validation_results)) {
|
|
result <- validation_results[[i]]
|
|
|
|
if (nrow(result$first_detection) > 0) {
|
|
detection_summary <- rbind(detection_summary, data.frame(
|
|
harvest_event = i,
|
|
actual_date = result$actual_date,
|
|
detected_date = result$first_detection$test_date,
|
|
days_offset = result$first_detection$days_from_actual,
|
|
confidence = result$first_detection$confidence,
|
|
ci_drop = result$first_detection$ci_drop
|
|
))
|
|
} else {
|
|
detection_summary <- rbind(detection_summary, data.frame(
|
|
harvest_event = i,
|
|
actual_date = result$actual_date,
|
|
detected_date = NA,
|
|
days_offset = NA,
|
|
confidence = "not_detected",
|
|
ci_drop = NA
|
|
))
|
|
}
|
|
}
|
|
|
|
print(detection_summary)
|
|
|
|
cat("\n\nDetection Performance:\n")
|
|
cat(" Total harvests tested:", nrow(detection_summary), "\n")
|
|
cat(" Successfully detected:", sum(!is.na(detection_summary$detected_date)), "\n")
|
|
cat(" Detection rate:", round(100 * sum(!is.na(detection_summary$detected_date)) / nrow(detection_summary), 1), "%\n\n")
|
|
|
|
if (sum(!is.na(detection_summary$days_offset)) > 0) {
|
|
cat("Detection Timing:\n")
|
|
cat(" Average days from actual:", round(mean(detection_summary$days_offset, na.rm = TRUE), 1), "\n")
|
|
cat(" Median days from actual:", round(median(detection_summary$days_offset, na.rm = TRUE), 1), "\n")
|
|
cat(" Detected before harvest:", sum(detection_summary$days_offset < 0, na.rm = TRUE), "\n")
|
|
cat(" Detected on harvest day:", sum(detection_summary$days_offset == 0, na.rm = TRUE), "\n")
|
|
cat(" Detected after harvest:", sum(detection_summary$days_offset > 0, na.rm = TRUE), "\n")
|
|
}
|
|
|
|
cat("\n=== TEST COMPLETE ===\n")
|
|
cat("\nNext steps:\n")
|
|
cat("1. Review detection timing and adjust thresholds if needed\n")
|
|
cat("2. Expand to all fields in dataset\n")
|
|
cat("3. Analyze which configuration gives best early detection\n")
|