SmartCane/r_app/experiments/harvest_prediction/old/predict_harvest_window.R
2026-01-06 14:17:37 +01:00

423 lines
15 KiB
R

# ============================================================================
# HARVEST WINDOW PREDICTION - FORWARD-LOOKING SYSTEM
# Predict harvest 7-14 days AHEAD for factory logistics planning
# ============================================================================
# Use case: Factory needs advance warning when harvest is imminent
# Strategy: Detect when field enters "harvest-ready window" based on
# sustained low CI indicating crop maturation complete
# ============================================================================
suppressPackageStartupMessages({
library(readxl)
library(dplyr)
library(tidyr)
library(lubridate)
library(here)
library(ggplot2)
})
# Set project directory
project_dir <- "esa"
assign("project_dir", project_dir, envir = .GlobalEnv)
# Navigate to project root if in experiments folder
if (basename(getwd()) == "harvest_prediction") {
setwd("../../..")
}
source(here("r_app", "parameters_project.R"))
# ============================================================================
# CONFIGURATION
# ============================================================================
CONFIG <- list(
# Minimum field age before harvest is possible (8 months)
min_field_age_days = 240,
# CI thresholds for maturity assessment
ci_threshold_low = 2.5, # Below this = mature crop
ci_threshold_very_low = 1.5, # Below this = very mature/bare patches
# Sustained low CI indicates "harvest window"
sustained_low_days = 5, # CI below threshold for N consecutive days
# Advanced warning levels
warning_early = 14, # Days ahead for early warning
warning_imminent = 7, # Days ahead for imminent warning
# Minimum days since last harvest (ratoon cycle)
min_days_since_harvest = 200,
# Validation window
test_window_days = 21 # Test ±21 days around actual harvest
)
cat("=== HARVEST WINDOW PREDICTION CONFIGURATION ===\n\n")
cat("Goal: Predict harvest 7-14 days AHEAD for factory planning\n\n")
cat("Minimum field age:", CONFIG$min_field_age_days, "days (", round(CONFIG$min_field_age_days/30, 1), "months )\n")
cat("CI thresholds: Low =", CONFIG$ci_threshold_low, "| Very Low =", CONFIG$ci_threshold_very_low, "\n")
cat("Sustained low CI requirement:", CONFIG$sustained_low_days, "consecutive days\n")
cat("Warning levels: Early =", CONFIG$warning_early, "days | Imminent =", CONFIG$warning_imminent, "days\n\n")
# ============================================================================
# LOAD DATA
# ============================================================================
cat("=== LOADING DATA ===\n\n")
# Load CI time series
ci_rds_file <- here("laravel_app/storage/app", project_dir, "Data/extracted_ci/cumulative_vals/All_pivots_Cumulative_CI_quadrant_year_v2.rds")
ci_data_raw <- readRDS(ci_rds_file) %>% ungroup()
time_series_daily <- ci_data_raw %>%
mutate(date = as.Date(Date)) %>%
select(field_id = field, date, mean_ci = FitData) %>%
filter(!is.na(mean_ci), !is.na(date), !is.na(field_id)) %>%
arrange(field_id, date)
# Load harvest data
harvest_data <- read_excel('laravel_app/storage/app/esa/Data/harvest.xlsx') %>%
mutate(
season_start = as.Date(season_start),
season_end = as.Date(season_end)
) %>%
filter(!is.na(season_end))
# Get fields with both CI and harvest data
fields_with_ci <- unique(time_series_daily$field_id)
harvest_data_filtered <- harvest_data %>%
filter(field %in% fields_with_ci) %>%
arrange(field, season_end)
cat("Fields with CI data:", length(fields_with_ci), "\n")
cat("Fields with harvest records:", length(unique(harvest_data_filtered$field)), "\n")
cat("Total harvest events:", nrow(harvest_data_filtered), "\n\n")
# ============================================================================
# PREDICTION FUNCTION
# ============================================================================
predict_harvest_window <- function(field_ts, check_date, last_harvest_date, config = CONFIG) {
# Predict if harvest is likely in next 7-14 days based on sustained low CI
#
# Logic:
# 1. Check field age (must be ≥ 240 days)
# 2. Check CI has been below threshold for N consecutive days
# 3. Assess severity (low vs very low CI)
# 4. Return prediction confidence and expected harvest window
# Get current CI
current_ci <- field_ts %>%
filter(date == check_date) %>%
pull(mean_ci)
if (length(current_ci) == 0) {
return(list(
predicted = FALSE,
confidence = "no_data",
current_ci = NA,
consecutive_days_low = 0,
field_age = NA,
harvest_window = "unknown"
))
}
# Calculate field age
if (is.null(last_harvest_date) || is.na(last_harvest_date)) {
# First harvest - use earliest CI date as planting proxy
earliest_date <- min(field_ts$date, na.rm = TRUE)
field_age <- as.numeric(check_date - earliest_date)
} else {
field_age <- as.numeric(check_date - last_harvest_date)
}
# Check minimum age requirement
if (field_age < config$min_field_age_days) {
return(list(
predicted = FALSE,
confidence = "too_young",
current_ci = current_ci,
consecutive_days_low = 0,
field_age = field_age,
harvest_window = "not_ready"
))
}
# Count consecutive days with CI below threshold (looking backward from check_date)
recent_data <- field_ts %>%
filter(date <= check_date, date >= check_date - 30) %>%
arrange(desc(date))
consecutive_days_low <- 0
for (i in 1:nrow(recent_data)) {
if (recent_data$mean_ci[i] <= config$ci_threshold_low) {
consecutive_days_low <- consecutive_days_low + 1
} else {
break # Stop at first day above threshold
}
}
# Calculate mean CI over sustained period
mean_ci_sustained <- if (consecutive_days_low > 0) {
recent_data %>%
slice(1:consecutive_days_low) %>%
summarise(mean = mean(mean_ci, na.rm = TRUE)) %>%
pull(mean)
} else {
NA
}
# Determine prediction confidence and harvest window
predicted <- FALSE
confidence <- "none"
harvest_window <- "not_ready"
if (consecutive_days_low >= config$sustained_low_days) {
predicted <- TRUE
# Assess severity based on mean CI during sustained period
if (!is.na(mean_ci_sustained) && mean_ci_sustained <= config$ci_threshold_very_low) {
confidence <- "imminent" # Very low CI = harvest within 7 days
harvest_window <- "7_days"
} else {
confidence <- "likely" # Low CI = harvest within 7-14 days
harvest_window <- "7_14_days"
}
} else if (consecutive_days_low >= 2) {
# Starting to show maturity signals
predicted <- TRUE
confidence <- "possible"
harvest_window <- "14_21_days"
}
return(list(
predicted = predicted,
confidence = confidence,
current_ci = current_ci,
mean_ci_sustained = mean_ci_sustained,
consecutive_days_low = consecutive_days_low,
field_age = field_age,
harvest_window = harvest_window
))
}
# ============================================================================
# VALIDATION FUNCTION
# ============================================================================
validate_harvest_prediction <- function(field_id, test_field = NULL) {
# Test prediction accuracy by checking ±21 days around actual harvest dates
# Get field data
field_ts <- time_series_daily %>%
filter(field_id == !!field_id) %>%
arrange(date)
field_harvests <- harvest_data_filtered %>%
filter(field == field_id) %>%
arrange(season_end)
if (nrow(field_harvests) == 0) {
cat("No harvest records for field", field_id, "\n")
return(NULL)
}
cat("\n", rep("=", 80), "\n", sep = "")
cat("Testing field:", field_id, "\n")
cat("Field has", nrow(field_harvests), "recorded harvest events\n")
cat(rep("=", 80), "\n\n", sep = "")
all_results <- list()
detection_timing <- data.frame()
# Test each harvest event
for (h in 1:nrow(field_harvests)) {
harvest_date <- field_harvests$season_end[h]
# Get previous harvest for field age calculation
if (h == 1) {
last_harvest <- NA
} else {
last_harvest <- field_harvests$season_end[h - 1]
}
# Test dates from -21 to +14 days around harvest
test_dates_seq <- seq.Date(
from = harvest_date - CONFIG$test_window_days,
to = harvest_date + 14,
by = "1 day"
)
# Run prediction for each test date
event_results <- data.frame()
first_detection <- NULL
for (i in 1:length(test_dates_seq)) {
test_date <- test_dates_seq[i]
days_from_harvest <- as.numeric(test_date - harvest_date)
result <- predict_harvest_window(field_ts, test_date, last_harvest, CONFIG)
# Track first detection
if (result$predicted && is.null(first_detection)) {
first_detection <- list(
date = test_date,
days_before = -days_from_harvest,
confidence = result$confidence,
consecutive_days = result$consecutive_days_low,
mean_ci = result$mean_ci_sustained,
harvest_window = result$harvest_window
)
}
event_results <- bind_rows(event_results, data.frame(
harvest_event = h,
harvest_date = harvest_date,
test_date = test_date,
days_from_harvest = days_from_harvest,
predicted = result$predicted,
confidence = result$confidence,
current_ci = result$current_ci,
consecutive_days_low = result$consecutive_days_low,
field_age = result$field_age,
harvest_window = result$harvest_window
))
}
all_results[[h]] <- event_results
# Print harvest event summary
cat("--- Harvest Event", h, ":", format(harvest_date, "%Y-%m-%d"), "---\n")
if (!is.null(first_detection)) {
cat("✓ First prediction:", format(first_detection$date, "%Y-%m-%d"),
"(", first_detection$days_before, "days before harvest )\n")
cat(" Confidence:", first_detection$confidence, "\n")
cat(" Harvest window:", first_detection$harvest_window, "\n")
cat(" Consecutive days low CI:", first_detection$consecutive_days, "\n")
cat(" Mean CI during period:", round(first_detection$mean_ci, 2), "\n")
# Categorize detection timing
if (first_detection$days_before >= 7 && first_detection$days_before <= 21) {
cat(" ✓ GOOD: Detected in optimal window (7-21 days ahead)\n")
} else if (first_detection$days_before > 21) {
cat(" ⚠️ EARLY: Detected >21 days ahead\n")
} else if (first_detection$days_before >= 0) {
cat(" ⚠️ LATE: Detected <7 days ahead\n")
} else {
cat(" ✗ MISSED: Detected after harvest\n")
}
} else {
cat("✗ No prediction detected\n")
}
cat("\n")
# Build detection timing matrix
timing_row <- data.frame(harvest_event = h)
for (offset in c(-21, -14, -7, -3, -1, 0, 1, 3, 7, 14)) {
detected_on_day <- event_results %>%
filter(days_from_harvest == offset) %>%
pull(predicted)
timing_row[[paste0("d", ifelse(offset >= 0, "_plus_", "_minus_"), abs(offset))]] <-
ifelse(length(detected_on_day) > 0 && detected_on_day, "YES", "NO")
}
detection_timing <- bind_rows(detection_timing, timing_row)
}
# Print detection timing table
cat("\n", rep("=", 80), "\n", sep = "")
cat("PREDICTION TIMING TABLE\n")
cat("Columns: Days relative to harvest date\n")
cat(rep("=", 80), "\n\n", sep = "")
print(detection_timing, row.names = FALSE)
# Calculate summary statistics
all_results_df <- bind_rows(all_results)
# Find optimal prediction window (7-21 days before)
optimal_detections <- all_results_df %>%
filter(predicted == TRUE, days_from_harvest >= -21, days_from_harvest <= -7) %>%
group_by(harvest_event) %>%
slice(1) %>% # First detection in optimal window
ungroup()
early_detections <- all_results_df %>%
filter(predicted == TRUE, days_from_harvest < -21) %>%
group_by(harvest_event) %>%
slice(1) %>%
ungroup()
late_detections <- all_results_df %>%
filter(predicted == TRUE, days_from_harvest > -7) %>%
group_by(harvest_event) %>%
slice(1) %>%
ungroup()
total_harvests <- nrow(field_harvests)
cat("\n", rep("=", 80), "\n", sep = "")
cat("VALIDATION SUMMARY\n")
cat(rep("=", 80), "\n\n", sep = "")
cat("Total harvest events tested:", total_harvests, "\n\n")
cat("Predictions in OPTIMAL window (7-21 days ahead):", nrow(optimal_detections), "/", total_harvests,
"(", round(100 * nrow(optimal_detections) / total_harvests, 1), "% )\n")
cat("Predictions TOO EARLY (>21 days ahead):", nrow(early_detections), "\n")
cat("Predictions TOO LATE (<7 days ahead):", nrow(late_detections), "\n")
cat("Missed harvests:", total_harvests - nrow(optimal_detections) - nrow(early_detections) - nrow(late_detections), "\n\n")
# Overall detection rate
detected_total <- all_results_df %>%
filter(predicted == TRUE, days_from_harvest <= 0) %>%
distinct(harvest_event) %>%
nrow()
cat("Overall detection rate (any time before harvest):", detected_total, "/", total_harvests,
"(", round(100 * detected_total / total_harvests, 1), "% )\n\n")
# Return detailed results
invisible(list(
all_results = all_results_df,
detection_timing = detection_timing,
optimal_detections = optimal_detections,
summary = data.frame(
field = field_id,
total_harvests = total_harvests,
optimal_window = nrow(optimal_detections),
too_early = nrow(early_detections),
too_late = nrow(late_detections),
missed = total_harvests - detected_total,
detection_rate = round(100 * detected_total / total_harvests, 1)
)
))
}
# ============================================================================
# RUN VALIDATION
# ============================================================================
# Test on Field 00110 (from your graphs)
test_field <- "00110"
results <- validate_harvest_prediction(test_field)
cat("\n", rep("=", 80), "\n", sep = "")
cat("INTERPRETATION FOR FACTORY CLIENT\n")
cat(rep("=", 80), "\n\n", sep = "")
cat("This system provides ADVANCE WARNING when harvest is likely imminent:\n\n")
cat(" 📊 HARVEST WINDOW PREDICTIONS:\n")
cat(" - '7_days': Harvest expected within 7 days (IMMINENT)\n")
cat(" - '7_14_days': Harvest expected in 7-14 days (LIKELY)\n")
cat(" - '14_21_days': Harvest possible in 14-21 days (WATCH)\n\n")
cat(" ⚙️ DETECTION LOGIC:\n")
cat(" - CI below 2.5 for", CONFIG$sustained_low_days, "consecutive days = crop mature\n")
cat(" - Very low CI (<1.5) = harvest imminent (7 days)\n")
cat(" - Low CI (1.5-2.5) = harvest likely (7-14 days)\n\n")
cat(" 🏭 FACTORY USE CASE:\n")
cat(" - Factory gets 7-21 days advance notice to plan logistics\n")
cat(" - Can schedule processing capacity and transport\n")
cat(" - Avoids surprise harvest deliveries\n\n")
cat("=== ANALYSIS COMPLETE ===\n")