SmartCane/r_app/experiments/harvest_prediction/old/predict_harvest_window.R

# ============================================================================
# HARVEST WINDOW PREDICTION - FORWARD-LOOKING SYSTEM
# Predict harvest 7-14 days AHEAD for factory logistics planning
# ============================================================================
# Use case: Factory needs advance warning when harvest is imminent
# Strategy: Detect when field enters "harvest-ready window" based on
#           sustained low CI indicating crop maturation complete
# ============================================================================

suppressPackageStartupMessages({
  library(readxl)
  library(dplyr)
  library(tidyr)
  library(lubridate)
  library(here)
  library(ggplot2)
})

# Set project directory
project_dir <- "esa"
assign("project_dir", project_dir, envir = .GlobalEnv)

# Navigate to project root if in experiments folder
if (basename(getwd()) == "harvest_prediction") {
  setwd("../../..")
}

source(here("r_app", "parameters_project.R"))

# ============================================================================
# CONFIGURATION
# ============================================================================

CONFIG <- list(
  # Minimum field age before harvest is possible (8 months)
  min_field_age_days = 240,

  # CI thresholds for maturity assessment
  ci_threshold_low = 2.5,      # Below this = mature crop
  ci_threshold_very_low = 1.5,  # Below this = very mature/bare patches

  # Sustained low CI indicates "harvest window"
  sustained_low_days = 5,       # CI below threshold for N consecutive days

  # Advanced warning levels
  warning_early = 14,           # Days ahead for early warning
  warning_imminent = 7,         # Days ahead for imminent warning

  # Minimum days since last harvest (ratoon cycle)
  min_days_since_harvest = 200,

  # Validation window
  test_window_days = 21         # Test ±21 days around actual harvest
)

cat("=== HARVEST WINDOW PREDICTION CONFIGURATION ===\n\n")
cat("Goal: Predict harvest 7-14 days AHEAD for factory planning\n\n")
cat("Minimum field age:", CONFIG$min_field_age_days, "days (", round(CONFIG$min_field_age_days/30, 1), "months )\n")
cat("CI thresholds: Low =", CONFIG$ci_threshold_low, "| Very Low =", CONFIG$ci_threshold_very_low, "\n")
cat("Sustained low CI requirement:", CONFIG$sustained_low_days, "consecutive days\n")
cat("Warning levels: Early =", CONFIG$warning_early, "days | Imminent =", CONFIG$warning_imminent, "days\n\n")

# ============================================================================
# LOAD DATA
# ============================================================================

cat("=== LOADING DATA ===\n\n")

# Load CI time series
ci_rds_file <- here("laravel_app/storage/app", project_dir, "Data/extracted_ci/cumulative_vals/All_pivots_Cumulative_CI_quadrant_year_v2.rds")
ci_data_raw <- readRDS(ci_rds_file) %>% ungroup()

time_series_daily <- ci_data_raw %>%
  mutate(date = as.Date(Date)) %>%
  select(field_id = field, date, mean_ci = FitData) %>%
  filter(!is.na(mean_ci), !is.na(date), !is.na(field_id)) %>%
  arrange(field_id, date)

# Load harvest data
harvest_data <- read_excel('laravel_app/storage/app/esa/Data/harvest.xlsx') %>%
  mutate(
    season_start = as.Date(season_start),
    season_end = as.Date(season_end)
  ) %>%
  filter(!is.na(season_end))

# Get fields with both CI and harvest data
fields_with_ci <- unique(time_series_daily$field_id)
harvest_data_filtered <- harvest_data %>%
  filter(field %in% fields_with_ci) %>%
  arrange(field, season_end)

cat("Fields with CI data:", length(fields_with_ci), "\n")
cat("Fields with harvest records:", length(unique(harvest_data_filtered$field)), "\n")
cat("Total harvest events:", nrow(harvest_data_filtered), "\n\n")

# ============================================================================
# PREDICTION FUNCTION
# ============================================================================

predict_harvest_window <- function(field_ts, check_date, last_harvest_date, config = CONFIG) {
  # Predict if harvest is likely in next 7-14 days based on sustained low CI
  #
  # Logic:
  # 1. Check field age (must be ≥ 240 days)
  # 2. Check CI has been below threshold for N consecutive days
  # 3. Assess severity (low vs very low CI)
  # 4. Return prediction confidence and expected harvest window

  # Get current CI
  current_ci <- field_ts %>%
    filter(date == check_date) %>%
    pull(mean_ci)

  if (length(current_ci) == 0) {
    return(list(
      predicted = FALSE,
      confidence = "no_data",
      current_ci = NA,
      consecutive_days_low = 0,
      field_age = NA,
      harvest_window = "unknown"
    ))
  }

  # Calculate field age
  if (is.null(last_harvest_date) || is.na(last_harvest_date)) {
    # First harvest - use earliest CI date as planting proxy
    earliest_date <- min(field_ts$date, na.rm = TRUE)
    field_age <- as.numeric(check_date - earliest_date)
  } else {
    field_age <- as.numeric(check_date - last_harvest_date)
  }

  # Check minimum age requirement
  if (field_age < config$min_field_age_days) {
    return(list(
      predicted = FALSE,
      confidence = "too_young",
      current_ci = current_ci,
      consecutive_days_low = 0,
      field_age = field_age,
      harvest_window = "not_ready"
    ))
  }

  # Count consecutive days with CI below threshold (looking backward from check_date)
  recent_data <- field_ts %>%
    filter(date <= check_date, date >= check_date - 30) %>%
    arrange(desc(date))

  consecutive_days_low <- 0
  for (i in 1:nrow(recent_data)) {
    if (recent_data$mean_ci[i] <= config$ci_threshold_low) {
      consecutive_days_low <- consecutive_days_low + 1
    } else {
      break  # Stop at first day above threshold
    }
  }

  # Calculate mean CI over sustained period
  mean_ci_sustained <- if (consecutive_days_low > 0) {
    recent_data %>%
      slice(1:consecutive_days_low) %>%
      summarise(mean = mean(mean_ci, na.rm = TRUE)) %>%
      pull(mean)
  } else {
    NA
  }

  # Determine prediction confidence and harvest window
  predicted <- FALSE
  confidence <- "none"
  harvest_window <- "not_ready"

  if (consecutive_days_low >= config$sustained_low_days) {
    predicted <- TRUE

    # Assess severity based on mean CI during sustained period
    if (!is.na(mean_ci_sustained) && mean_ci_sustained <= config$ci_threshold_very_low) {
      confidence <- "imminent"  # Very low CI = harvest within 7 days
      harvest_window <- "7_days"
    } else {
      confidence <- "likely"    # Low CI = harvest within 7-14 days
      harvest_window <- "7_14_days"
    }
  } else if (consecutive_days_low >= 2) {
    # Starting to show maturity signals
    predicted <- TRUE
    confidence <- "possible"
    harvest_window <- "14_21_days"
  }

  return(list(
    predicted = predicted,
    confidence = confidence,
    current_ci = current_ci,
    mean_ci_sustained = mean_ci_sustained,
    consecutive_days_low = consecutive_days_low,
    field_age = field_age,
    harvest_window = harvest_window
  ))
}

# ============================================================================
# VALIDATION FUNCTION
# ============================================================================

validate_harvest_prediction <- function(field_id, test_field = NULL) {
  # Test prediction accuracy by checking ±21 days around actual harvest dates

  # Get field data
  field_ts <- time_series_daily %>%
    filter(field_id == !!field_id) %>%
    arrange(date)

  field_harvests <- harvest_data_filtered %>%
    filter(field == field_id) %>%
    arrange(season_end)

  if (nrow(field_harvests) == 0) {
    cat("No harvest records for field", field_id, "\n")
    return(NULL)
  }

  cat("\n", rep("=", 80), "\n", sep = "")
  cat("Testing field:", field_id, "\n")
  cat("Field has", nrow(field_harvests), "recorded harvest events\n")
  cat(rep("=", 80), "\n\n", sep = "")

  all_results <- list()
  detection_timing <- data.frame()

  # Test each harvest event
  for (h in 1:nrow(field_harvests)) {
    harvest_date <- field_harvests$season_end[h]

    # Get previous harvest for field age calculation
    if (h == 1) {
      last_harvest <- NA
    } else {
      last_harvest <- field_harvests$season_end[h - 1]
    }

    # Test dates from -21 to +14 days around harvest
    test_dates_seq <- seq.Date(
      from = harvest_date - CONFIG$test_window_days,
      to = harvest_date + 14,
      by = "1 day"
    )

    # Run prediction for each test date
    event_results <- data.frame()
    first_detection <- NULL

    for (i in 1:length(test_dates_seq)) {
      test_date <- test_dates_seq[i]
      days_from_harvest <- as.numeric(test_date - harvest_date)

      result <- predict_harvest_window(field_ts, test_date, last_harvest, CONFIG)

      # Track first detection
      if (result$predicted && is.null(first_detection)) {
        first_detection <- list(
          date = test_date,
          days_before = -days_from_harvest,
          confidence = result$confidence,
          consecutive_days = result$consecutive_days_low,
          mean_ci = result$mean_ci_sustained,
          harvest_window = result$harvest_window
        )
      }

      event_results <- bind_rows(event_results, data.frame(
        harvest_event = h,
        harvest_date = harvest_date,
        test_date = test_date,
        days_from_harvest = days_from_harvest,
        predicted = result$predicted,
        confidence = result$confidence,
        current_ci = result$current_ci,
        consecutive_days_low = result$consecutive_days_low,
        field_age = result$field_age,
        harvest_window = result$harvest_window
      ))
    }

    all_results[[h]] <- event_results

    # Print harvest event summary
    cat("--- Harvest Event", h, ":", format(harvest_date, "%Y-%m-%d"), "---\n")

    if (!is.null(first_detection)) {
      cat("✓ First prediction:", format(first_detection$date, "%Y-%m-%d"),
          "(", first_detection$days_before, "days before harvest )\n")
      cat("  Confidence:", first_detection$confidence, "\n")
      cat("  Harvest window:", first_detection$harvest_window, "\n")
      cat("  Consecutive days low CI:", first_detection$consecutive_days, "\n")
      cat("  Mean CI during period:", round(first_detection$mean_ci, 2), "\n")

      # Categorize detection timing
      if (first_detection$days_before >= 7 && first_detection$days_before <= 21) {
        cat("  ✓ GOOD: Detected in optimal window (7-21 days ahead)\n")
      } else if (first_detection$days_before > 21) {
        cat("  ⚠️ EARLY: Detected >21 days ahead\n")
      } else if (first_detection$days_before >= 0) {
        cat("  ⚠️ LATE: Detected <7 days ahead\n")
      } else {
        cat("  ✗ MISSED: Detected after harvest\n")
      }
    } else {
      cat("✗ No prediction detected\n")
    }

    cat("\n")

    # Build detection timing matrix
    timing_row <- data.frame(harvest_event = h)
    for (offset in c(-21, -14, -7, -3, -1, 0, 1, 3, 7, 14)) {
      detected_on_day <- event_results %>%
        filter(days_from_harvest == offset) %>%
        pull(predicted)

      timing_row[[paste0("d", ifelse(offset >= 0, "_plus_", "_minus_"), abs(offset))]] <-
        ifelse(length(detected_on_day) > 0 && detected_on_day, "YES", "NO")
    }
    detection_timing <- bind_rows(detection_timing, timing_row)
  }

  # Print detection timing table
  cat("\n", rep("=", 80), "\n", sep = "")
  cat("PREDICTION TIMING TABLE\n")
  cat("Columns: Days relative to harvest date\n")
  cat(rep("=", 80), "\n\n", sep = "")
  print(detection_timing, row.names = FALSE)

  # Calculate summary statistics
  all_results_df <- bind_rows(all_results)

  # Find optimal prediction window (7-21 days before)
  optimal_detections <- all_results_df %>%
    filter(predicted == TRUE, days_from_harvest >= -21, days_from_harvest <= -7) %>%
    group_by(harvest_event) %>%
    slice(1) %>%  # First detection in optimal window
    ungroup()

  early_detections <- all_results_df %>%
    filter(predicted == TRUE, days_from_harvest < -21) %>%
    group_by(harvest_event) %>%
    slice(1) %>%
    ungroup()

  late_detections <- all_results_df %>%
    filter(predicted == TRUE, days_from_harvest > -7) %>%
    group_by(harvest_event) %>%
    slice(1) %>%
    ungroup()

  total_harvests <- nrow(field_harvests)

  cat("\n", rep("=", 80), "\n", sep = "")
  cat("VALIDATION SUMMARY\n")
  cat(rep("=", 80), "\n\n", sep = "")
  cat("Total harvest events tested:", total_harvests, "\n\n")
  cat("Predictions in OPTIMAL window (7-21 days ahead):", nrow(optimal_detections), "/", total_harvests,
      "(", round(100 * nrow(optimal_detections) / total_harvests, 1), "% )\n")
  cat("Predictions TOO EARLY (>21 days ahead):", nrow(early_detections), "\n")
  cat("Predictions TOO LATE (<7 days ahead):", nrow(late_detections), "\n")
  cat("Missed harvests:", total_harvests - nrow(optimal_detections) - nrow(early_detections) - nrow(late_detections), "\n\n")

  # Overall detection rate
  detected_total <- all_results_df %>%
    filter(predicted == TRUE, days_from_harvest <= 0) %>%
    distinct(harvest_event) %>%
    nrow()

  cat("Overall detection rate (any time before harvest):", detected_total, "/", total_harvests,
      "(", round(100 * detected_total / total_harvests, 1), "% )\n\n")

  # Return detailed results
  invisible(list(
    all_results = all_results_df,
    detection_timing = detection_timing,
    optimal_detections = optimal_detections,
    summary = data.frame(
      field = field_id,
      total_harvests = total_harvests,
      optimal_window = nrow(optimal_detections),
      too_early = nrow(early_detections),
      too_late = nrow(late_detections),
      missed = total_harvests - detected_total,
      detection_rate = round(100 * detected_total / total_harvests, 1)
    )
  ))
}

# ============================================================================
# RUN VALIDATION
# ============================================================================

# Test on Field 00110 (from your graphs)
test_field <- "00110"
results <- validate_harvest_prediction(test_field)

cat("\n", rep("=", 80), "\n", sep = "")
cat("INTERPRETATION FOR FACTORY CLIENT\n")
cat(rep("=", 80), "\n\n", sep = "")
cat("This system provides ADVANCE WARNING when harvest is likely imminent:\n\n")
cat("  📊 HARVEST WINDOW PREDICTIONS:\n")
cat("     - '7_days': Harvest expected within 7 days (IMMINENT)\n")
cat("     - '7_14_days': Harvest expected in 7-14 days (LIKELY)\n")
cat("     - '14_21_days': Harvest possible in 14-21 days (WATCH)\n\n")
cat("  ⚙️  DETECTION LOGIC:\n")
cat("     - CI below 2.5 for", CONFIG$sustained_low_days, "consecutive days = crop mature\n")
cat("     - Very low CI (<1.5) = harvest imminent (7 days)\n")
cat("     - Low CI (1.5-2.5) = harvest likely (7-14 days)\n\n")
cat("  🏭 FACTORY USE CASE:\n")
cat("     - Factory gets 7-21 days advance notice to plan logistics\n")
cat("     - Can schedule processing capacity and transport\n")
cat("     - Avoids surprise harvest deliveries\n\n")

cat("=== ANALYSIS COMPLETE ===\n")