# ============================================================================
# DAILY-SCALE HARVEST DETECTION AND VALIDATION
# Real-time detection using CI drop patterns
# ============================================================================

suppressPackageStartupMessages({
  library(readxl)
  library(dplyr)
  library(tidyr)
  library(lubridate)
  library(here)
  library(ggplot2)
})

# Set project directory
project_dir <- "esa"
assign("project_dir", project_dir, envir = .GlobalEnv)

# Navigate to project root if in experiments folder
if (basename(getwd()) == "harvest_prediction") {
  setwd("../../..")
}

source(here("r_app", "parameters_project.R"))

# ============================================================================
# CONFIGURATION
# ============================================================================

CONFIG <- list(
  min_field_age_days = 240,        # 8 months minimum age
  ci_threshold = 2.5,              # Below this = potential harvest
  drop_threshold_low = 1.0,        # Minimum CI drop to flag
  drop_threshold_high = 2.0,       # Strong CI drop
  lookback_days_min = 7,           # Compare with 7-14 days ago
  lookback_days_max = 14,
  confirmation_days = 3,           # Days below threshold for confirmation
  test_window_days = 14            # Test ±14 days around actual harvest
)

cat("=== DAILY HARVEST DETECTION CONFIGURATION ===\n\n")
cat("Minimum field age:", CONFIG$min_field_age_days, "days (", round(CONFIG$min_field_age_days/30, 1), "months )\n")
cat("CI threshold:", CONFIG$ci_threshold, "\n")
cat("Drop thresholds:", CONFIG$drop_threshold_low, "and", CONFIG$drop_threshold_high, "\n")
cat("Lookback window:", CONFIG$lookback_days_min, "-", CONFIG$lookback_days_max, "days\n")
cat("Confirmation window:", CONFIG$confirmation_days, "consecutive days\n\n")

# ============================================================================
# LOAD DATA
# ============================================================================

cat("=== LOADING DATA ===\n\n")

# Load CI time series
ci_rds_file <- here("laravel_app/storage/app", project_dir, "Data/extracted_ci/cumulative_vals/All_pivots_Cumulative_CI_quadrant_year_v2.rds")
ci_data_raw <- readRDS(ci_rds_file) %>% ungroup()

time_series_daily <- ci_data_raw %>%
  mutate(date = as.Date(Date)) %>%
  select(field_id = field, date, mean_ci = FitData) %>%
  filter(!is.na(mean_ci), !is.na(date), !is.na(field_id)) %>%
  arrange(field_id, date)

# Load harvest data
harvest_data <- read_excel('laravel_app/storage/app/esa/Data/harvest.xlsx') %>%
  mutate(
    season_start = as.Date(season_start),
    season_end = as.Date(season_end)
  ) %>%
  filter(!is.na(season_end))

# Get fields with both CI and harvest data
fields_with_ci <- unique(time_series_daily$field_id)
harvest_data_filtered <- harvest_data %>%
  filter(field %in% fields_with_ci) %>%
  arrange(field, season_end)

cat("Fields with CI data:", length(fields_with_ci), "\n")
cat("Fields with harvest records:", length(unique(harvest_data_filtered$field)), "\n")
cat("Total harvest events:", nrow(harvest_data_filtered), "\n\n")

# ============================================================================
# DETECTION FUNCTION
# ============================================================================

detect_harvest_on_date <- function(field_ts, check_date, last_harvest_date, config = CONFIG) {
  # Check if harvest can be detected on a specific date
  #
  # Args:
  #   field_ts: Daily CI time series for field
  #   check_date: Date to check for harvest detection
  #   last_harvest_date: Previous harvest date (to calculate field age)
  #   config: Detection parameters
  #
  # Returns:
  #   List with detection status and metrics
  
  # Get CI value on check date
  current_ci <- field_ts %>%
    filter(date == check_date) %>%
    pull(mean_ci)
  
  if (length(current_ci) == 0) {
    return(list(
      detected = FALSE,
      reason = "no_data",
      current_ci = NA,
      lookback_ci = NA,
      ci_drop = NA,
      field_age = NA,
      consecutive_days_low = 0
    ))
  }
  
  # Check field age
  field_age <- as.numeric(check_date - last_harvest_date)
  
  if (field_age < config$min_field_age_days) {
    return(list(
      detected = FALSE,
      reason = "too_young",
      current_ci = current_ci,
      lookback_ci = NA,
      ci_drop = NA,
      field_age = field_age,
      consecutive_days_low = 0
    ))
  }
  
  # Get lookback period CI (mean of 7-14 days ago)
  lookback_start <- check_date - config$lookback_days_max
  lookback_end <- check_date - config$lookback_days_min
  
  lookback_ci <- field_ts %>%
    filter(date >= lookback_start, date <= lookback_end) %>%
    summarise(mean_ci = mean(mean_ci, na.rm = TRUE)) %>%
    pull(mean_ci)
  
  if (is.na(lookback_ci) || length(lookback_ci) == 0) {
    return(list(
      detected = FALSE,
      reason = "no_lookback_data",
      current_ci = current_ci,
      lookback_ci = NA,
      ci_drop = NA,
      field_age = field_age,
      consecutive_days_low = 0
    ))
  }
  
  # Calculate CI drop
  ci_drop <- lookback_ci - current_ci
  
  # Check consecutive days below threshold
  consecutive_days <- 0
  for (i in 0:config$confirmation_days) {
    test_date <- check_date - i
    test_ci <- field_ts %>%
      filter(date == test_date) %>%
      pull(mean_ci)
    
    if (length(test_ci) > 0 && test_ci < config$ci_threshold) {
      consecutive_days <- consecutive_days + 1
    } else {
      break
    }
  }
  
  # Detection logic
  detected <- FALSE
  confidence <- "none"
  reason <- "no_trigger"
  
  # Check conditions
  below_threshold <- current_ci < config$ci_threshold
  strong_drop <- ci_drop >= config$drop_threshold_high
  moderate_drop <- ci_drop >= config$drop_threshold_low
  
  if (below_threshold && strong_drop) {
    detected <- TRUE
    reason <- "strong_drop"
    if (consecutive_days >= 3) {
      confidence <- "confirmed"
    } else if (consecutive_days >= 2) {
      confidence <- "likely"
    } else {
      confidence <- "possible"
    }
  } else if (below_threshold && moderate_drop) {
    if (consecutive_days >= 2) {
      detected <- TRUE
      reason <- "moderate_drop_confirmed"
      confidence <- "likely"
    } else {
      detected <- TRUE
      reason <- "moderate_drop"
      confidence <- "possible"
    }
  } else if (below_threshold) {
    reason <- "below_threshold_no_drop"
  } else if (strong_drop) {
    reason <- "strong_drop_above_threshold"
  }
  
  return(list(
    detected = detected,
    reason = reason,
    confidence = confidence,
    current_ci = current_ci,
    lookback_ci = lookback_ci,
    ci_drop = ci_drop,
    field_age = field_age,
    consecutive_days_low = consecutive_days,
    below_threshold = below_threshold,
    strong_drop = strong_drop
  ))
}

# ============================================================================
# VALIDATION: TEST AROUND KNOWN HARVEST DATES
# ============================================================================

cat("=== TESTING DETECTION AROUND KNOWN HARVEST DATES ===\n\n")

# Start with first field
test_field <- fields_with_ci[1]
cat("Testing field:", test_field, "\n\n")

# Get field's time series and harvests
field_ts <- time_series_daily %>%
  filter(field_id == test_field)

field_harvests <- harvest_data_filtered %>%
  filter(field == test_field) %>%
  arrange(season_end)

cat("Field has", nrow(field_harvests), "recorded harvest events\n")
cat("CI observations:", nrow(field_ts), "days from", min(field_ts$date), "to", max(field_ts$date), "\n\n")

# Test each harvest event
validation_results <- list()

for (h in 1:nrow(field_harvests)) {
  actual_harvest <- field_harvests$season_end[h]
  
  # Get previous harvest for field age calculation
  if (h == 1) {
    last_harvest <- field_harvests$season_start[h] - 365  # Assume ~1 year before first record
  } else {
    last_harvest <- field_harvests$season_end[h-1]
  }
  
  cat("\n--- Harvest Event", h, ": ", as.character(actual_harvest), " ---\n")
  cat("Field age at harvest:", round(as.numeric(actual_harvest - last_harvest)), "days\n\n")
  
  # Test ±14 days around actual harvest
  test_dates <- seq.Date(
    from = actual_harvest - CONFIG$test_window_days, 
    to = actual_harvest + CONFIG$test_window_days, 
    by = "day"
  )
  
  # Run detection for each test date
  daily_results <- data.frame()
  
  for (i in 1:length(test_dates)) {
    test_date <- test_dates[i]
    result <- detect_harvest_on_date(field_ts, test_date, last_harvest)
    
    daily_results <- rbind(daily_results, data.frame(
      test_date = test_date,
      days_from_actual = as.numeric(test_date - actual_harvest),
      detected = result$detected,
      confidence = ifelse(is.null(result$confidence) || length(result$confidence) == 0, "none", result$confidence),
      reason = result$reason,
      current_ci = result$current_ci,
      lookback_ci = result$lookback_ci,
      ci_drop = result$ci_drop,
      consecutive_days = result$consecutive_days_low,
      stringsAsFactors = FALSE
    ))
  }
  
  # Find first detection
  first_detection <- daily_results %>%
    filter(detected == TRUE) %>%
    arrange(test_date) %>%
    slice(1)
  
  if (nrow(first_detection) > 0) {
    cat("✓ First detection:", as.character(first_detection$test_date), 
        "(", first_detection$days_from_actual, "days from actual harvest )\n")
    cat("  Confidence:", first_detection$confidence, "\n")
    cat("  CI drop:", round(first_detection$ci_drop, 2), "\n\n")
  } else {
    cat("✗ No detection within test window\n\n")
  }
  
  # Print detailed daily table
  cat("Day-by-Day Detection Results:\n")
  cat(sprintf("%-12s | %-15s | %8s | %10s | %10s | %8s | %8s | %15s\n",
              "Date", "Days from Actual", "Detected", "Confidence", "Drop (1.0)", "Drop (2.0)", "CI", "Reason"))
  cat(paste(rep("-", 110), collapse = ""), "\n")
  
  for (i in 1:nrow(daily_results)) {
    row <- daily_results[i, ]
    
    # Check both thresholds explicitly
    drop_1 <- ifelse(!is.na(row$ci_drop) && row$ci_drop >= 1.0, "YES", "NO")
    drop_2 <- ifelse(!is.na(row$ci_drop) && row$ci_drop >= 2.0, "YES", "NO")
    
    cat(sprintf("%-12s | %+15d | %8s | %10s | %10s | %10s | %8.2f | %15s\n",
                as.character(row$test_date),
                row$days_from_actual,
                ifelse(row$detected, "YES", "NO"),
                row$confidence,
                drop_1,
                drop_2,
                ifelse(is.na(row$current_ci), NA, row$current_ci),
                substr(row$reason, 1, 15)))
  }
  cat("\n")
  
  # Store results
  validation_results[[h]] <- list(
    harvest_id = h,
    actual_date = actual_harvest,
    daily_results = daily_results,
    first_detection = first_detection
  )
}

# ============================================================================
# SUMMARY STATISTICS
# ============================================================================

cat("\n\n=== VALIDATION SUMMARY ===\n\n")

# Create comprehensive summary showing WHEN detection happens
detection_timing_table <- data.frame()

for (i in 1:length(validation_results)) {
  result <- validation_results[[i]]
  actual_date <- result$actual_date
  
  # Get detections in key time windows
  daily <- result$daily_results
  
  # Check specific days
  day_minus_14 <- daily %>% filter(days_from_actual == -14) %>% pull(detected)
  day_minus_7 <- daily %>% filter(days_from_actual == -7) %>% pull(detected)
  day_minus_3 <- daily %>% filter(days_from_actual == -3) %>% pull(detected)
  day_minus_1 <- daily %>% filter(days_from_actual == -1) %>% pull(detected)
  day_0 <- daily %>% filter(days_from_actual == 0) %>% pull(detected)
  day_plus_1 <- daily %>% filter(days_from_actual == 1) %>% pull(detected)
  day_plus_3 <- daily %>% filter(days_from_actual == 3) %>% pull(detected)
  day_plus_7 <- daily %>% filter(days_from_actual == 7) %>% pull(detected)
  day_plus_14 <- daily %>% filter(days_from_actual == 14) %>% pull(detected)
  
  # First detection info
  first_det <- result$first_detection
  
  detection_timing_table <- rbind(detection_timing_table, data.frame(
    harvest_event = i,
    actual_date = actual_date,
    detected_minus_14d = ifelse(length(day_minus_14) > 0 && day_minus_14, "YES", "NO"),
    detected_minus_7d = ifelse(length(day_minus_7) > 0 && day_minus_7, "YES", "NO"),
    detected_minus_3d = ifelse(length(day_minus_3) > 0 && day_minus_3, "YES", "NO"),
    detected_minus_1d = ifelse(length(day_minus_1) > 0 && day_minus_1, "YES", "NO"),
    detected_day_0 = ifelse(length(day_0) > 0 && day_0, "YES", "NO"),
    detected_plus_1d = ifelse(length(day_plus_1) > 0 && day_plus_1, "YES", "NO"),
    detected_plus_3d = ifelse(length(day_plus_3) > 0 && day_plus_3, "YES", "NO"),
    detected_plus_7d = ifelse(length(day_plus_7) > 0 && day_plus_7, "YES", "NO"),
    detected_plus_14d = ifelse(length(day_plus_14) > 0 && day_plus_14, "YES", "NO"),
    first_detection_day = ifelse(nrow(first_det) > 0, first_det$days_from_actual, NA),
    stringsAsFactors = FALSE
  ))
}

cat("Detection Timing Table (when does system flag harvest?):\n\n")
print(detection_timing_table)

cat("\n\nAcceptable Detection Window Analysis:\n")
cat("(Harvest day ±1 day is realistic detection window)\n\n")

acceptable_window <- detection_timing_table %>%
  mutate(
    detected_in_window = detected_minus_1d == "YES" | detected_day_0 == "YES" | detected_plus_1d == "YES"
  )

cat("Harvests detected within ±1 day of actual:", sum(acceptable_window$detected_in_window), "/", nrow(acceptable_window), "\n")
cat("Accuracy within realistic window:", round(100 * mean(acceptable_window$detected_in_window), 1), "%\n\n")

cat("False Early Detections (>3 days before harvest):\n")
early_false <- sum(detection_timing_table$detected_minus_7d == "YES" | 
                   detection_timing_table$detected_minus_14d == "YES", na.rm = TRUE)
cat("  Count:", early_false, "\n")
cat("  These are likely detecting decline phase, not actual harvest\n\n")

# Original summary for comparison
detection_summary <- data.frame()

for (i in 1:length(validation_results)) {
  result <- validation_results[[i]]
  
  if (nrow(result$first_detection) > 0) {
    detection_summary <- rbind(detection_summary, data.frame(
      harvest_event = i,
      actual_date = result$actual_date,
      detected_date = result$first_detection$test_date,
      days_offset = result$first_detection$days_from_actual,
      confidence = result$first_detection$confidence,
      ci_drop = result$first_detection$ci_drop
    ))
  } else {
    detection_summary <- rbind(detection_summary, data.frame(
      harvest_event = i,
      actual_date = result$actual_date,
      detected_date = NA,
      days_offset = NA,
      confidence = "not_detected",
      ci_drop = NA
    ))
  }
}

print(detection_summary)

cat("\n\nDetection Performance:\n")
cat("  Total harvests tested:", nrow(detection_summary), "\n")
cat("  Successfully detected:", sum(!is.na(detection_summary$detected_date)), "\n")
cat("  Detection rate:", round(100 * sum(!is.na(detection_summary$detected_date)) / nrow(detection_summary), 1), "%\n\n")

if (sum(!is.na(detection_summary$days_offset)) > 0) {
  cat("Detection Timing:\n")
  cat("  Average days from actual:", round(mean(detection_summary$days_offset, na.rm = TRUE), 1), "\n")
  cat("  Median days from actual:", round(median(detection_summary$days_offset, na.rm = TRUE), 1), "\n")
  cat("  Detected before harvest:", sum(detection_summary$days_offset < 0, na.rm = TRUE), "\n")
  cat("  Detected on harvest day:", sum(detection_summary$days_offset == 0, na.rm = TRUE), "\n")
  cat("  Detected after harvest:", sum(detection_summary$days_offset > 0, na.rm = TRUE), "\n")
}

cat("\n=== TEST COMPLETE ===\n")
cat("\nNext steps:\n")
cat("1. Review detection timing and adjust thresholds if needed\n")
cat("2. Expand to all fields in dataset\n")
cat("3. Analyze which configuration gives best early detection\n")