SmartCane/r_app/experiments/harvest_prediction/test_bfast_rolling.R

# ============================================================================
# BFAST ROLLING WINDOW TEST - Single Season Analysis
# ============================================================================
# Test bfast's ability to detect harvest breakpoints progressively
# For each day from -20 to +20 days around harvest:
#   1. Load CI data from previous harvest to current test day
#   2. Run bfast to detect breaks
#   3. Check if harvest date is detected as a breakpoint
# ============================================================================

suppressPackageStartupMessages({
  library(readxl)
  library(dplyr)
  library(tidyr)
  library(lubridate)
  library(here)
  library(bfast)
  library(zoo)
  library(ggplot2)
})

# Set project directory
project_dir <- "esa"
assign("project_dir", project_dir, envir = .GlobalEnv)

# Navigate to project root if in experiments folder
if (basename(getwd()) == "harvest_prediction") {
  setwd("../../..")
}

source(here("r_app", "parameters_project.R"))

# ============================================================================
# CONFIGURATION
# ============================================================================

CONFIG <- list(
  test_field = "KHWC",           # Field to test
  test_harvest_index = 2,         # Which harvest event (1 = first, 2 = second, etc.)
  days_before = 20,               # Days before harvest to start testing
  days_after = 20,                # Days after harvest to end testing

  # bfast parameters
  h = 0.15,                       # Minimal segment size (15% of data)
  season = "none",                # Seasonal model type ("dummy", "harmonic", "none")
  max_iter = 10,                  # Maximum iterations
  breaks = NULL                   # Let bfast determine number of breaks
)

cat("============================================================================\n")
cat("BFAST ROLLING WINDOW TEST - PROGRESSIVE HARVEST DETECTION\n")
cat("============================================================================\n\n")

cat("Test configuration:\n")
cat("  Field:", CONFIG$test_field, "\n")
cat("  Harvest event:", CONFIG$test_harvest_index, "\n")
cat("  Test window:", CONFIG$days_before, "days before to", CONFIG$days_after, "days after\n")
cat("  bfast h parameter:", CONFIG$h, "\n")
cat("  Seasonal model:", CONFIG$season, "\n\n")

# ============================================================================
# LOAD DATA
# ============================================================================

cat("=== LOADING DATA ===\n\n")

ci_rds_file <- here("laravel_app/storage/app", project_dir,
                    "Data/extracted_ci/cumulative_vals/All_pivots_Cumulative_CI_quadrant_year_v2.rds")
ci_data_raw <- readRDS(ci_rds_file) %>% ungroup()

time_series_daily <- ci_data_raw %>%
  mutate(date = as.Date(Date)) %>%
  select(field_id = field, date, mean_ci = FitData) %>%
  filter(!is.na(mean_ci), !is.na(date), !is.na(field_id)) %>%
  arrange(field_id, date)

harvest_data <- read_excel('laravel_app/storage/app/esa/Data/harvest.xlsx') %>%
  mutate(
    season_start = as.Date(season_start),
    season_end = as.Date(season_end)
  ) %>%
  filter(!is.na(season_end))

# ============================================================================
# GET TEST FIELD AND HARVEST EVENT
# ============================================================================

field_harvests <- harvest_data %>%
  filter(field == CONFIG$test_field) %>%
  arrange(season_end)

if (nrow(field_harvests) < CONFIG$test_harvest_index) {
  stop("Field ", CONFIG$test_field, " does not have harvest event #", CONFIG$test_harvest_index)
}

test_harvest <- field_harvests[CONFIG$test_harvest_index, ]
harvest_date <- test_harvest$season_end

# Get previous harvest date to define season start
if (CONFIG$test_harvest_index == 1) {
  season_start <- test_harvest$season_start
  if (is.na(season_start)) {
    # If no season_start, use earliest data for this field
    season_start <- min(time_series_daily$date[time_series_daily$field_id == CONFIG$test_field])
  }
} else {
  season_start <- field_harvests$season_end[CONFIG$test_harvest_index - 1]
}

cat("Test harvest event:\n")
cat("  Field:", CONFIG$test_field, "\n")
cat("  Season start:", format(season_start, "%Y-%m-%d"), "\n")
cat("  Harvest date:", format(harvest_date, "%Y-%m-%d"), "\n")
cat("  Season length:", as.numeric(harvest_date - season_start), "days\n\n")

# ============================================================================
# PREPARE FIELD TIME SERIES
# ============================================================================

field_ts <- time_series_daily %>%
  filter(field_id == CONFIG$test_field,
         date >= season_start,
         date <= harvest_date + CONFIG$days_after) %>%
  arrange(date)

cat("CI data available:\n")
cat("  Total observations:", nrow(field_ts), "\n")
cat("  Date range:", format(min(field_ts$date), "%Y-%m-%d"), "to",
    format(max(field_ts$date), "%Y-%m-%d"), "\n\n")

# ============================================================================
# BFAST WRAPPER FUNCTION
# ============================================================================

run_bfast_on_window <- function(ts_data, end_date, harvest_date, config = CONFIG) {
  # Filter data up to end_date
  window_data <- ts_data %>%
    filter(date <= end_date) %>%
    arrange(date)

  if (nrow(window_data) < 50) {
    return(list(
      success = FALSE,
      reason = "insufficient_data",
      n_obs = nrow(window_data),
      breaks_detected = 0,
      harvest_detected = FALSE
    ))
  }

  # Create regular time series (fill gaps with NA for now, bfast can handle)
  date_seq <- seq.Date(min(window_data$date), max(window_data$date), by = "1 day")
  ts_regular <- data.frame(date = date_seq) %>%
    left_join(window_data, by = "date")

  # Interpolate missing values (linear interpolation)
  ts_regular$mean_ci_interp <- na.approx(ts_regular$mean_ci, rule = 2)

  # Convert to ts object (yearly frequency = 365 days)
  start_year <- as.numeric(format(min(ts_regular$date), "%Y"))
  start_doy <- as.numeric(format(min(ts_regular$date), "%j"))

  ts_obj <- ts(ts_regular$mean_ci_interp,
               start = c(start_year, start_doy),
               frequency = 365)

  # Run bfast
  result <- tryCatch({
    bfast_result <- bfast(ts_obj,
                          h = config$h,
                          season = config$season,
                          max.iter = config$max_iter,
                          breaks = config$breaks)

    # Check if any breaks were detected in the trend component
    if (!is.null(bfast_result$output[[1]]$bp.Vt) &&
        length(bfast_result$output[[1]]$bp.Vt$breakpoints) > 0) {

      # Get breakpoint dates
      bp_indices <- bfast_result$output[[1]]$bp.Vt$breakpoints
      bp_indices <- bp_indices[!is.na(bp_indices)]

      if (length(bp_indices) > 0) {
        bp_dates <- ts_regular$date[bp_indices]

        # Check if any breakpoint is close to harvest date (within 14 days)
        days_from_harvest <- as.numeric(bp_dates - harvest_date)
        harvest_detected <- any(abs(days_from_harvest) <= 14)

        closest_bp <- bp_dates[which.min(abs(days_from_harvest))]
        closest_bp_offset <- as.numeric(closest_bp - harvest_date)

        return(list(
          success = TRUE,
          n_obs = nrow(window_data),
          breaks_detected = length(bp_dates),
          breakpoint_dates = bp_dates,
          days_from_harvest = days_from_harvest,
          harvest_detected = harvest_detected,
          closest_breakpoint = closest_bp,
          closest_bp_offset = closest_bp_offset,
          bfast_object = bfast_result
        ))
      }
    }

    # No breaks detected
    list(
      success = TRUE,
      n_obs = nrow(window_data),
      breaks_detected = 0,
      harvest_detected = FALSE
    )

  }, error = function(e) {
    list(
      success = FALSE,
      reason = paste0("bfast_error: ", e$message),
      n_obs = nrow(window_data),
      breaks_detected = 0,
      harvest_detected = FALSE
    )
  })

  return(result)
}

# ============================================================================
# ROLLING WINDOW TEST
# ============================================================================

cat("============================================================================\n")
cat("RUNNING ROLLING WINDOW TEST\n")
cat("============================================================================\n\n")

test_dates <- seq.Date(
  from = harvest_date - CONFIG$days_before,
  to = harvest_date + CONFIG$days_after,
  by = "1 day"
)

cat("Testing", length(test_dates), "different end dates...\n\n")

results_df <- data.frame()

pb <- txtProgressBar(min = 0, max = length(test_dates), style = 3)

for (i in 1:length(test_dates)) {
  test_date <- test_dates[i]
  days_from_harvest <- as.numeric(test_date - harvest_date)

  result <- run_bfast_on_window(field_ts, test_date, harvest_date, CONFIG)

  results_df <- bind_rows(results_df, data.frame(
    test_date = test_date,
    days_from_harvest = days_from_harvest,
    n_obs = result$n_obs,
    success = result$success,
    breaks_detected = result$breaks_detected,
    harvest_detected = ifelse(is.null(result$harvest_detected), FALSE, result$harvest_detected),
    closest_bp_offset = ifelse(is.null(result$closest_bp_offset), NA, result$closest_bp_offset),
    reason = ifelse(is.null(result$reason), "ok", result$reason)
  ))

  setTxtProgressBar(pb, i)
}

close(pb)

# ============================================================================
# RESULTS SUMMARY
# ============================================================================

cat("\n\n============================================================================\n")
cat("RESULTS SUMMARY\n")
cat("============================================================================\n\n")

cat("Total tests run:", nrow(results_df), "\n")
cat("Successful bfast runs:", sum(results_df$success), "\n")
cat("Failed runs:", sum(!results_df$success), "\n\n")

if (any(!results_df$success)) {
  cat("Failure reasons:\n")
  failure_summary <- results_df %>%
    filter(!success) %>%
    count(reason) %>%
    arrange(desc(n))
  print(failure_summary, row.names = FALSE)
  cat("\n")
}

# When was harvest first detected?
first_detection <- results_df %>%
  filter(harvest_detected == TRUE) %>%
  arrange(days_from_harvest) %>%
  slice(1)

if (nrow(first_detection) > 0) {
  cat("🎯 FIRST HARVEST DETECTION:\n")
  cat("  Test date:", format(first_detection$test_date, "%Y-%m-%d"), "\n")
  cat("  Days from actual harvest:", first_detection$days_from_harvest, "\n")
  cat("  Observations used:", first_detection$n_obs, "\n")
  cat("  Total breaks detected:", first_detection$breaks_detected, "\n")
  cat("  Closest breakpoint offset:", first_detection$closest_bp_offset, "days\n\n")

  if (first_detection$days_from_harvest < 0) {
    cat("  ✓ Detected", abs(first_detection$days_from_harvest), "days BEFORE actual harvest\n")
  } else if (first_detection$days_from_harvest == 0) {
    cat("  ✓ Detected ON harvest day\n")
  } else {
    cat("  ℹ Detected", first_detection$days_from_harvest, "days AFTER actual harvest\n")
  }
  cat("\n")
} else {
  cat("❌ Harvest NOT detected in any test window\n\n")
}

# Show detection pattern over time
cat("============================================================================\n")
cat("DETECTION PATTERN DAY-BY-DAY\n")
cat("============================================================================\n\n")

detection_summary <- results_df %>%
  mutate(
    status = case_when(
      !success ~ paste0("FAIL: ", reason),
      harvest_detected ~ paste0("✓ DETECTED (", breaks_detected, " breaks)"),
      breaks_detected > 0 ~ paste0("○ ", breaks_detected, " breaks (not harvest)"),
      TRUE ~ "○ No breaks"
    )
  ) %>%
  select(
    Date = test_date,
    Days_from_harvest = days_from_harvest,
    N_obs = n_obs,
    Status = status,
    BP_offset = closest_bp_offset
  )

print(as.data.frame(detection_summary), row.names = FALSE)

# ============================================================================
# VISUALIZATION
# ============================================================================

cat("\n\n============================================================================\n")
cat("GENERATING VISUALIZATION\n")
cat("============================================================================\n\n")

# Plot 1: CI time series with harvest date and detection windows
p1 <- ggplot(field_ts, aes(x = date, y = mean_ci)) +
  geom_line(color = "darkgreen", linewidth = 1) +
  geom_vline(xintercept = harvest_date, color = "red", linetype = "dashed", linewidth = 1) +
  geom_vline(xintercept = harvest_date - CONFIG$days_before, color = "blue", linetype = "dotted") +
  geom_vline(xintercept = harvest_date + CONFIG$days_after, color = "blue", linetype = "dotted") +
  annotate("text", x = harvest_date, y = max(field_ts$mean_ci, na.rm = TRUE),
           label = "Actual Harvest", vjust = -0.5, color = "red") +
  labs(
    title = paste0("Field ", CONFIG$test_field, " - CI Time Series"),
    subtitle = paste0("Season: ", format(season_start, "%Y-%m-%d"), " to ",
                      format(harvest_date + CONFIG$days_after, "%Y-%m-%d")),
    x = "Date",
    y = "CI Value"
  ) +
  theme_minimal() +
  theme(plot.title = element_text(face = "bold"))

# Plot 2: Detection success over test window
p2 <- ggplot(results_df %>% filter(success),
             aes(x = days_from_harvest, y = as.numeric(harvest_detected))) +
  geom_line(color = "blue", linewidth = 1) +
  geom_point(aes(color = harvest_detected), size = 2) +
  geom_vline(xintercept = 0, color = "red", linetype = "dashed") +
  scale_color_manual(values = c("FALSE" = "gray", "TRUE" = "green")) +
  labs(
    title = "bfast Harvest Detection Over Time",
    subtitle = paste0("Field ", CONFIG$test_field, " - Harvest ", CONFIG$test_harvest_index),
    x = "Days from Actual Harvest",
    y = "Harvest Detected",
    color = "Detection"
  ) +
  theme_minimal() +
  theme(plot.title = element_text(face = "bold"))

# Plot 3: Number of breaks detected
p3 <- ggplot(results_df %>% filter(success),
             aes(x = days_from_harvest, y = breaks_detected)) +
  geom_line(color = "purple", linewidth = 1) +
  geom_point(aes(color = harvest_detected), size = 2) +
  geom_vline(xintercept = 0, color = "red", linetype = "dashed") +
  scale_color_manual(values = c("FALSE" = "gray", "TRUE" = "green")) +
  labs(
    title = "Number of Breakpoints Detected",
    x = "Days from Actual Harvest",
    y = "Breaks Detected",
    color = "Harvest\nDetected"
  ) +
  theme_minimal() +
  theme(plot.title = element_text(face = "bold"))

# Save plots
output_dir <- here("r_app/experiments/harvest_prediction")
ggsave(file.path(output_dir, "bfast_ci_timeseries.png"), p1, width = 10, height = 6, dpi = 300)
ggsave(file.path(output_dir, "bfast_detection_pattern.png"), p2, width = 10, height = 6, dpi = 300)
ggsave(file.path(output_dir, "bfast_breaks_count.png"), p3, width = 10, height = 6, dpi = 300)

cat("Plots saved to:", output_dir, "\n")
cat("  - bfast_ci_timeseries.png\n")
cat("  - bfast_detection_pattern.png\n")
cat("  - bfast_breaks_count.png\n\n")

# ============================================================================
# SAVE RESULTS
# ============================================================================

output_file <- file.path(output_dir, "bfast_rolling_results.rds")
saveRDS(list(
  config = CONFIG,
  field = CONFIG$test_field,
  harvest_date = harvest_date,
  season_start = season_start,
  results = results_df,
  field_ts = field_ts
), output_file)

cat("Results saved to:", output_file, "\n\n")

cat("============================================================================\n")
cat("ANALYSIS COMPLETE\n")
cat("============================================================================\n")