SmartCane/r_app/experiments/harvest_prediction/test_bfastmonitor.R

# ============================================================================
# BFAST MONITOR TEST - Real-time Harvest Detection
# ============================================================================
# Use bfastmonitor() which is designed for:
# - A stable historical baseline period
# - Monitoring recent period for breaks
# - Real-time change detection
# ============================================================================

suppressPackageStartupMessages({
  library(readxl)
  library(dplyr)
  library(tidyr)
  library(lubridate)
  library(here)
  library(bfast)
  library(zoo)
  library(ggplot2)
})

# Set project directory
project_dir <- "esa"
assign("project_dir", project_dir, envir = .GlobalEnv)

if (basename(getwd()) == "harvest_prediction") {
  setwd("../../..")
}

source(here("r_app", "parameters_project.R"))

# ============================================================================
# CONFIGURATION
# ============================================================================

CONFIG <- list(
  # FIELD SELECTION - Change these to test different fields
  test_field = "00302",           # Try: 00007, 00104, 00119, 00120, etc.
  test_harvest_index = 3,         # Which harvest event (1 = first, 2 = second, etc.)

  # HISTORY/MONITORING SPLIT
  history_days = 300,             # Fixed: use first 300 days as baseline history

  # Analysis extends to this many days after harvest
  days_after_harvest = 20,

  # SMOOTHING - Test different values
  smoothing_windows = c(1, 7, 14),  # 1 = no smoothing, 7 = weekly, 14 = biweekly

  # BFASTMONITOR PARAMETERS (tweakable!)
  # 1. Formula: how to model the baseline
  #    - response ~ trend : simple linear trend
  #    - response ~ trend + harmon : add seasonal harmonics
  formula = response ~ trend,

  # 2. Order: if using harmon, how many harmonics (1-3 typical)
  order = 1,

  # 3. Type: type of monitoring process
  #    - "OLS-MOSUM" (default): moving sum (good for gradual changes)
  #    - "OLS-CUSUM": cumulative sum (good for abrupt changes)
  #    - "RE": recursive estimates
  #    - "ME": moving estimates
  type = "OLS-MOSUM",

  # 4. h: bandwidth for moving/recursive estimates (0.15-0.5 typical)
  #    smaller = more sensitive to recent changes
  h = 0.25,

  # 5. level: significance level for break detection (0.01-0.1)
  #    lower = stricter (fewer false positives)
  level = 0.05
)

cat("============================================================================\n")
cat("BFAST MONITOR - REAL-TIME HARVEST DETECTION\n")
cat("============================================================================\n\n")

# ============================================================================
# LOAD DATA
# ============================================================================

ci_rds_file <- here("laravel_app/storage/app", project_dir,
                    "Data/extracted_ci/cumulative_vals/All_pivots_Cumulative_CI_quadrant_year_v2.rds")
ci_data_raw <- readRDS(ci_rds_file) %>% ungroup()

time_series_daily <- ci_data_raw %>%
  mutate(date = as.Date(Date)) %>%
  select(field_id = field, date, mean_ci = FitData) %>%
  filter(!is.na(mean_ci), !is.na(date), !is.na(field_id)) %>%
  arrange(field_id, date)

harvest_data <- read_excel('laravel_app/storage/app/esa/Data/harvest.xlsx') %>%
  mutate(
    season_start = as.Date(season_start),
    season_end = as.Date(season_end)
  ) %>%
  filter(!is.na(season_end))

# Check which fields have CI data
fields_with_ci <- unique(time_series_daily$field_id)

cat("Fields with CI data:", length(fields_with_ci), "\n")
cat("Sample fields:", paste(head(fields_with_ci, 20), collapse = ", "), "\n\n")

# Filter harvest data to only fields that have CI data
harvest_data_with_ci <- harvest_data %>%
  filter(field %in% fields_with_ci)

cat("Available fields with BOTH harvest data AND CI data:\n")
field_summary <- harvest_data_with_ci %>%
  group_by(field) %>%
  summarise(
    n_harvests = n(),
    first_harvest = min(season_end),
    last_harvest = max(season_end),
    .groups = "drop"
  ) %>%
  arrange(field)
print(field_summary, n = 50)
cat("\n")

# Get test field and harvest
field_harvests <- harvest_data_with_ci %>%
  filter(field == CONFIG$test_field) %>%
  arrange(season_end)

if (nrow(field_harvests) == 0) {
  stop("Field ", CONFIG$test_field, " does not have both CI data and harvest records. ",
       "Please choose from the fields listed above.")
}

if (nrow(field_harvests) < CONFIG$test_harvest_index) {
  stop("Field ", CONFIG$test_field, " only has ", nrow(field_harvests),
       " harvest events, but you requested harvest #", CONFIG$test_harvest_index)
}

test_harvest <- field_harvests[CONFIG$test_harvest_index, ]
harvest_date <- test_harvest$season_end

if (CONFIG$test_harvest_index == 1) {
  season_start <- test_harvest$season_start
  if (is.na(season_start)) {
    season_start <- min(time_series_daily$date[time_series_daily$field_id == CONFIG$test_field])
  }
} else {
  season_start <- field_harvests$season_end[CONFIG$test_harvest_index - 1]
}

# Prepare field time series
end_date <- harvest_date + CONFIG$days_after_harvest

field_ts <- time_series_daily %>%
  filter(field_id == CONFIG$test_field,
         date >= season_start,
         date <= end_date) %>%
  arrange(date)

season_length <- as.numeric(harvest_date - season_start)
history_days <- CONFIG$history_days  # Use fixed 300 days
history_end <- season_start + history_days

cat("Field:", CONFIG$test_field, "\n")
cat("Season start:", format(season_start, "%Y-%m-%d"), "\n")
cat("Harvest date:", format(harvest_date, "%Y-%m-%d"), "\n")
cat("Analysis end:", format(end_date, "%Y-%m-%d"), "\n")
cat("Season length:", season_length, "days\n\n")

cat("History period (baseline):", format(season_start, "%Y-%m-%d"), "to",
    format(history_end, "%Y-%m-%d"), "(", history_days, "days)\n")
cat("Monitoring period:", format(history_end + 1, "%Y-%m-%d"), "to",
    format(end_date, "%Y-%m-%d"), "\n\n")

# ============================================================================
# PREPARE TIME SERIES
# ============================================================================

# Create regular time series
date_seq <- seq.Date(min(field_ts$date), max(field_ts$date), by = "1 day")
ts_regular <- data.frame(date = date_seq) %>%
  left_join(field_ts, by = "date")

# Interpolate missing values
ts_regular$mean_ci_interp <- na.approx(ts_regular$mean_ci, rule = 2)

# ============================================================================
# TEST DIFFERENT SMOOTHING WINDOWS
# ============================================================================

cat("============================================================================\n")
cat("TESTING DIFFERENT SMOOTHING WINDOWS\n")
cat("============================================================================\n\n")

all_results <- list()

for (smooth_window in CONFIG$smoothing_windows) {

  cat("----------------------------------------------------------------------------\n")
  cat("SMOOTHING WINDOW:", smooth_window, "days\n")
  cat("----------------------------------------------------------------------------\n\n")

  # Apply smoothing
  if (smooth_window == 1) {
    ts_regular$mean_ci_smooth <- ts_regular$mean_ci_interp
    smooth_label <- "No smoothing"
  } else {
    ts_regular$mean_ci_smooth <- rollmean(ts_regular$mean_ci_interp,
                                          k = smooth_window,
                                          fill = NA,
                                          align = "center")
    # Fill NAs at edges
    ts_regular$mean_ci_smooth <- na.approx(ts_regular$mean_ci_smooth, rule = 2)
    smooth_label <- paste0(smooth_window, "-day moving average")
  }

  # Convert to ts object
  start_year <- as.numeric(format(min(ts_regular$date), "%Y"))
  start_doy <- as.numeric(format(min(ts_regular$date), "%j"))

  ts_obj <- ts(ts_regular$mean_ci_smooth,
               start = c(start_year, start_doy),
               frequency = 365)

  # Calculate start point for monitoring period
  history_start_decimal <- as.numeric(start_year) + (start_doy - 1) / 365
  history_end_decimal <- history_start_decimal + (history_days / 365)


  cat("Time series with", smooth_label, "\n")
  cat("  Length:", length(ts_obj), "observations\n\n")

  # ============================================================================
  # RUN BFASTMONITOR
  # ============================================================================

  tryCatch({
    # Run bfastmonitor
    bfm_result <- bfastmonitor(
      data = ts_obj,
      start = history_end_decimal,
      formula = CONFIG$formula,
      order = CONFIG$order,
      type = CONFIG$type,
      h = CONFIG$h,
      level = CONFIG$level
    )

    cat("bfastmonitor completed successfully\n\n")

    # Check if break was detected
    if (!is.na(bfm_result$breakpoint)) {
      # Convert breakpoint back to date
      bp_decimal <- bfm_result$breakpoint
      bp_year <- floor(bp_decimal)
      bp_doy <- round((bp_decimal - bp_year) * 365) + 1
      bp_date <- as.Date(paste0(bp_year, "-01-01")) + bp_doy - 1

      days_from_harvest <- as.numeric(bp_date - harvest_date)

      cat("✓ BREAKPOINT DETECTED\n")
      cat("  Break date:", format(bp_date, "%Y-%m-%d"), "\n")
      cat("  Days from harvest:", days_from_harvest, "\n")

      if (abs(days_from_harvest) <= 7) {
        cat("  >>> ✓✓✓ HARVEST DETECTED WITHIN 7 DAYS! ✓✓✓ <<<\n")
      } else if (abs(days_from_harvest) <= 14) {
        cat("  >>> ✓ HARVEST DETECTED WITHIN 14 DAYS <<<\n")
      } else if (days_from_harvest < 0) {
        cat("  Break occurred", abs(days_from_harvest), "days BEFORE harvest\n")
      } else {
        cat("  Break occurred", days_from_harvest, "days AFTER harvest\n")
      }

      cat("  Break magnitude:", round(bfm_result$magnitude, 3), "\n\n")

      # Store results
      all_results[[as.character(smooth_window)]] <- list(
        smooth_window = smooth_window,
        smooth_label = smooth_label,
        break_detected = TRUE,
        break_date = bp_date,
        days_from_harvest = days_from_harvest,
        magnitude = bfm_result$magnitude,
        bfm_result = bfm_result
      )

    } else {
      cat("❌ No breakpoint detected in monitoring period\n\n")

      all_results[[as.character(smooth_window)]] <- list(
        smooth_window = smooth_window,
        smooth_label = smooth_label,
        break_detected = FALSE
      )
    }

  }, error = function(e) {
    cat("ERROR:", e$message, "\n\n")
    all_results[[as.character(smooth_window)]] <- list(
      smooth_window = smooth_window,
      smooth_label = smooth_label,
      error = e$message
    )
  })

  cat("\n")
}

# ============================================================================
# SUMMARY OF ALL SMOOTHING TESTS
# ============================================================================

cat("============================================================================\n")
cat("SUMMARY - EFFECT OF SMOOTHING ON HARVEST DETECTION\n")
cat("============================================================================\n\n")

summary_df <- data.frame()

for (sw in names(all_results)) {
  result <- all_results[[sw]]

  if (!is.null(result$error)) {
    summary_df <- rbind(summary_df, data.frame(
      smoothing_window = result$smooth_window,
      label = result$smooth_label,
      break_detected = "ERROR",
      break_date = NA,
      days_from_harvest = NA,
      magnitude = NA
    ))
  } else if (result$break_detected) {
    summary_df <- rbind(summary_df, data.frame(
      smoothing_window = result$smooth_window,
      label = result$smooth_label,
      break_detected = "YES",
      break_date = format(result$break_date, "%Y-%m-%d"),
      days_from_harvest = result$days_from_harvest,
      magnitude = round(result$magnitude, 3)
    ))
  } else {
    summary_df <- rbind(summary_df, data.frame(
      smoothing_window = result$smooth_window,
      label = result$smooth_label,
      break_detected = "NO",
      break_date = NA,
      days_from_harvest = NA,
      magnitude = NA
    ))
  }
}

print(summary_df, row.names = FALSE)

cat("\n")

# Find best result (closest to harvest)
best_result <- NULL
min_days <- Inf

for (result in all_results) {
  if (!is.null(result$days_from_harvest) && !is.na(result$days_from_harvest)) {
    if (abs(result$days_from_harvest) < min_days) {
      min_days <- abs(result$days_from_harvest)
      best_result <- result
    }
  }
}

if (!is.null(best_result)) {
  cat("BEST RESULT:\n")
  cat("  Smoothing:", best_result$smooth_label, "\n")
  cat("  Break date:", format(best_result$break_date, "%Y-%m-%d"), "\n")
  cat("  Days from harvest:", best_result$days_from_harvest, "\n")
  cat("  Magnitude:", round(best_result$magnitude, 3), "\n\n")

  # Save plot for best result
  output_dir <- here("r_app/experiments/harvest_prediction")

  png(file.path(output_dir, "bfastmonitor_best_smoothing.png"),
      width = 12, height = 8, units = "in", res = 300)
  plot(best_result$bfm_result,
       main = paste0("bfastmonitor - Field ", CONFIG$test_field,
                     " (", best_result$smooth_label, ")"))
  abline(v = decimal_date(harvest_date), col = "red", lty = 2, lwd = 2)
  legend("topleft", legend = c("Actual Harvest"), col = "red", lty = 2, lwd = 2)
  dev.off()

  cat("Saved best result plot: bfastmonitor_best_smoothing.png\n")
}

cat("\n============================================================================\n")
cat("ANALYSIS COMPLETE\n")
cat("============================================================================\n")