SmartCane/r_app/experiments/harvest_prediction/test_bfast_smoothing.R

# ============================================================================
# TEST DIFFERENT SMOOTHING APPROACHES FOR BFAST HARVEST DETECTION
# ============================================================================
# The CI data is very noisy - test multiple smoothing strategies to see
# if better smoothing improves BFAST's ability to detect harvest events
# ============================================================================

suppressPackageStartupMessages({
  library(readxl)
  library(dplyr)
  library(tidyr)
  library(lubridate)
  library(here)
  library(bfast)
  library(zoo)
  library(ggplot2)
})

project_dir <- "esa"
assign("project_dir", project_dir, envir = .GlobalEnv)

if (basename(getwd()) == "harvest_prediction") {
  setwd("../../..")
}

source(here("r_app", "parameters_project.R"))

cat("============================================================================\n")
cat("TESTING SMOOTHING STRATEGIES FOR BFAST\n")
cat("============================================================================\n\n")

# Load CI data
ci_rds_file <- here("laravel_app/storage/app", project_dir,
                    "Data/extracted_ci/cumulative_vals/All_pivots_Cumulative_CI_quadrant_year_v2.rds")
ci_data_raw <- readRDS(ci_rds_file) %>% ungroup()

time_series_daily <- ci_data_raw %>%
  mutate(date = as.Date(Date)) %>%
  select(field_id = field, date, mean_ci = FitData) %>%
  filter(!is.na(mean_ci), !is.na(date), !is.na(field_id)) %>%
  arrange(field_id, date)

# Load harvest data
harvest_file <- here("laravel_app/storage/app", project_dir, "Data/harvest.xlsx")
harvest_actual <- read_excel(harvest_file) %>%
  mutate(season_end = as.Date(season_end)) %>%
  filter(!is.na(season_end))

# Pick a test field with known harvests and good data coverage
test_field <- "KHWB"  # From the plot you showed
test_harvests <- harvest_actual %>% filter(field == test_field)

cat("Test field:", test_field, "\n")
cat("Actual harvests:", nrow(test_harvests), "\n\n")

field_data <- time_series_daily %>%
  filter(field_id == test_field) %>%
  arrange(date)

cat("CI observations:", nrow(field_data), "\n")
cat("Date range:", format(min(field_data$date), "%Y-%m-%d"), "to",
    format(max(field_data$date), "%Y-%m-%d"), "\n\n")

# ============================================================================
# DEFINE SMOOTHING STRATEGIES TO TEST
# ============================================================================

# QUICK TEST MODE: Only test best configuration
# To test all configurations, uncomment the full list below
smoothing_strategies <- list(
  ma7 = list(
    name = "7-day moving average",
    smooth_fun = function(x) rollmean(x, k = 7, fill = NA, align = "center")
  )
)

# Full test suite (uncomment to test all smoothing strategies)
# smoothing_strategies <- list(
#   raw = list(
#     name = "No smoothing (raw)",
#     smooth_fun = function(x) x
#   ),
#
#   ma7 = list(
#     name = "7-day moving average",
#     smooth_fun = function(x) rollmean(x, k = 7, fill = NA, align = "center")
#   ),
#
#   ma14 = list(
#     name = "14-day moving average",
#     smooth_fun = function(x) rollmean(x, k = 14, fill = NA, align = "center")
#   ),
#
#   ma21 = list(
#     name = "21-day moving average",
#     smooth_fun = function(x) rollmean(x, k = 21, fill = NA, align = "center")
#   ),
#
#   ma30 = list(
#     name = "30-day moving average",
#     smooth_fun = function(x) rollmean(x, k = 30, fill = NA, align = "center")
#   ),
#
#   median14 = list(
#     name = "14-day rolling median",
#     smooth_fun = function(x) rollmedian(x, k = 14, fill = NA, align = "center")
#   ),
#
#   median21 = list(
#     name = "21-day rolling median",
#     smooth_fun = function(x) rollmedian(x, k = 21, fill = NA, align = "center")
#   ),
#
#   loess = list(
#     name = "LOESS smoothing (span=0.1)",
#     smooth_fun = function(x) {
#       idx <- seq_along(x)
#       fit <- loess(x ~ idx, span = 0.1)
#       predict(fit)
#     }
#   )
# )

# ============================================================================
# TEST BFAST WITH DIFFERENT SMOOTHING + PARAMETERS
# ============================================================================

cat("============================================================================\n")
cat("TESTING CONFIGURATIONS\n")
cat("============================================================================\n\n")

# QUICK TEST MODE: Only test best configuration
bfast_configs <- list(
  config4 = list(h = 0.10, season = "none", name = "h=0.10, no season")
)

# Full test suite (uncomment to test all BFAST configurations)
# bfast_configs <- list(
#   config1 = list(h = 0.15, season = "harmonic", name = "h=0.15, harmonic"),
#   config2 = list(h = 0.10, season = "harmonic", name = "h=0.10, harmonic"),
#   config3 = list(h = 0.15, season = "none", name = "h=0.15, no season"),
#   config4 = list(h = 0.10, season = "none", name = "h=0.10, no season")
# )

results <- list()

for (smooth_name in names(smoothing_strategies)) {
  smooth_config <- smoothing_strategies[[smooth_name]]

  cat("\n--- Testing:", smooth_config$name, "---\n")

  # Apply smoothing
  field_ts <- field_data %>%
    mutate(ci_smooth = smooth_config$smooth_fun(mean_ci))

  # Fill NAs
  field_ts$ci_smooth <- na.approx(field_ts$ci_smooth, rule = 2)

  # Create regular time series
  date_seq <- seq.Date(min(field_ts$date), max(field_ts$date), by = "1 day")
  ts_regular <- data.frame(date = date_seq) %>%
    left_join(field_ts %>% select(date, ci_smooth), by = "date")

  ts_regular$ci_smooth <- na.approx(ts_regular$ci_smooth, rule = 2)

  # Convert to ts object
  start_year <- as.numeric(format(min(ts_regular$date), "%Y"))
  start_doy <- as.numeric(format(min(ts_regular$date), "%j"))

  ts_obj <- ts(ts_regular$ci_smooth,
               start = c(start_year, start_doy),
               frequency = 365)

  # Test each BFAST configuration
  for (config_name in names(bfast_configs)) {
    config <- bfast_configs[[config_name]]

    bfast_result <- tryCatch({
      bfast(ts_obj,
            h = config$h,
            season = config$season,
            max.iter = 10,
            breaks = NULL)
    }, error = function(e) {
      NULL
    })

    if (!is.null(bfast_result)) {
      # Extract breaks
      bp_component <- bfast_result$output[[1]]$bp.Vt

      if (!is.null(bp_component) && length(bp_component$breakpoints) > 0) {
        bp_indices <- bp_component$breakpoints
        bp_indices <- bp_indices[!is.na(bp_indices)]

        if (length(bp_indices) > 0) {
          bp_dates <- ts_regular$date[bp_indices]

          # Apply minimum harvest interval filter (250 days)
          if (length(bp_dates) > 1) {
            # Calculate CI drops at each break
            ci_drops <- sapply(bp_indices, function(idx) {
              if (idx > 10 && idx < (length(ts_regular$ci_smooth) - 10)) {
                before_ci <- mean(ts_regular$ci_smooth[(idx-10):(idx-1)], na.rm = TRUE)
                after_ci <- mean(ts_regular$ci_smooth[(idx+1):(idx+10)], na.rm = TRUE)
                return(after_ci - before_ci)
              } else {
                return(0)
              }
            })

            # Keep breaks that are at least 250 days apart
            keep_breaks <- rep(TRUE, length(bp_dates))
            bp_dates_sorted <- sort(bp_dates)

            for (i in 2:length(bp_dates_sorted)) {
              days_since_last <- as.numeric(bp_dates_sorted[i] - bp_dates_sorted[i-1])
              if (days_since_last < 250) {
                # Keep the one with larger CI drop
                idx_current <- which(bp_dates == bp_dates_sorted[i])
                idx_previous <- which(bp_dates == bp_dates_sorted[i-1])

                if (abs(ci_drops[idx_current]) < abs(ci_drops[idx_previous])) {
                  keep_breaks[idx_current] <- FALSE
                } else {
                  keep_breaks[idx_previous] <- FALSE
                }
              }
            }

            bp_dates <- bp_dates[keep_breaks]
          }

          # Check matches with actual harvests
          matches <- sapply(test_harvests$season_end, function(h_date) {
            min(abs(as.numeric(bp_dates - h_date)))
          })

          best_match_days <- min(matches)
          matched_within_2w <- sum(matches <= 14)
          matched_within_4w <- sum(matches <= 28)

          result_entry <- list(
            smoothing = smooth_config$name,
            bfast_config = config$name,
            n_breaks = length(bp_dates),
            n_harvests = nrow(test_harvests),
            matched_2w = matched_within_2w,
            matched_4w = matched_within_4w,
            best_match_days = best_match_days,
            break_dates = bp_dates
          )

          results[[paste(smooth_name, config_name, sep = "_")]] <- result_entry

          cat("  ", config$name, ": ", length(bp_dates), " breaks, ",
              matched_within_2w, "/", nrow(test_harvests), " matched (±2w), ",
              "best: ", best_match_days, " days\n", sep = "")
        } else {
          cat("  ", config$name, ": No breaks detected\n", sep = "")
        }
      } else {
        cat("  ", config$name, ": No breaks detected\n", sep = "")
      }
    } else {
      cat("  ", config$name, ": BFAST failed\n", sep = "")
    }
  }
}

# ============================================================================
# FIND BEST CONFIGURATION
# ============================================================================

cat("\n\n============================================================================\n")
cat("BEST CONFIGURATIONS\n")
cat("============================================================================\n\n")

results_df <- bind_rows(lapply(names(results), function(name) {
  r <- results[[name]]
  data.frame(
    config = paste(r$smoothing, "|", r$bfast_config),
    n_breaks = r$n_breaks,
    matched_2w = r$matched_2w,
    matched_4w = r$matched_4w,
    match_rate_2w = round(100 * r$matched_2w / r$n_harvests, 1),
    match_rate_4w = round(100 * r$matched_4w / r$n_harvests, 1),
    best_match_days = r$best_match_days
  )
}))

# Sort by match rate
results_df <- results_df %>%
  arrange(desc(match_rate_2w), best_match_days)

cat("Top configurations (sorted by 2-week match rate):\n\n")
print(results_df, row.names = FALSE)

# ============================================================================
# VISUALIZE BEST CONFIGURATION
# ============================================================================

if (nrow(results_df) > 0 && results_df$matched_2w[1] > 0) {
  best_config_name <- strsplit(as.character(results_df$config[1]), " \\| ")[[1]]
  best_smooth <- names(smoothing_strategies)[sapply(smoothing_strategies, function(s) s$name == best_config_name[1])]
  best_bfast <- names(bfast_configs)[sapply(bfast_configs, function(c) c$name == best_config_name[2])]

  cat("\n\nGenerating visualization for BEST configuration...\n")
  cat("Smoothing:", best_config_name[1], "\n")
  cat("BFAST:", best_config_name[2], "\n\n")

  # Recreate with best config
  smooth_config <- smoothing_strategies[[best_smooth[1]]]
  bfast_config <- bfast_configs[[best_bfast[1]]]

  field_ts <- field_data %>%
    mutate(ci_smooth = smooth_config$smooth_fun(mean_ci))
  field_ts$ci_smooth <- na.approx(field_ts$ci_smooth, rule = 2)

  date_seq <- seq.Date(min(field_ts$date), max(field_ts$date), by = "1 day")
  ts_regular <- data.frame(date = date_seq) %>%
    left_join(field_ts %>% select(date, ci_smooth), by = "date")
  ts_regular$ci_smooth <- na.approx(ts_regular$ci_smooth, rule = 2)

  start_year <- as.numeric(format(min(ts_regular$date), "%Y"))
  start_doy <- as.numeric(format(min(ts_regular$date), "%j"))
  ts_obj <- ts(ts_regular$ci_smooth, start = c(start_year, start_doy), frequency = 365)

  bfast_result <- bfast(ts_obj, h = bfast_config$h, season = bfast_config$season,
                       max.iter = 10, breaks = NULL)

  bp_component <- bfast_result$output[[1]]$bp.Vt
  bp_indices <- bp_component$breakpoints[!is.na(bp_component$breakpoints)]
  bp_dates <- ts_regular$date[bp_indices]

  # Create comprehensive plot
  p <- ggplot() +
    # Raw data (light)
    geom_line(data = field_data, aes(x = date, y = mean_ci),
              color = "gray70", alpha = 0.4, linewidth = 0.5) +
    # Smoothed data
    geom_line(data = ts_regular, aes(x = date, y = ci_smooth),
              color = "darkgreen", linewidth = 1) +
    # Actual harvest dates
    geom_vline(data = test_harvests, aes(xintercept = season_end),
               color = "red", linetype = "dashed", linewidth = 1.2) +
    # BFAST breaks
    geom_vline(data = data.frame(break_date = bp_dates),
               aes(xintercept = break_date),
               color = "blue", linetype = "solid", linewidth = 1.5, alpha = 0.7) +
    # Labels
    geom_text(data = test_harvests,
              aes(x = season_end, y = max(ts_regular$ci_smooth) * 1.05,
                  label = format(season_end, "%Y-%m-%d")),
              angle = 90, vjust = -0.5, size = 3, color = "red", fontface = "bold") +
    labs(
      title = paste0("Field ", test_field, " - BEST Configuration"),
      subtitle = paste0(
        "Smoothing: ", best_config_name[1], " | BFAST: ", best_config_name[2], "\n",
        "Red dashed = Actual (", nrow(test_harvests), ") | ",
        "Blue solid = Detected (", length(bp_dates), ") | ",
        "Match rate: ", results_df$match_rate_2w[1], "% (±2w)"
      ),
      x = "Date",
      y = "CI",
      caption = "Gray = Raw data | Green = Smoothed data"
    ) +
    theme_minimal() +
    theme(
      plot.title = element_text(face = "bold", size = 14),
      plot.subtitle = element_text(size = 10),
      legend.position = "bottom"
    )

  output_dir <- here("r_app/experiments/harvest_prediction")
  ggsave(
    file.path(output_dir, "bfast_BEST_smoothing.png"),
    p, width = 16, height = 8, dpi = 300
  )

  cat("✓ Saved: bfast_BEST_smoothing.png\n")
}

# ============================================================================
# RECOMMENDATIONS
# ============================================================================

cat("\n\n============================================================================\n")
cat("RECOMMENDATIONS\n")
cat("============================================================================\n\n")

if (nrow(results_df) > 0 && max(results_df$match_rate_2w) > 0) {
  cat("✓ Found configurations that improve detection:\n\n")
  cat("Best smoothing:", best_config_name[1], "\n")
  cat("Best BFAST params:", best_config_name[2], "\n")
  cat("Match rate (±2 weeks):", results_df$match_rate_2w[1], "%\n")
  cat("Match rate (±4 weeks):", results_df$match_rate_4w[1], "%\n\n")

  cat("Key insights:\n")
  if (grepl("21-day|30-day", best_config_name[1])) {
    cat("- Heavy smoothing (21-30 days) works better than light smoothing\n")
    cat("- This suggests harvest signal is gradual, not abrupt\n")
  }
  if (grepl("median", best_config_name[1])) {
    cat("- Median smoothing works better than mean (more robust to outliers)\n")
  }
  if (grepl("none", best_config_name[2])) {
    cat("- No seasonal model works better (harvest may disrupt seasonal patterns)\n")
  }
  if (grepl("h=0.10", best_config_name[2])) {
    cat("- Smaller h (0.10) allows more breaks (more sensitive detection)\n")
  }
} else {
  cat("⚠ No configuration achieved successful matches\n\n")
  cat("This confirms BFAST may not be suitable because:\n")
  cat("- Harvest doesn't create clear structural breaks\n")
  cat("- CI changes are too gradual\n")
  cat("- Noise obscures the harvest signal even with heavy smoothing\n\n")
  cat("Consider alternative approaches:\n")
  cat("1. Threshold-based: Sustained CI < 2.0 for 14+ days = harvest\n")
  cat("2. Minimum detection: Find local minima in smoothed CI\n")
  cat("3. Crop age model: Expected harvest based on planting date + growth days\n")
}

cat("\n============================================================================\n")
cat("ANALYSIS COMPLETE\n")
cat("============================================================================\n")