# ============================================================================ # TEST DIFFERENT SMOOTHING APPROACHES FOR BFAST HARVEST DETECTION # ============================================================================ # The CI data is very noisy - test multiple smoothing strategies to see # if better smoothing improves BFAST's ability to detect harvest events # ============================================================================ suppressPackageStartupMessages({ library(readxl) library(dplyr) library(tidyr) library(lubridate) library(here) library(bfast) library(zoo) library(ggplot2) }) project_dir <- "esa" assign("project_dir", project_dir, envir = .GlobalEnv) if (basename(getwd()) == "harvest_prediction") { setwd("../../..") } source(here("r_app", "parameters_project.R")) cat("============================================================================\n") cat("TESTING SMOOTHING STRATEGIES FOR BFAST\n") cat("============================================================================\n\n") # Load CI data ci_rds_file <- here("laravel_app/storage/app", project_dir, "Data/extracted_ci/cumulative_vals/All_pivots_Cumulative_CI_quadrant_year_v2.rds") ci_data_raw <- readRDS(ci_rds_file) %>% ungroup() time_series_daily <- ci_data_raw %>% mutate(date = as.Date(Date)) %>% select(field_id = field, date, mean_ci = FitData) %>% filter(!is.na(mean_ci), !is.na(date), !is.na(field_id)) %>% arrange(field_id, date) # Load harvest data harvest_file <- here("laravel_app/storage/app", project_dir, "Data/harvest.xlsx") harvest_actual <- read_excel(harvest_file) %>% mutate(season_end = as.Date(season_end)) %>% filter(!is.na(season_end)) # Pick a test field with known harvests and good data coverage test_field <- "KHWB" # From the plot you showed test_harvests <- harvest_actual %>% filter(field == test_field) cat("Test field:", test_field, "\n") cat("Actual harvests:", nrow(test_harvests), "\n\n") field_data <- time_series_daily %>% filter(field_id == test_field) %>% arrange(date) cat("CI observations:", nrow(field_data), "\n") cat("Date range:", format(min(field_data$date), "%Y-%m-%d"), "to", format(max(field_data$date), "%Y-%m-%d"), "\n\n") # ============================================================================ # DEFINE SMOOTHING STRATEGIES TO TEST # ============================================================================ # QUICK TEST MODE: Only test best configuration # To test all configurations, uncomment the full list below smoothing_strategies <- list( ma7 = list( name = "7-day moving average", smooth_fun = function(x) rollmean(x, k = 7, fill = NA, align = "center") ) ) # Full test suite (uncomment to test all smoothing strategies) # smoothing_strategies <- list( # raw = list( # name = "No smoothing (raw)", # smooth_fun = function(x) x # ), # # ma7 = list( # name = "7-day moving average", # smooth_fun = function(x) rollmean(x, k = 7, fill = NA, align = "center") # ), # # ma14 = list( # name = "14-day moving average", # smooth_fun = function(x) rollmean(x, k = 14, fill = NA, align = "center") # ), # # ma21 = list( # name = "21-day moving average", # smooth_fun = function(x) rollmean(x, k = 21, fill = NA, align = "center") # ), # # ma30 = list( # name = "30-day moving average", # smooth_fun = function(x) rollmean(x, k = 30, fill = NA, align = "center") # ), # # median14 = list( # name = "14-day rolling median", # smooth_fun = function(x) rollmedian(x, k = 14, fill = NA, align = "center") # ), # # median21 = list( # name = "21-day rolling median", # smooth_fun = function(x) rollmedian(x, k = 21, fill = NA, align = "center") # ), # # loess = list( # name = "LOESS smoothing (span=0.1)", # smooth_fun = function(x) { # idx <- seq_along(x) # fit <- loess(x ~ idx, span = 0.1) # predict(fit) # } # ) # ) # ============================================================================ # TEST BFAST WITH DIFFERENT SMOOTHING + PARAMETERS # ============================================================================ cat("============================================================================\n") cat("TESTING CONFIGURATIONS\n") cat("============================================================================\n\n") # QUICK TEST MODE: Only test best configuration bfast_configs <- list( config4 = list(h = 0.10, season = "none", name = "h=0.10, no season") ) # Full test suite (uncomment to test all BFAST configurations) # bfast_configs <- list( # config1 = list(h = 0.15, season = "harmonic", name = "h=0.15, harmonic"), # config2 = list(h = 0.10, season = "harmonic", name = "h=0.10, harmonic"), # config3 = list(h = 0.15, season = "none", name = "h=0.15, no season"), # config4 = list(h = 0.10, season = "none", name = "h=0.10, no season") # ) results <- list() for (smooth_name in names(smoothing_strategies)) { smooth_config <- smoothing_strategies[[smooth_name]] cat("\n--- Testing:", smooth_config$name, "---\n") # Apply smoothing field_ts <- field_data %>% mutate(ci_smooth = smooth_config$smooth_fun(mean_ci)) # Fill NAs field_ts$ci_smooth <- na.approx(field_ts$ci_smooth, rule = 2) # Create regular time series date_seq <- seq.Date(min(field_ts$date), max(field_ts$date), by = "1 day") ts_regular <- data.frame(date = date_seq) %>% left_join(field_ts %>% select(date, ci_smooth), by = "date") ts_regular$ci_smooth <- na.approx(ts_regular$ci_smooth, rule = 2) # Convert to ts object start_year <- as.numeric(format(min(ts_regular$date), "%Y")) start_doy <- as.numeric(format(min(ts_regular$date), "%j")) ts_obj <- ts(ts_regular$ci_smooth, start = c(start_year, start_doy), frequency = 365) # Test each BFAST configuration for (config_name in names(bfast_configs)) { config <- bfast_configs[[config_name]] bfast_result <- tryCatch({ bfast(ts_obj, h = config$h, season = config$season, max.iter = 10, breaks = NULL) }, error = function(e) { NULL }) if (!is.null(bfast_result)) { # Extract breaks bp_component <- bfast_result$output[[1]]$bp.Vt if (!is.null(bp_component) && length(bp_component$breakpoints) > 0) { bp_indices <- bp_component$breakpoints bp_indices <- bp_indices[!is.na(bp_indices)] if (length(bp_indices) > 0) { bp_dates <- ts_regular$date[bp_indices] # Apply minimum harvest interval filter (250 days) if (length(bp_dates) > 1) { # Calculate CI drops at each break ci_drops <- sapply(bp_indices, function(idx) { if (idx > 10 && idx < (length(ts_regular$ci_smooth) - 10)) { before_ci <- mean(ts_regular$ci_smooth[(idx-10):(idx-1)], na.rm = TRUE) after_ci <- mean(ts_regular$ci_smooth[(idx+1):(idx+10)], na.rm = TRUE) return(after_ci - before_ci) } else { return(0) } }) # Keep breaks that are at least 250 days apart keep_breaks <- rep(TRUE, length(bp_dates)) bp_dates_sorted <- sort(bp_dates) for (i in 2:length(bp_dates_sorted)) { days_since_last <- as.numeric(bp_dates_sorted[i] - bp_dates_sorted[i-1]) if (days_since_last < 250) { # Keep the one with larger CI drop idx_current <- which(bp_dates == bp_dates_sorted[i]) idx_previous <- which(bp_dates == bp_dates_sorted[i-1]) if (abs(ci_drops[idx_current]) < abs(ci_drops[idx_previous])) { keep_breaks[idx_current] <- FALSE } else { keep_breaks[idx_previous] <- FALSE } } } bp_dates <- bp_dates[keep_breaks] } # Check matches with actual harvests matches <- sapply(test_harvests$season_end, function(h_date) { min(abs(as.numeric(bp_dates - h_date))) }) best_match_days <- min(matches) matched_within_2w <- sum(matches <= 14) matched_within_4w <- sum(matches <= 28) result_entry <- list( smoothing = smooth_config$name, bfast_config = config$name, n_breaks = length(bp_dates), n_harvests = nrow(test_harvests), matched_2w = matched_within_2w, matched_4w = matched_within_4w, best_match_days = best_match_days, break_dates = bp_dates ) results[[paste(smooth_name, config_name, sep = "_")]] <- result_entry cat(" ", config$name, ": ", length(bp_dates), " breaks, ", matched_within_2w, "/", nrow(test_harvests), " matched (±2w), ", "best: ", best_match_days, " days\n", sep = "") } else { cat(" ", config$name, ": No breaks detected\n", sep = "") } } else { cat(" ", config$name, ": No breaks detected\n", sep = "") } } else { cat(" ", config$name, ": BFAST failed\n", sep = "") } } } # ============================================================================ # FIND BEST CONFIGURATION # ============================================================================ cat("\n\n============================================================================\n") cat("BEST CONFIGURATIONS\n") cat("============================================================================\n\n") results_df <- bind_rows(lapply(names(results), function(name) { r <- results[[name]] data.frame( config = paste(r$smoothing, "|", r$bfast_config), n_breaks = r$n_breaks, matched_2w = r$matched_2w, matched_4w = r$matched_4w, match_rate_2w = round(100 * r$matched_2w / r$n_harvests, 1), match_rate_4w = round(100 * r$matched_4w / r$n_harvests, 1), best_match_days = r$best_match_days ) })) # Sort by match rate results_df <- results_df %>% arrange(desc(match_rate_2w), best_match_days) cat("Top configurations (sorted by 2-week match rate):\n\n") print(results_df, row.names = FALSE) # ============================================================================ # VISUALIZE BEST CONFIGURATION # ============================================================================ if (nrow(results_df) > 0 && results_df$matched_2w[1] > 0) { best_config_name <- strsplit(as.character(results_df$config[1]), " \\| ")[[1]] best_smooth <- names(smoothing_strategies)[sapply(smoothing_strategies, function(s) s$name == best_config_name[1])] best_bfast <- names(bfast_configs)[sapply(bfast_configs, function(c) c$name == best_config_name[2])] cat("\n\nGenerating visualization for BEST configuration...\n") cat("Smoothing:", best_config_name[1], "\n") cat("BFAST:", best_config_name[2], "\n\n") # Recreate with best config smooth_config <- smoothing_strategies[[best_smooth[1]]] bfast_config <- bfast_configs[[best_bfast[1]]] field_ts <- field_data %>% mutate(ci_smooth = smooth_config$smooth_fun(mean_ci)) field_ts$ci_smooth <- na.approx(field_ts$ci_smooth, rule = 2) date_seq <- seq.Date(min(field_ts$date), max(field_ts$date), by = "1 day") ts_regular <- data.frame(date = date_seq) %>% left_join(field_ts %>% select(date, ci_smooth), by = "date") ts_regular$ci_smooth <- na.approx(ts_regular$ci_smooth, rule = 2) start_year <- as.numeric(format(min(ts_regular$date), "%Y")) start_doy <- as.numeric(format(min(ts_regular$date), "%j")) ts_obj <- ts(ts_regular$ci_smooth, start = c(start_year, start_doy), frequency = 365) bfast_result <- bfast(ts_obj, h = bfast_config$h, season = bfast_config$season, max.iter = 10, breaks = NULL) bp_component <- bfast_result$output[[1]]$bp.Vt bp_indices <- bp_component$breakpoints[!is.na(bp_component$breakpoints)] bp_dates <- ts_regular$date[bp_indices] # Create comprehensive plot p <- ggplot() + # Raw data (light) geom_line(data = field_data, aes(x = date, y = mean_ci), color = "gray70", alpha = 0.4, linewidth = 0.5) + # Smoothed data geom_line(data = ts_regular, aes(x = date, y = ci_smooth), color = "darkgreen", linewidth = 1) + # Actual harvest dates geom_vline(data = test_harvests, aes(xintercept = season_end), color = "red", linetype = "dashed", linewidth = 1.2) + # BFAST breaks geom_vline(data = data.frame(break_date = bp_dates), aes(xintercept = break_date), color = "blue", linetype = "solid", linewidth = 1.5, alpha = 0.7) + # Labels geom_text(data = test_harvests, aes(x = season_end, y = max(ts_regular$ci_smooth) * 1.05, label = format(season_end, "%Y-%m-%d")), angle = 90, vjust = -0.5, size = 3, color = "red", fontface = "bold") + labs( title = paste0("Field ", test_field, " - BEST Configuration"), subtitle = paste0( "Smoothing: ", best_config_name[1], " | BFAST: ", best_config_name[2], "\n", "Red dashed = Actual (", nrow(test_harvests), ") | ", "Blue solid = Detected (", length(bp_dates), ") | ", "Match rate: ", results_df$match_rate_2w[1], "% (±2w)" ), x = "Date", y = "CI", caption = "Gray = Raw data | Green = Smoothed data" ) + theme_minimal() + theme( plot.title = element_text(face = "bold", size = 14), plot.subtitle = element_text(size = 10), legend.position = "bottom" ) output_dir <- here("r_app/experiments/harvest_prediction") ggsave( file.path(output_dir, "bfast_BEST_smoothing.png"), p, width = 16, height = 8, dpi = 300 ) cat("✓ Saved: bfast_BEST_smoothing.png\n") } # ============================================================================ # RECOMMENDATIONS # ============================================================================ cat("\n\n============================================================================\n") cat("RECOMMENDATIONS\n") cat("============================================================================\n\n") if (nrow(results_df) > 0 && max(results_df$match_rate_2w) > 0) { cat("✓ Found configurations that improve detection:\n\n") cat("Best smoothing:", best_config_name[1], "\n") cat("Best BFAST params:", best_config_name[2], "\n") cat("Match rate (±2 weeks):", results_df$match_rate_2w[1], "%\n") cat("Match rate (±4 weeks):", results_df$match_rate_4w[1], "%\n\n") cat("Key insights:\n") if (grepl("21-day|30-day", best_config_name[1])) { cat("- Heavy smoothing (21-30 days) works better than light smoothing\n") cat("- This suggests harvest signal is gradual, not abrupt\n") } if (grepl("median", best_config_name[1])) { cat("- Median smoothing works better than mean (more robust to outliers)\n") } if (grepl("none", best_config_name[2])) { cat("- No seasonal model works better (harvest may disrupt seasonal patterns)\n") } if (grepl("h=0.10", best_config_name[2])) { cat("- Smaller h (0.10) allows more breaks (more sensitive detection)\n") } } else { cat("⚠ No configuration achieved successful matches\n\n") cat("This confirms BFAST may not be suitable because:\n") cat("- Harvest doesn't create clear structural breaks\n") cat("- CI changes are too gradual\n") cat("- Noise obscures the harvest signal even with heavy smoothing\n\n") cat("Consider alternative approaches:\n") cat("1. Threshold-based: Sustained CI < 2.0 for 14+ days = harvest\n") cat("2. Minimum detection: Find local minima in smoothed CI\n") cat("3. Crop age model: Expected harvest based on planting date + growth days\n") } cat("\n============================================================================\n") cat("ANALYSIS COMPLETE\n") cat("============================================================================\n")