SmartCane/r_app/experiments/harvest_prediction/test_bfast_smoothing.R
2026-01-06 14:17:37 +01:00

428 lines
16 KiB
R

# ============================================================================
# TEST DIFFERENT SMOOTHING APPROACHES FOR BFAST HARVEST DETECTION
# ============================================================================
# The CI data is very noisy - test multiple smoothing strategies to see
# if better smoothing improves BFAST's ability to detect harvest events
# ============================================================================
suppressPackageStartupMessages({
library(readxl)
library(dplyr)
library(tidyr)
library(lubridate)
library(here)
library(bfast)
library(zoo)
library(ggplot2)
})
project_dir <- "esa"
assign("project_dir", project_dir, envir = .GlobalEnv)
if (basename(getwd()) == "harvest_prediction") {
setwd("../../..")
}
source(here("r_app", "parameters_project.R"))
cat("============================================================================\n")
cat("TESTING SMOOTHING STRATEGIES FOR BFAST\n")
cat("============================================================================\n\n")
# Load CI data
ci_rds_file <- here("laravel_app/storage/app", project_dir,
"Data/extracted_ci/cumulative_vals/All_pivots_Cumulative_CI_quadrant_year_v2.rds")
ci_data_raw <- readRDS(ci_rds_file) %>% ungroup()
time_series_daily <- ci_data_raw %>%
mutate(date = as.Date(Date)) %>%
select(field_id = field, date, mean_ci = FitData) %>%
filter(!is.na(mean_ci), !is.na(date), !is.na(field_id)) %>%
arrange(field_id, date)
# Load harvest data
harvest_file <- here("laravel_app/storage/app", project_dir, "Data/harvest.xlsx")
harvest_actual <- read_excel(harvest_file) %>%
mutate(season_end = as.Date(season_end)) %>%
filter(!is.na(season_end))
# Pick a test field with known harvests and good data coverage
test_field <- "KHWB" # From the plot you showed
test_harvests <- harvest_actual %>% filter(field == test_field)
cat("Test field:", test_field, "\n")
cat("Actual harvests:", nrow(test_harvests), "\n\n")
field_data <- time_series_daily %>%
filter(field_id == test_field) %>%
arrange(date)
cat("CI observations:", nrow(field_data), "\n")
cat("Date range:", format(min(field_data$date), "%Y-%m-%d"), "to",
format(max(field_data$date), "%Y-%m-%d"), "\n\n")
# ============================================================================
# DEFINE SMOOTHING STRATEGIES TO TEST
# ============================================================================
# QUICK TEST MODE: Only test best configuration
# To test all configurations, uncomment the full list below
smoothing_strategies <- list(
ma7 = list(
name = "7-day moving average",
smooth_fun = function(x) rollmean(x, k = 7, fill = NA, align = "center")
)
)
# Full test suite (uncomment to test all smoothing strategies)
# smoothing_strategies <- list(
# raw = list(
# name = "No smoothing (raw)",
# smooth_fun = function(x) x
# ),
#
# ma7 = list(
# name = "7-day moving average",
# smooth_fun = function(x) rollmean(x, k = 7, fill = NA, align = "center")
# ),
#
# ma14 = list(
# name = "14-day moving average",
# smooth_fun = function(x) rollmean(x, k = 14, fill = NA, align = "center")
# ),
#
# ma21 = list(
# name = "21-day moving average",
# smooth_fun = function(x) rollmean(x, k = 21, fill = NA, align = "center")
# ),
#
# ma30 = list(
# name = "30-day moving average",
# smooth_fun = function(x) rollmean(x, k = 30, fill = NA, align = "center")
# ),
#
# median14 = list(
# name = "14-day rolling median",
# smooth_fun = function(x) rollmedian(x, k = 14, fill = NA, align = "center")
# ),
#
# median21 = list(
# name = "21-day rolling median",
# smooth_fun = function(x) rollmedian(x, k = 21, fill = NA, align = "center")
# ),
#
# loess = list(
# name = "LOESS smoothing (span=0.1)",
# smooth_fun = function(x) {
# idx <- seq_along(x)
# fit <- loess(x ~ idx, span = 0.1)
# predict(fit)
# }
# )
# )
# ============================================================================
# TEST BFAST WITH DIFFERENT SMOOTHING + PARAMETERS
# ============================================================================
cat("============================================================================\n")
cat("TESTING CONFIGURATIONS\n")
cat("============================================================================\n\n")
# QUICK TEST MODE: Only test best configuration
bfast_configs <- list(
config4 = list(h = 0.10, season = "none", name = "h=0.10, no season")
)
# Full test suite (uncomment to test all BFAST configurations)
# bfast_configs <- list(
# config1 = list(h = 0.15, season = "harmonic", name = "h=0.15, harmonic"),
# config2 = list(h = 0.10, season = "harmonic", name = "h=0.10, harmonic"),
# config3 = list(h = 0.15, season = "none", name = "h=0.15, no season"),
# config4 = list(h = 0.10, season = "none", name = "h=0.10, no season")
# )
results <- list()
for (smooth_name in names(smoothing_strategies)) {
smooth_config <- smoothing_strategies[[smooth_name]]
cat("\n--- Testing:", smooth_config$name, "---\n")
# Apply smoothing
field_ts <- field_data %>%
mutate(ci_smooth = smooth_config$smooth_fun(mean_ci))
# Fill NAs
field_ts$ci_smooth <- na.approx(field_ts$ci_smooth, rule = 2)
# Create regular time series
date_seq <- seq.Date(min(field_ts$date), max(field_ts$date), by = "1 day")
ts_regular <- data.frame(date = date_seq) %>%
left_join(field_ts %>% select(date, ci_smooth), by = "date")
ts_regular$ci_smooth <- na.approx(ts_regular$ci_smooth, rule = 2)
# Convert to ts object
start_year <- as.numeric(format(min(ts_regular$date), "%Y"))
start_doy <- as.numeric(format(min(ts_regular$date), "%j"))
ts_obj <- ts(ts_regular$ci_smooth,
start = c(start_year, start_doy),
frequency = 365)
# Test each BFAST configuration
for (config_name in names(bfast_configs)) {
config <- bfast_configs[[config_name]]
bfast_result <- tryCatch({
bfast(ts_obj,
h = config$h,
season = config$season,
max.iter = 10,
breaks = NULL)
}, error = function(e) {
NULL
})
if (!is.null(bfast_result)) {
# Extract breaks
bp_component <- bfast_result$output[[1]]$bp.Vt
if (!is.null(bp_component) && length(bp_component$breakpoints) > 0) {
bp_indices <- bp_component$breakpoints
bp_indices <- bp_indices[!is.na(bp_indices)]
if (length(bp_indices) > 0) {
bp_dates <- ts_regular$date[bp_indices]
# Apply minimum harvest interval filter (250 days)
if (length(bp_dates) > 1) {
# Calculate CI drops at each break
ci_drops <- sapply(bp_indices, function(idx) {
if (idx > 10 && idx < (length(ts_regular$ci_smooth) - 10)) {
before_ci <- mean(ts_regular$ci_smooth[(idx-10):(idx-1)], na.rm = TRUE)
after_ci <- mean(ts_regular$ci_smooth[(idx+1):(idx+10)], na.rm = TRUE)
return(after_ci - before_ci)
} else {
return(0)
}
})
# Keep breaks that are at least 250 days apart
keep_breaks <- rep(TRUE, length(bp_dates))
bp_dates_sorted <- sort(bp_dates)
for (i in 2:length(bp_dates_sorted)) {
days_since_last <- as.numeric(bp_dates_sorted[i] - bp_dates_sorted[i-1])
if (days_since_last < 250) {
# Keep the one with larger CI drop
idx_current <- which(bp_dates == bp_dates_sorted[i])
idx_previous <- which(bp_dates == bp_dates_sorted[i-1])
if (abs(ci_drops[idx_current]) < abs(ci_drops[idx_previous])) {
keep_breaks[idx_current] <- FALSE
} else {
keep_breaks[idx_previous] <- FALSE
}
}
}
bp_dates <- bp_dates[keep_breaks]
}
# Check matches with actual harvests
matches <- sapply(test_harvests$season_end, function(h_date) {
min(abs(as.numeric(bp_dates - h_date)))
})
best_match_days <- min(matches)
matched_within_2w <- sum(matches <= 14)
matched_within_4w <- sum(matches <= 28)
result_entry <- list(
smoothing = smooth_config$name,
bfast_config = config$name,
n_breaks = length(bp_dates),
n_harvests = nrow(test_harvests),
matched_2w = matched_within_2w,
matched_4w = matched_within_4w,
best_match_days = best_match_days,
break_dates = bp_dates
)
results[[paste(smooth_name, config_name, sep = "_")]] <- result_entry
cat(" ", config$name, ": ", length(bp_dates), " breaks, ",
matched_within_2w, "/", nrow(test_harvests), " matched (±2w), ",
"best: ", best_match_days, " days\n", sep = "")
} else {
cat(" ", config$name, ": No breaks detected\n", sep = "")
}
} else {
cat(" ", config$name, ": No breaks detected\n", sep = "")
}
} else {
cat(" ", config$name, ": BFAST failed\n", sep = "")
}
}
}
# ============================================================================
# FIND BEST CONFIGURATION
# ============================================================================
cat("\n\n============================================================================\n")
cat("BEST CONFIGURATIONS\n")
cat("============================================================================\n\n")
results_df <- bind_rows(lapply(names(results), function(name) {
r <- results[[name]]
data.frame(
config = paste(r$smoothing, "|", r$bfast_config),
n_breaks = r$n_breaks,
matched_2w = r$matched_2w,
matched_4w = r$matched_4w,
match_rate_2w = round(100 * r$matched_2w / r$n_harvests, 1),
match_rate_4w = round(100 * r$matched_4w / r$n_harvests, 1),
best_match_days = r$best_match_days
)
}))
# Sort by match rate
results_df <- results_df %>%
arrange(desc(match_rate_2w), best_match_days)
cat("Top configurations (sorted by 2-week match rate):\n\n")
print(results_df, row.names = FALSE)
# ============================================================================
# VISUALIZE BEST CONFIGURATION
# ============================================================================
if (nrow(results_df) > 0 && results_df$matched_2w[1] > 0) {
best_config_name <- strsplit(as.character(results_df$config[1]), " \\| ")[[1]]
best_smooth <- names(smoothing_strategies)[sapply(smoothing_strategies, function(s) s$name == best_config_name[1])]
best_bfast <- names(bfast_configs)[sapply(bfast_configs, function(c) c$name == best_config_name[2])]
cat("\n\nGenerating visualization for BEST configuration...\n")
cat("Smoothing:", best_config_name[1], "\n")
cat("BFAST:", best_config_name[2], "\n\n")
# Recreate with best config
smooth_config <- smoothing_strategies[[best_smooth[1]]]
bfast_config <- bfast_configs[[best_bfast[1]]]
field_ts <- field_data %>%
mutate(ci_smooth = smooth_config$smooth_fun(mean_ci))
field_ts$ci_smooth <- na.approx(field_ts$ci_smooth, rule = 2)
date_seq <- seq.Date(min(field_ts$date), max(field_ts$date), by = "1 day")
ts_regular <- data.frame(date = date_seq) %>%
left_join(field_ts %>% select(date, ci_smooth), by = "date")
ts_regular$ci_smooth <- na.approx(ts_regular$ci_smooth, rule = 2)
start_year <- as.numeric(format(min(ts_regular$date), "%Y"))
start_doy <- as.numeric(format(min(ts_regular$date), "%j"))
ts_obj <- ts(ts_regular$ci_smooth, start = c(start_year, start_doy), frequency = 365)
bfast_result <- bfast(ts_obj, h = bfast_config$h, season = bfast_config$season,
max.iter = 10, breaks = NULL)
bp_component <- bfast_result$output[[1]]$bp.Vt
bp_indices <- bp_component$breakpoints[!is.na(bp_component$breakpoints)]
bp_dates <- ts_regular$date[bp_indices]
# Create comprehensive plot
p <- ggplot() +
# Raw data (light)
geom_line(data = field_data, aes(x = date, y = mean_ci),
color = "gray70", alpha = 0.4, linewidth = 0.5) +
# Smoothed data
geom_line(data = ts_regular, aes(x = date, y = ci_smooth),
color = "darkgreen", linewidth = 1) +
# Actual harvest dates
geom_vline(data = test_harvests, aes(xintercept = season_end),
color = "red", linetype = "dashed", linewidth = 1.2) +
# BFAST breaks
geom_vline(data = data.frame(break_date = bp_dates),
aes(xintercept = break_date),
color = "blue", linetype = "solid", linewidth = 1.5, alpha = 0.7) +
# Labels
geom_text(data = test_harvests,
aes(x = season_end, y = max(ts_regular$ci_smooth) * 1.05,
label = format(season_end, "%Y-%m-%d")),
angle = 90, vjust = -0.5, size = 3, color = "red", fontface = "bold") +
labs(
title = paste0("Field ", test_field, " - BEST Configuration"),
subtitle = paste0(
"Smoothing: ", best_config_name[1], " | BFAST: ", best_config_name[2], "\n",
"Red dashed = Actual (", nrow(test_harvests), ") | ",
"Blue solid = Detected (", length(bp_dates), ") | ",
"Match rate: ", results_df$match_rate_2w[1], "% (±2w)"
),
x = "Date",
y = "CI",
caption = "Gray = Raw data | Green = Smoothed data"
) +
theme_minimal() +
theme(
plot.title = element_text(face = "bold", size = 14),
plot.subtitle = element_text(size = 10),
legend.position = "bottom"
)
output_dir <- here("r_app/experiments/harvest_prediction")
ggsave(
file.path(output_dir, "bfast_BEST_smoothing.png"),
p, width = 16, height = 8, dpi = 300
)
cat("✓ Saved: bfast_BEST_smoothing.png\n")
}
# ============================================================================
# RECOMMENDATIONS
# ============================================================================
cat("\n\n============================================================================\n")
cat("RECOMMENDATIONS\n")
cat("============================================================================\n\n")
if (nrow(results_df) > 0 && max(results_df$match_rate_2w) > 0) {
cat("✓ Found configurations that improve detection:\n\n")
cat("Best smoothing:", best_config_name[1], "\n")
cat("Best BFAST params:", best_config_name[2], "\n")
cat("Match rate (±2 weeks):", results_df$match_rate_2w[1], "%\n")
cat("Match rate (±4 weeks):", results_df$match_rate_4w[1], "%\n\n")
cat("Key insights:\n")
if (grepl("21-day|30-day", best_config_name[1])) {
cat("- Heavy smoothing (21-30 days) works better than light smoothing\n")
cat("- This suggests harvest signal is gradual, not abrupt\n")
}
if (grepl("median", best_config_name[1])) {
cat("- Median smoothing works better than mean (more robust to outliers)\n")
}
if (grepl("none", best_config_name[2])) {
cat("- No seasonal model works better (harvest may disrupt seasonal patterns)\n")
}
if (grepl("h=0.10", best_config_name[2])) {
cat("- Smaller h (0.10) allows more breaks (more sensitive detection)\n")
}
} else {
cat("⚠ No configuration achieved successful matches\n\n")
cat("This confirms BFAST may not be suitable because:\n")
cat("- Harvest doesn't create clear structural breaks\n")
cat("- CI changes are too gradual\n")
cat("- Noise obscures the harvest signal even with heavy smoothing\n\n")
cat("Consider alternative approaches:\n")
cat("1. Threshold-based: Sustained CI < 2.0 for 14+ days = harvest\n")
cat("2. Minimum detection: Find local minima in smoothed CI\n")
cat("3. Crop age model: Expected harvest based on planting date + growth days\n")
}
cat("\n============================================================================\n")
cat("ANALYSIS COMPLETE\n")
cat("============================================================================\n")