428 lines
16 KiB
R
428 lines
16 KiB
R
# ============================================================================
|
|
# TEST DIFFERENT SMOOTHING APPROACHES FOR BFAST HARVEST DETECTION
|
|
# ============================================================================
|
|
# The CI data is very noisy - test multiple smoothing strategies to see
|
|
# if better smoothing improves BFAST's ability to detect harvest events
|
|
# ============================================================================
|
|
|
|
suppressPackageStartupMessages({
|
|
library(readxl)
|
|
library(dplyr)
|
|
library(tidyr)
|
|
library(lubridate)
|
|
library(here)
|
|
library(bfast)
|
|
library(zoo)
|
|
library(ggplot2)
|
|
})
|
|
|
|
project_dir <- "esa"
|
|
assign("project_dir", project_dir, envir = .GlobalEnv)
|
|
|
|
if (basename(getwd()) == "harvest_prediction") {
|
|
setwd("../../..")
|
|
}
|
|
|
|
source(here("r_app", "parameters_project.R"))
|
|
|
|
cat("============================================================================\n")
|
|
cat("TESTING SMOOTHING STRATEGIES FOR BFAST\n")
|
|
cat("============================================================================\n\n")
|
|
|
|
# Load CI data
|
|
ci_rds_file <- here("laravel_app/storage/app", project_dir,
|
|
"Data/extracted_ci/cumulative_vals/All_pivots_Cumulative_CI_quadrant_year_v2.rds")
|
|
ci_data_raw <- readRDS(ci_rds_file) %>% ungroup()
|
|
|
|
time_series_daily <- ci_data_raw %>%
|
|
mutate(date = as.Date(Date)) %>%
|
|
select(field_id = field, date, mean_ci = FitData) %>%
|
|
filter(!is.na(mean_ci), !is.na(date), !is.na(field_id)) %>%
|
|
arrange(field_id, date)
|
|
|
|
# Load harvest data
|
|
harvest_file <- here("laravel_app/storage/app", project_dir, "Data/harvest.xlsx")
|
|
harvest_actual <- read_excel(harvest_file) %>%
|
|
mutate(season_end = as.Date(season_end)) %>%
|
|
filter(!is.na(season_end))
|
|
|
|
# Pick a test field with known harvests and good data coverage
|
|
test_field <- "KHWB" # From the plot you showed
|
|
test_harvests <- harvest_actual %>% filter(field == test_field)
|
|
|
|
cat("Test field:", test_field, "\n")
|
|
cat("Actual harvests:", nrow(test_harvests), "\n\n")
|
|
|
|
field_data <- time_series_daily %>%
|
|
filter(field_id == test_field) %>%
|
|
arrange(date)
|
|
|
|
cat("CI observations:", nrow(field_data), "\n")
|
|
cat("Date range:", format(min(field_data$date), "%Y-%m-%d"), "to",
|
|
format(max(field_data$date), "%Y-%m-%d"), "\n\n")
|
|
|
|
# ============================================================================
|
|
# DEFINE SMOOTHING STRATEGIES TO TEST
|
|
# ============================================================================
|
|
|
|
# QUICK TEST MODE: Only test best configuration
|
|
# To test all configurations, uncomment the full list below
|
|
smoothing_strategies <- list(
|
|
ma7 = list(
|
|
name = "7-day moving average",
|
|
smooth_fun = function(x) rollmean(x, k = 7, fill = NA, align = "center")
|
|
)
|
|
)
|
|
|
|
# Full test suite (uncomment to test all smoothing strategies)
|
|
# smoothing_strategies <- list(
|
|
# raw = list(
|
|
# name = "No smoothing (raw)",
|
|
# smooth_fun = function(x) x
|
|
# ),
|
|
#
|
|
# ma7 = list(
|
|
# name = "7-day moving average",
|
|
# smooth_fun = function(x) rollmean(x, k = 7, fill = NA, align = "center")
|
|
# ),
|
|
#
|
|
# ma14 = list(
|
|
# name = "14-day moving average",
|
|
# smooth_fun = function(x) rollmean(x, k = 14, fill = NA, align = "center")
|
|
# ),
|
|
#
|
|
# ma21 = list(
|
|
# name = "21-day moving average",
|
|
# smooth_fun = function(x) rollmean(x, k = 21, fill = NA, align = "center")
|
|
# ),
|
|
#
|
|
# ma30 = list(
|
|
# name = "30-day moving average",
|
|
# smooth_fun = function(x) rollmean(x, k = 30, fill = NA, align = "center")
|
|
# ),
|
|
#
|
|
# median14 = list(
|
|
# name = "14-day rolling median",
|
|
# smooth_fun = function(x) rollmedian(x, k = 14, fill = NA, align = "center")
|
|
# ),
|
|
#
|
|
# median21 = list(
|
|
# name = "21-day rolling median",
|
|
# smooth_fun = function(x) rollmedian(x, k = 21, fill = NA, align = "center")
|
|
# ),
|
|
#
|
|
# loess = list(
|
|
# name = "LOESS smoothing (span=0.1)",
|
|
# smooth_fun = function(x) {
|
|
# idx <- seq_along(x)
|
|
# fit <- loess(x ~ idx, span = 0.1)
|
|
# predict(fit)
|
|
# }
|
|
# )
|
|
# )
|
|
|
|
# ============================================================================
|
|
# TEST BFAST WITH DIFFERENT SMOOTHING + PARAMETERS
|
|
# ============================================================================
|
|
|
|
cat("============================================================================\n")
|
|
cat("TESTING CONFIGURATIONS\n")
|
|
cat("============================================================================\n\n")
|
|
|
|
# QUICK TEST MODE: Only test best configuration
|
|
bfast_configs <- list(
|
|
config4 = list(h = 0.10, season = "none", name = "h=0.10, no season")
|
|
)
|
|
|
|
# Full test suite (uncomment to test all BFAST configurations)
|
|
# bfast_configs <- list(
|
|
# config1 = list(h = 0.15, season = "harmonic", name = "h=0.15, harmonic"),
|
|
# config2 = list(h = 0.10, season = "harmonic", name = "h=0.10, harmonic"),
|
|
# config3 = list(h = 0.15, season = "none", name = "h=0.15, no season"),
|
|
# config4 = list(h = 0.10, season = "none", name = "h=0.10, no season")
|
|
# )
|
|
|
|
results <- list()
|
|
|
|
for (smooth_name in names(smoothing_strategies)) {
|
|
smooth_config <- smoothing_strategies[[smooth_name]]
|
|
|
|
cat("\n--- Testing:", smooth_config$name, "---\n")
|
|
|
|
# Apply smoothing
|
|
field_ts <- field_data %>%
|
|
mutate(ci_smooth = smooth_config$smooth_fun(mean_ci))
|
|
|
|
# Fill NAs
|
|
field_ts$ci_smooth <- na.approx(field_ts$ci_smooth, rule = 2)
|
|
|
|
# Create regular time series
|
|
date_seq <- seq.Date(min(field_ts$date), max(field_ts$date), by = "1 day")
|
|
ts_regular <- data.frame(date = date_seq) %>%
|
|
left_join(field_ts %>% select(date, ci_smooth), by = "date")
|
|
|
|
ts_regular$ci_smooth <- na.approx(ts_regular$ci_smooth, rule = 2)
|
|
|
|
# Convert to ts object
|
|
start_year <- as.numeric(format(min(ts_regular$date), "%Y"))
|
|
start_doy <- as.numeric(format(min(ts_regular$date), "%j"))
|
|
|
|
ts_obj <- ts(ts_regular$ci_smooth,
|
|
start = c(start_year, start_doy),
|
|
frequency = 365)
|
|
|
|
# Test each BFAST configuration
|
|
for (config_name in names(bfast_configs)) {
|
|
config <- bfast_configs[[config_name]]
|
|
|
|
bfast_result <- tryCatch({
|
|
bfast(ts_obj,
|
|
h = config$h,
|
|
season = config$season,
|
|
max.iter = 10,
|
|
breaks = NULL)
|
|
}, error = function(e) {
|
|
NULL
|
|
})
|
|
|
|
if (!is.null(bfast_result)) {
|
|
# Extract breaks
|
|
bp_component <- bfast_result$output[[1]]$bp.Vt
|
|
|
|
if (!is.null(bp_component) && length(bp_component$breakpoints) > 0) {
|
|
bp_indices <- bp_component$breakpoints
|
|
bp_indices <- bp_indices[!is.na(bp_indices)]
|
|
|
|
if (length(bp_indices) > 0) {
|
|
bp_dates <- ts_regular$date[bp_indices]
|
|
|
|
# Apply minimum harvest interval filter (250 days)
|
|
if (length(bp_dates) > 1) {
|
|
# Calculate CI drops at each break
|
|
ci_drops <- sapply(bp_indices, function(idx) {
|
|
if (idx > 10 && idx < (length(ts_regular$ci_smooth) - 10)) {
|
|
before_ci <- mean(ts_regular$ci_smooth[(idx-10):(idx-1)], na.rm = TRUE)
|
|
after_ci <- mean(ts_regular$ci_smooth[(idx+1):(idx+10)], na.rm = TRUE)
|
|
return(after_ci - before_ci)
|
|
} else {
|
|
return(0)
|
|
}
|
|
})
|
|
|
|
# Keep breaks that are at least 250 days apart
|
|
keep_breaks <- rep(TRUE, length(bp_dates))
|
|
bp_dates_sorted <- sort(bp_dates)
|
|
|
|
for (i in 2:length(bp_dates_sorted)) {
|
|
days_since_last <- as.numeric(bp_dates_sorted[i] - bp_dates_sorted[i-1])
|
|
if (days_since_last < 250) {
|
|
# Keep the one with larger CI drop
|
|
idx_current <- which(bp_dates == bp_dates_sorted[i])
|
|
idx_previous <- which(bp_dates == bp_dates_sorted[i-1])
|
|
|
|
if (abs(ci_drops[idx_current]) < abs(ci_drops[idx_previous])) {
|
|
keep_breaks[idx_current] <- FALSE
|
|
} else {
|
|
keep_breaks[idx_previous] <- FALSE
|
|
}
|
|
}
|
|
}
|
|
|
|
bp_dates <- bp_dates[keep_breaks]
|
|
}
|
|
|
|
# Check matches with actual harvests
|
|
matches <- sapply(test_harvests$season_end, function(h_date) {
|
|
min(abs(as.numeric(bp_dates - h_date)))
|
|
})
|
|
|
|
best_match_days <- min(matches)
|
|
matched_within_2w <- sum(matches <= 14)
|
|
matched_within_4w <- sum(matches <= 28)
|
|
|
|
result_entry <- list(
|
|
smoothing = smooth_config$name,
|
|
bfast_config = config$name,
|
|
n_breaks = length(bp_dates),
|
|
n_harvests = nrow(test_harvests),
|
|
matched_2w = matched_within_2w,
|
|
matched_4w = matched_within_4w,
|
|
best_match_days = best_match_days,
|
|
break_dates = bp_dates
|
|
)
|
|
|
|
results[[paste(smooth_name, config_name, sep = "_")]] <- result_entry
|
|
|
|
cat(" ", config$name, ": ", length(bp_dates), " breaks, ",
|
|
matched_within_2w, "/", nrow(test_harvests), " matched (±2w), ",
|
|
"best: ", best_match_days, " days\n", sep = "")
|
|
} else {
|
|
cat(" ", config$name, ": No breaks detected\n", sep = "")
|
|
}
|
|
} else {
|
|
cat(" ", config$name, ": No breaks detected\n", sep = "")
|
|
}
|
|
} else {
|
|
cat(" ", config$name, ": BFAST failed\n", sep = "")
|
|
}
|
|
}
|
|
}
|
|
|
|
# ============================================================================
|
|
# FIND BEST CONFIGURATION
|
|
# ============================================================================
|
|
|
|
cat("\n\n============================================================================\n")
|
|
cat("BEST CONFIGURATIONS\n")
|
|
cat("============================================================================\n\n")
|
|
|
|
results_df <- bind_rows(lapply(names(results), function(name) {
|
|
r <- results[[name]]
|
|
data.frame(
|
|
config = paste(r$smoothing, "|", r$bfast_config),
|
|
n_breaks = r$n_breaks,
|
|
matched_2w = r$matched_2w,
|
|
matched_4w = r$matched_4w,
|
|
match_rate_2w = round(100 * r$matched_2w / r$n_harvests, 1),
|
|
match_rate_4w = round(100 * r$matched_4w / r$n_harvests, 1),
|
|
best_match_days = r$best_match_days
|
|
)
|
|
}))
|
|
|
|
# Sort by match rate
|
|
results_df <- results_df %>%
|
|
arrange(desc(match_rate_2w), best_match_days)
|
|
|
|
cat("Top configurations (sorted by 2-week match rate):\n\n")
|
|
print(results_df, row.names = FALSE)
|
|
|
|
# ============================================================================
|
|
# VISUALIZE BEST CONFIGURATION
|
|
# ============================================================================
|
|
|
|
if (nrow(results_df) > 0 && results_df$matched_2w[1] > 0) {
|
|
best_config_name <- strsplit(as.character(results_df$config[1]), " \\| ")[[1]]
|
|
best_smooth <- names(smoothing_strategies)[sapply(smoothing_strategies, function(s) s$name == best_config_name[1])]
|
|
best_bfast <- names(bfast_configs)[sapply(bfast_configs, function(c) c$name == best_config_name[2])]
|
|
|
|
cat("\n\nGenerating visualization for BEST configuration...\n")
|
|
cat("Smoothing:", best_config_name[1], "\n")
|
|
cat("BFAST:", best_config_name[2], "\n\n")
|
|
|
|
# Recreate with best config
|
|
smooth_config <- smoothing_strategies[[best_smooth[1]]]
|
|
bfast_config <- bfast_configs[[best_bfast[1]]]
|
|
|
|
field_ts <- field_data %>%
|
|
mutate(ci_smooth = smooth_config$smooth_fun(mean_ci))
|
|
field_ts$ci_smooth <- na.approx(field_ts$ci_smooth, rule = 2)
|
|
|
|
date_seq <- seq.Date(min(field_ts$date), max(field_ts$date), by = "1 day")
|
|
ts_regular <- data.frame(date = date_seq) %>%
|
|
left_join(field_ts %>% select(date, ci_smooth), by = "date")
|
|
ts_regular$ci_smooth <- na.approx(ts_regular$ci_smooth, rule = 2)
|
|
|
|
start_year <- as.numeric(format(min(ts_regular$date), "%Y"))
|
|
start_doy <- as.numeric(format(min(ts_regular$date), "%j"))
|
|
ts_obj <- ts(ts_regular$ci_smooth, start = c(start_year, start_doy), frequency = 365)
|
|
|
|
bfast_result <- bfast(ts_obj, h = bfast_config$h, season = bfast_config$season,
|
|
max.iter = 10, breaks = NULL)
|
|
|
|
bp_component <- bfast_result$output[[1]]$bp.Vt
|
|
bp_indices <- bp_component$breakpoints[!is.na(bp_component$breakpoints)]
|
|
bp_dates <- ts_regular$date[bp_indices]
|
|
|
|
# Create comprehensive plot
|
|
p <- ggplot() +
|
|
# Raw data (light)
|
|
geom_line(data = field_data, aes(x = date, y = mean_ci),
|
|
color = "gray70", alpha = 0.4, linewidth = 0.5) +
|
|
# Smoothed data
|
|
geom_line(data = ts_regular, aes(x = date, y = ci_smooth),
|
|
color = "darkgreen", linewidth = 1) +
|
|
# Actual harvest dates
|
|
geom_vline(data = test_harvests, aes(xintercept = season_end),
|
|
color = "red", linetype = "dashed", linewidth = 1.2) +
|
|
# BFAST breaks
|
|
geom_vline(data = data.frame(break_date = bp_dates),
|
|
aes(xintercept = break_date),
|
|
color = "blue", linetype = "solid", linewidth = 1.5, alpha = 0.7) +
|
|
# Labels
|
|
geom_text(data = test_harvests,
|
|
aes(x = season_end, y = max(ts_regular$ci_smooth) * 1.05,
|
|
label = format(season_end, "%Y-%m-%d")),
|
|
angle = 90, vjust = -0.5, size = 3, color = "red", fontface = "bold") +
|
|
labs(
|
|
title = paste0("Field ", test_field, " - BEST Configuration"),
|
|
subtitle = paste0(
|
|
"Smoothing: ", best_config_name[1], " | BFAST: ", best_config_name[2], "\n",
|
|
"Red dashed = Actual (", nrow(test_harvests), ") | ",
|
|
"Blue solid = Detected (", length(bp_dates), ") | ",
|
|
"Match rate: ", results_df$match_rate_2w[1], "% (±2w)"
|
|
),
|
|
x = "Date",
|
|
y = "CI",
|
|
caption = "Gray = Raw data | Green = Smoothed data"
|
|
) +
|
|
theme_minimal() +
|
|
theme(
|
|
plot.title = element_text(face = "bold", size = 14),
|
|
plot.subtitle = element_text(size = 10),
|
|
legend.position = "bottom"
|
|
)
|
|
|
|
output_dir <- here("r_app/experiments/harvest_prediction")
|
|
ggsave(
|
|
file.path(output_dir, "bfast_BEST_smoothing.png"),
|
|
p, width = 16, height = 8, dpi = 300
|
|
)
|
|
|
|
cat("✓ Saved: bfast_BEST_smoothing.png\n")
|
|
}
|
|
|
|
# ============================================================================
|
|
# RECOMMENDATIONS
|
|
# ============================================================================
|
|
|
|
cat("\n\n============================================================================\n")
|
|
cat("RECOMMENDATIONS\n")
|
|
cat("============================================================================\n\n")
|
|
|
|
if (nrow(results_df) > 0 && max(results_df$match_rate_2w) > 0) {
|
|
cat("✓ Found configurations that improve detection:\n\n")
|
|
cat("Best smoothing:", best_config_name[1], "\n")
|
|
cat("Best BFAST params:", best_config_name[2], "\n")
|
|
cat("Match rate (±2 weeks):", results_df$match_rate_2w[1], "%\n")
|
|
cat("Match rate (±4 weeks):", results_df$match_rate_4w[1], "%\n\n")
|
|
|
|
cat("Key insights:\n")
|
|
if (grepl("21-day|30-day", best_config_name[1])) {
|
|
cat("- Heavy smoothing (21-30 days) works better than light smoothing\n")
|
|
cat("- This suggests harvest signal is gradual, not abrupt\n")
|
|
}
|
|
if (grepl("median", best_config_name[1])) {
|
|
cat("- Median smoothing works better than mean (more robust to outliers)\n")
|
|
}
|
|
if (grepl("none", best_config_name[2])) {
|
|
cat("- No seasonal model works better (harvest may disrupt seasonal patterns)\n")
|
|
}
|
|
if (grepl("h=0.10", best_config_name[2])) {
|
|
cat("- Smaller h (0.10) allows more breaks (more sensitive detection)\n")
|
|
}
|
|
} else {
|
|
cat("⚠ No configuration achieved successful matches\n\n")
|
|
cat("This confirms BFAST may not be suitable because:\n")
|
|
cat("- Harvest doesn't create clear structural breaks\n")
|
|
cat("- CI changes are too gradual\n")
|
|
cat("- Noise obscures the harvest signal even with heavy smoothing\n\n")
|
|
cat("Consider alternative approaches:\n")
|
|
cat("1. Threshold-based: Sustained CI < 2.0 for 14+ days = harvest\n")
|
|
cat("2. Minimum detection: Find local minima in smoothed CI\n")
|
|
cat("3. Crop age model: Expected harvest based on planting date + growth days\n")
|
|
}
|
|
|
|
cat("\n============================================================================\n")
|
|
cat("ANALYSIS COMPLETE\n")
|
|
cat("============================================================================\n")
|