431 lines
16 KiB
R
431 lines
16 KiB
R
# ============================================================================
|
||
# BFAST ROLLING WINDOW TEST - Single Season Analysis
|
||
# ============================================================================
|
||
# Test bfast's ability to detect harvest breakpoints progressively
|
||
# For each day from -20 to +20 days around harvest:
|
||
# 1. Load CI data from previous harvest to current test day
|
||
# 2. Run bfast to detect breaks
|
||
# 3. Check if harvest date is detected as a breakpoint
|
||
# ============================================================================
|
||
|
||
suppressPackageStartupMessages({
|
||
library(readxl)
|
||
library(dplyr)
|
||
library(tidyr)
|
||
library(lubridate)
|
||
library(here)
|
||
library(bfast)
|
||
library(zoo)
|
||
library(ggplot2)
|
||
})
|
||
|
||
# Set project directory
|
||
project_dir <- "esa"
|
||
assign("project_dir", project_dir, envir = .GlobalEnv)
|
||
|
||
# Navigate to project root if in experiments folder
|
||
if (basename(getwd()) == "harvest_prediction") {
|
||
setwd("../../..")
|
||
}
|
||
|
||
source(here("r_app", "parameters_project.R"))
|
||
|
||
# ============================================================================
|
||
# CONFIGURATION
|
||
# ============================================================================
|
||
|
||
CONFIG <- list(
|
||
test_field = "KHWC", # Field to test
|
||
test_harvest_index = 2, # Which harvest event (1 = first, 2 = second, etc.)
|
||
days_before = 20, # Days before harvest to start testing
|
||
days_after = 20, # Days after harvest to end testing
|
||
|
||
# bfast parameters
|
||
h = 0.15, # Minimal segment size (15% of data)
|
||
season = "none", # Seasonal model type ("dummy", "harmonic", "none")
|
||
max_iter = 10, # Maximum iterations
|
||
breaks = NULL # Let bfast determine number of breaks
|
||
)
|
||
|
||
cat("============================================================================\n")
|
||
cat("BFAST ROLLING WINDOW TEST - PROGRESSIVE HARVEST DETECTION\n")
|
||
cat("============================================================================\n\n")
|
||
|
||
cat("Test configuration:\n")
|
||
cat(" Field:", CONFIG$test_field, "\n")
|
||
cat(" Harvest event:", CONFIG$test_harvest_index, "\n")
|
||
cat(" Test window:", CONFIG$days_before, "days before to", CONFIG$days_after, "days after\n")
|
||
cat(" bfast h parameter:", CONFIG$h, "\n")
|
||
cat(" Seasonal model:", CONFIG$season, "\n\n")
|
||
|
||
# ============================================================================
|
||
# LOAD DATA
|
||
# ============================================================================
|
||
|
||
cat("=== LOADING DATA ===\n\n")
|
||
|
||
ci_rds_file <- here("laravel_app/storage/app", project_dir,
|
||
"Data/extracted_ci/cumulative_vals/All_pivots_Cumulative_CI_quadrant_year_v2.rds")
|
||
ci_data_raw <- readRDS(ci_rds_file) %>% ungroup()
|
||
|
||
time_series_daily <- ci_data_raw %>%
|
||
mutate(date = as.Date(Date)) %>%
|
||
select(field_id = field, date, mean_ci = FitData) %>%
|
||
filter(!is.na(mean_ci), !is.na(date), !is.na(field_id)) %>%
|
||
arrange(field_id, date)
|
||
|
||
harvest_data <- read_excel('laravel_app/storage/app/esa/Data/harvest.xlsx') %>%
|
||
mutate(
|
||
season_start = as.Date(season_start),
|
||
season_end = as.Date(season_end)
|
||
) %>%
|
||
filter(!is.na(season_end))
|
||
|
||
# ============================================================================
|
||
# GET TEST FIELD AND HARVEST EVENT
|
||
# ============================================================================
|
||
|
||
field_harvests <- harvest_data %>%
|
||
filter(field == CONFIG$test_field) %>%
|
||
arrange(season_end)
|
||
|
||
if (nrow(field_harvests) < CONFIG$test_harvest_index) {
|
||
stop("Field ", CONFIG$test_field, " does not have harvest event #", CONFIG$test_harvest_index)
|
||
}
|
||
|
||
test_harvest <- field_harvests[CONFIG$test_harvest_index, ]
|
||
harvest_date <- test_harvest$season_end
|
||
|
||
# Get previous harvest date to define season start
|
||
if (CONFIG$test_harvest_index == 1) {
|
||
season_start <- test_harvest$season_start
|
||
if (is.na(season_start)) {
|
||
# If no season_start, use earliest data for this field
|
||
season_start <- min(time_series_daily$date[time_series_daily$field_id == CONFIG$test_field])
|
||
}
|
||
} else {
|
||
season_start <- field_harvests$season_end[CONFIG$test_harvest_index - 1]
|
||
}
|
||
|
||
cat("Test harvest event:\n")
|
||
cat(" Field:", CONFIG$test_field, "\n")
|
||
cat(" Season start:", format(season_start, "%Y-%m-%d"), "\n")
|
||
cat(" Harvest date:", format(harvest_date, "%Y-%m-%d"), "\n")
|
||
cat(" Season length:", as.numeric(harvest_date - season_start), "days\n\n")
|
||
|
||
# ============================================================================
|
||
# PREPARE FIELD TIME SERIES
|
||
# ============================================================================
|
||
|
||
field_ts <- time_series_daily %>%
|
||
filter(field_id == CONFIG$test_field,
|
||
date >= season_start,
|
||
date <= harvest_date + CONFIG$days_after) %>%
|
||
arrange(date)
|
||
|
||
cat("CI data available:\n")
|
||
cat(" Total observations:", nrow(field_ts), "\n")
|
||
cat(" Date range:", format(min(field_ts$date), "%Y-%m-%d"), "to",
|
||
format(max(field_ts$date), "%Y-%m-%d"), "\n\n")
|
||
|
||
# ============================================================================
|
||
# BFAST WRAPPER FUNCTION
|
||
# ============================================================================
|
||
|
||
run_bfast_on_window <- function(ts_data, end_date, harvest_date, config = CONFIG) {
|
||
# Filter data up to end_date
|
||
window_data <- ts_data %>%
|
||
filter(date <= end_date) %>%
|
||
arrange(date)
|
||
|
||
if (nrow(window_data) < 50) {
|
||
return(list(
|
||
success = FALSE,
|
||
reason = "insufficient_data",
|
||
n_obs = nrow(window_data),
|
||
breaks_detected = 0,
|
||
harvest_detected = FALSE
|
||
))
|
||
}
|
||
|
||
# Create regular time series (fill gaps with NA for now, bfast can handle)
|
||
date_seq <- seq.Date(min(window_data$date), max(window_data$date), by = "1 day")
|
||
ts_regular <- data.frame(date = date_seq) %>%
|
||
left_join(window_data, by = "date")
|
||
|
||
# Interpolate missing values (linear interpolation)
|
||
ts_regular$mean_ci_interp <- na.approx(ts_regular$mean_ci, rule = 2)
|
||
|
||
# Convert to ts object (yearly frequency = 365 days)
|
||
start_year <- as.numeric(format(min(ts_regular$date), "%Y"))
|
||
start_doy <- as.numeric(format(min(ts_regular$date), "%j"))
|
||
|
||
ts_obj <- ts(ts_regular$mean_ci_interp,
|
||
start = c(start_year, start_doy),
|
||
frequency = 365)
|
||
|
||
# Run bfast
|
||
result <- tryCatch({
|
||
bfast_result <- bfast(ts_obj,
|
||
h = config$h,
|
||
season = config$season,
|
||
max.iter = config$max_iter,
|
||
breaks = config$breaks)
|
||
|
||
# Check if any breaks were detected in the trend component
|
||
if (!is.null(bfast_result$output[[1]]$bp.Vt) &&
|
||
length(bfast_result$output[[1]]$bp.Vt$breakpoints) > 0) {
|
||
|
||
# Get breakpoint dates
|
||
bp_indices <- bfast_result$output[[1]]$bp.Vt$breakpoints
|
||
bp_indices <- bp_indices[!is.na(bp_indices)]
|
||
|
||
if (length(bp_indices) > 0) {
|
||
bp_dates <- ts_regular$date[bp_indices]
|
||
|
||
# Check if any breakpoint is close to harvest date (within 14 days)
|
||
days_from_harvest <- as.numeric(bp_dates - harvest_date)
|
||
harvest_detected <- any(abs(days_from_harvest) <= 14)
|
||
|
||
closest_bp <- bp_dates[which.min(abs(days_from_harvest))]
|
||
closest_bp_offset <- as.numeric(closest_bp - harvest_date)
|
||
|
||
return(list(
|
||
success = TRUE,
|
||
n_obs = nrow(window_data),
|
||
breaks_detected = length(bp_dates),
|
||
breakpoint_dates = bp_dates,
|
||
days_from_harvest = days_from_harvest,
|
||
harvest_detected = harvest_detected,
|
||
closest_breakpoint = closest_bp,
|
||
closest_bp_offset = closest_bp_offset,
|
||
bfast_object = bfast_result
|
||
))
|
||
}
|
||
}
|
||
|
||
# No breaks detected
|
||
list(
|
||
success = TRUE,
|
||
n_obs = nrow(window_data),
|
||
breaks_detected = 0,
|
||
harvest_detected = FALSE
|
||
)
|
||
|
||
}, error = function(e) {
|
||
list(
|
||
success = FALSE,
|
||
reason = paste0("bfast_error: ", e$message),
|
||
n_obs = nrow(window_data),
|
||
breaks_detected = 0,
|
||
harvest_detected = FALSE
|
||
)
|
||
})
|
||
|
||
return(result)
|
||
}
|
||
|
||
# ============================================================================
|
||
# ROLLING WINDOW TEST
|
||
# ============================================================================
|
||
|
||
cat("============================================================================\n")
|
||
cat("RUNNING ROLLING WINDOW TEST\n")
|
||
cat("============================================================================\n\n")
|
||
|
||
test_dates <- seq.Date(
|
||
from = harvest_date - CONFIG$days_before,
|
||
to = harvest_date + CONFIG$days_after,
|
||
by = "1 day"
|
||
)
|
||
|
||
cat("Testing", length(test_dates), "different end dates...\n\n")
|
||
|
||
results_df <- data.frame()
|
||
|
||
pb <- txtProgressBar(min = 0, max = length(test_dates), style = 3)
|
||
|
||
for (i in 1:length(test_dates)) {
|
||
test_date <- test_dates[i]
|
||
days_from_harvest <- as.numeric(test_date - harvest_date)
|
||
|
||
result <- run_bfast_on_window(field_ts, test_date, harvest_date, CONFIG)
|
||
|
||
results_df <- bind_rows(results_df, data.frame(
|
||
test_date = test_date,
|
||
days_from_harvest = days_from_harvest,
|
||
n_obs = result$n_obs,
|
||
success = result$success,
|
||
breaks_detected = result$breaks_detected,
|
||
harvest_detected = ifelse(is.null(result$harvest_detected), FALSE, result$harvest_detected),
|
||
closest_bp_offset = ifelse(is.null(result$closest_bp_offset), NA, result$closest_bp_offset),
|
||
reason = ifelse(is.null(result$reason), "ok", result$reason)
|
||
))
|
||
|
||
setTxtProgressBar(pb, i)
|
||
}
|
||
|
||
close(pb)
|
||
|
||
# ============================================================================
|
||
# RESULTS SUMMARY
|
||
# ============================================================================
|
||
|
||
cat("\n\n============================================================================\n")
|
||
cat("RESULTS SUMMARY\n")
|
||
cat("============================================================================\n\n")
|
||
|
||
cat("Total tests run:", nrow(results_df), "\n")
|
||
cat("Successful bfast runs:", sum(results_df$success), "\n")
|
||
cat("Failed runs:", sum(!results_df$success), "\n\n")
|
||
|
||
if (any(!results_df$success)) {
|
||
cat("Failure reasons:\n")
|
||
failure_summary <- results_df %>%
|
||
filter(!success) %>%
|
||
count(reason) %>%
|
||
arrange(desc(n))
|
||
print(failure_summary, row.names = FALSE)
|
||
cat("\n")
|
||
}
|
||
|
||
# When was harvest first detected?
|
||
first_detection <- results_df %>%
|
||
filter(harvest_detected == TRUE) %>%
|
||
arrange(days_from_harvest) %>%
|
||
slice(1)
|
||
|
||
if (nrow(first_detection) > 0) {
|
||
cat("🎯 FIRST HARVEST DETECTION:\n")
|
||
cat(" Test date:", format(first_detection$test_date, "%Y-%m-%d"), "\n")
|
||
cat(" Days from actual harvest:", first_detection$days_from_harvest, "\n")
|
||
cat(" Observations used:", first_detection$n_obs, "\n")
|
||
cat(" Total breaks detected:", first_detection$breaks_detected, "\n")
|
||
cat(" Closest breakpoint offset:", first_detection$closest_bp_offset, "days\n\n")
|
||
|
||
if (first_detection$days_from_harvest < 0) {
|
||
cat(" ✓ Detected", abs(first_detection$days_from_harvest), "days BEFORE actual harvest\n")
|
||
} else if (first_detection$days_from_harvest == 0) {
|
||
cat(" ✓ Detected ON harvest day\n")
|
||
} else {
|
||
cat(" ℹ Detected", first_detection$days_from_harvest, "days AFTER actual harvest\n")
|
||
}
|
||
cat("\n")
|
||
} else {
|
||
cat("❌ Harvest NOT detected in any test window\n\n")
|
||
}
|
||
|
||
# Show detection pattern over time
|
||
cat("============================================================================\n")
|
||
cat("DETECTION PATTERN DAY-BY-DAY\n")
|
||
cat("============================================================================\n\n")
|
||
|
||
detection_summary <- results_df %>%
|
||
mutate(
|
||
status = case_when(
|
||
!success ~ paste0("FAIL: ", reason),
|
||
harvest_detected ~ paste0("✓ DETECTED (", breaks_detected, " breaks)"),
|
||
breaks_detected > 0 ~ paste0("○ ", breaks_detected, " breaks (not harvest)"),
|
||
TRUE ~ "○ No breaks"
|
||
)
|
||
) %>%
|
||
select(
|
||
Date = test_date,
|
||
Days_from_harvest = days_from_harvest,
|
||
N_obs = n_obs,
|
||
Status = status,
|
||
BP_offset = closest_bp_offset
|
||
)
|
||
|
||
print(as.data.frame(detection_summary), row.names = FALSE)
|
||
|
||
# ============================================================================
|
||
# VISUALIZATION
|
||
# ============================================================================
|
||
|
||
cat("\n\n============================================================================\n")
|
||
cat("GENERATING VISUALIZATION\n")
|
||
cat("============================================================================\n\n")
|
||
|
||
# Plot 1: CI time series with harvest date and detection windows
|
||
p1 <- ggplot(field_ts, aes(x = date, y = mean_ci)) +
|
||
geom_line(color = "darkgreen", linewidth = 1) +
|
||
geom_vline(xintercept = harvest_date, color = "red", linetype = "dashed", linewidth = 1) +
|
||
geom_vline(xintercept = harvest_date - CONFIG$days_before, color = "blue", linetype = "dotted") +
|
||
geom_vline(xintercept = harvest_date + CONFIG$days_after, color = "blue", linetype = "dotted") +
|
||
annotate("text", x = harvest_date, y = max(field_ts$mean_ci, na.rm = TRUE),
|
||
label = "Actual Harvest", vjust = -0.5, color = "red") +
|
||
labs(
|
||
title = paste0("Field ", CONFIG$test_field, " - CI Time Series"),
|
||
subtitle = paste0("Season: ", format(season_start, "%Y-%m-%d"), " to ",
|
||
format(harvest_date + CONFIG$days_after, "%Y-%m-%d")),
|
||
x = "Date",
|
||
y = "CI Value"
|
||
) +
|
||
theme_minimal() +
|
||
theme(plot.title = element_text(face = "bold"))
|
||
|
||
# Plot 2: Detection success over test window
|
||
p2 <- ggplot(results_df %>% filter(success),
|
||
aes(x = days_from_harvest, y = as.numeric(harvest_detected))) +
|
||
geom_line(color = "blue", linewidth = 1) +
|
||
geom_point(aes(color = harvest_detected), size = 2) +
|
||
geom_vline(xintercept = 0, color = "red", linetype = "dashed") +
|
||
scale_color_manual(values = c("FALSE" = "gray", "TRUE" = "green")) +
|
||
labs(
|
||
title = "bfast Harvest Detection Over Time",
|
||
subtitle = paste0("Field ", CONFIG$test_field, " - Harvest ", CONFIG$test_harvest_index),
|
||
x = "Days from Actual Harvest",
|
||
y = "Harvest Detected",
|
||
color = "Detection"
|
||
) +
|
||
theme_minimal() +
|
||
theme(plot.title = element_text(face = "bold"))
|
||
|
||
# Plot 3: Number of breaks detected
|
||
p3 <- ggplot(results_df %>% filter(success),
|
||
aes(x = days_from_harvest, y = breaks_detected)) +
|
||
geom_line(color = "purple", linewidth = 1) +
|
||
geom_point(aes(color = harvest_detected), size = 2) +
|
||
geom_vline(xintercept = 0, color = "red", linetype = "dashed") +
|
||
scale_color_manual(values = c("FALSE" = "gray", "TRUE" = "green")) +
|
||
labs(
|
||
title = "Number of Breakpoints Detected",
|
||
x = "Days from Actual Harvest",
|
||
y = "Breaks Detected",
|
||
color = "Harvest\nDetected"
|
||
) +
|
||
theme_minimal() +
|
||
theme(plot.title = element_text(face = "bold"))
|
||
|
||
# Save plots
|
||
output_dir <- here("r_app/experiments/harvest_prediction")
|
||
ggsave(file.path(output_dir, "bfast_ci_timeseries.png"), p1, width = 10, height = 6, dpi = 300)
|
||
ggsave(file.path(output_dir, "bfast_detection_pattern.png"), p2, width = 10, height = 6, dpi = 300)
|
||
ggsave(file.path(output_dir, "bfast_breaks_count.png"), p3, width = 10, height = 6, dpi = 300)
|
||
|
||
cat("Plots saved to:", output_dir, "\n")
|
||
cat(" - bfast_ci_timeseries.png\n")
|
||
cat(" - bfast_detection_pattern.png\n")
|
||
cat(" - bfast_breaks_count.png\n\n")
|
||
|
||
# ============================================================================
|
||
# SAVE RESULTS
|
||
# ============================================================================
|
||
|
||
output_file <- file.path(output_dir, "bfast_rolling_results.rds")
|
||
saveRDS(list(
|
||
config = CONFIG,
|
||
field = CONFIG$test_field,
|
||
harvest_date = harvest_date,
|
||
season_start = season_start,
|
||
results = results_df,
|
||
field_ts = field_ts
|
||
), output_file)
|
||
|
||
cat("Results saved to:", output_file, "\n\n")
|
||
|
||
cat("============================================================================\n")
|
||
cat("ANALYSIS COMPLETE\n")
|
||
cat("============================================================================\n")
|