SmartCane/r_app/experiments/harvest_prediction/test_bfast_rolling.R
2026-01-06 14:17:37 +01:00

431 lines
16 KiB
R
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# ============================================================================
# BFAST ROLLING WINDOW TEST - Single Season Analysis
# ============================================================================
# Test bfast's ability to detect harvest breakpoints progressively
# For each day from -20 to +20 days around harvest:
# 1. Load CI data from previous harvest to current test day
# 2. Run bfast to detect breaks
# 3. Check if harvest date is detected as a breakpoint
# ============================================================================
suppressPackageStartupMessages({
library(readxl)
library(dplyr)
library(tidyr)
library(lubridate)
library(here)
library(bfast)
library(zoo)
library(ggplot2)
})
# Set project directory
project_dir <- "esa"
assign("project_dir", project_dir, envir = .GlobalEnv)
# Navigate to project root if in experiments folder
if (basename(getwd()) == "harvest_prediction") {
setwd("../../..")
}
source(here("r_app", "parameters_project.R"))
# ============================================================================
# CONFIGURATION
# ============================================================================
CONFIG <- list(
test_field = "KHWC", # Field to test
test_harvest_index = 2, # Which harvest event (1 = first, 2 = second, etc.)
days_before = 20, # Days before harvest to start testing
days_after = 20, # Days after harvest to end testing
# bfast parameters
h = 0.15, # Minimal segment size (15% of data)
season = "none", # Seasonal model type ("dummy", "harmonic", "none")
max_iter = 10, # Maximum iterations
breaks = NULL # Let bfast determine number of breaks
)
cat("============================================================================\n")
cat("BFAST ROLLING WINDOW TEST - PROGRESSIVE HARVEST DETECTION\n")
cat("============================================================================\n\n")
cat("Test configuration:\n")
cat(" Field:", CONFIG$test_field, "\n")
cat(" Harvest event:", CONFIG$test_harvest_index, "\n")
cat(" Test window:", CONFIG$days_before, "days before to", CONFIG$days_after, "days after\n")
cat(" bfast h parameter:", CONFIG$h, "\n")
cat(" Seasonal model:", CONFIG$season, "\n\n")
# ============================================================================
# LOAD DATA
# ============================================================================
cat("=== LOADING DATA ===\n\n")
ci_rds_file <- here("laravel_app/storage/app", project_dir,
"Data/extracted_ci/cumulative_vals/All_pivots_Cumulative_CI_quadrant_year_v2.rds")
ci_data_raw <- readRDS(ci_rds_file) %>% ungroup()
time_series_daily <- ci_data_raw %>%
mutate(date = as.Date(Date)) %>%
select(field_id = field, date, mean_ci = FitData) %>%
filter(!is.na(mean_ci), !is.na(date), !is.na(field_id)) %>%
arrange(field_id, date)
harvest_data <- read_excel('laravel_app/storage/app/esa/Data/harvest.xlsx') %>%
mutate(
season_start = as.Date(season_start),
season_end = as.Date(season_end)
) %>%
filter(!is.na(season_end))
# ============================================================================
# GET TEST FIELD AND HARVEST EVENT
# ============================================================================
field_harvests <- harvest_data %>%
filter(field == CONFIG$test_field) %>%
arrange(season_end)
if (nrow(field_harvests) < CONFIG$test_harvest_index) {
stop("Field ", CONFIG$test_field, " does not have harvest event #", CONFIG$test_harvest_index)
}
test_harvest <- field_harvests[CONFIG$test_harvest_index, ]
harvest_date <- test_harvest$season_end
# Get previous harvest date to define season start
if (CONFIG$test_harvest_index == 1) {
season_start <- test_harvest$season_start
if (is.na(season_start)) {
# If no season_start, use earliest data for this field
season_start <- min(time_series_daily$date[time_series_daily$field_id == CONFIG$test_field])
}
} else {
season_start <- field_harvests$season_end[CONFIG$test_harvest_index - 1]
}
cat("Test harvest event:\n")
cat(" Field:", CONFIG$test_field, "\n")
cat(" Season start:", format(season_start, "%Y-%m-%d"), "\n")
cat(" Harvest date:", format(harvest_date, "%Y-%m-%d"), "\n")
cat(" Season length:", as.numeric(harvest_date - season_start), "days\n\n")
# ============================================================================
# PREPARE FIELD TIME SERIES
# ============================================================================
field_ts <- time_series_daily %>%
filter(field_id == CONFIG$test_field,
date >= season_start,
date <= harvest_date + CONFIG$days_after) %>%
arrange(date)
cat("CI data available:\n")
cat(" Total observations:", nrow(field_ts), "\n")
cat(" Date range:", format(min(field_ts$date), "%Y-%m-%d"), "to",
format(max(field_ts$date), "%Y-%m-%d"), "\n\n")
# ============================================================================
# BFAST WRAPPER FUNCTION
# ============================================================================
run_bfast_on_window <- function(ts_data, end_date, harvest_date, config = CONFIG) {
# Filter data up to end_date
window_data <- ts_data %>%
filter(date <= end_date) %>%
arrange(date)
if (nrow(window_data) < 50) {
return(list(
success = FALSE,
reason = "insufficient_data",
n_obs = nrow(window_data),
breaks_detected = 0,
harvest_detected = FALSE
))
}
# Create regular time series (fill gaps with NA for now, bfast can handle)
date_seq <- seq.Date(min(window_data$date), max(window_data$date), by = "1 day")
ts_regular <- data.frame(date = date_seq) %>%
left_join(window_data, by = "date")
# Interpolate missing values (linear interpolation)
ts_regular$mean_ci_interp <- na.approx(ts_regular$mean_ci, rule = 2)
# Convert to ts object (yearly frequency = 365 days)
start_year <- as.numeric(format(min(ts_regular$date), "%Y"))
start_doy <- as.numeric(format(min(ts_regular$date), "%j"))
ts_obj <- ts(ts_regular$mean_ci_interp,
start = c(start_year, start_doy),
frequency = 365)
# Run bfast
result <- tryCatch({
bfast_result <- bfast(ts_obj,
h = config$h,
season = config$season,
max.iter = config$max_iter,
breaks = config$breaks)
# Check if any breaks were detected in the trend component
if (!is.null(bfast_result$output[[1]]$bp.Vt) &&
length(bfast_result$output[[1]]$bp.Vt$breakpoints) > 0) {
# Get breakpoint dates
bp_indices <- bfast_result$output[[1]]$bp.Vt$breakpoints
bp_indices <- bp_indices[!is.na(bp_indices)]
if (length(bp_indices) > 0) {
bp_dates <- ts_regular$date[bp_indices]
# Check if any breakpoint is close to harvest date (within 14 days)
days_from_harvest <- as.numeric(bp_dates - harvest_date)
harvest_detected <- any(abs(days_from_harvest) <= 14)
closest_bp <- bp_dates[which.min(abs(days_from_harvest))]
closest_bp_offset <- as.numeric(closest_bp - harvest_date)
return(list(
success = TRUE,
n_obs = nrow(window_data),
breaks_detected = length(bp_dates),
breakpoint_dates = bp_dates,
days_from_harvest = days_from_harvest,
harvest_detected = harvest_detected,
closest_breakpoint = closest_bp,
closest_bp_offset = closest_bp_offset,
bfast_object = bfast_result
))
}
}
# No breaks detected
list(
success = TRUE,
n_obs = nrow(window_data),
breaks_detected = 0,
harvest_detected = FALSE
)
}, error = function(e) {
list(
success = FALSE,
reason = paste0("bfast_error: ", e$message),
n_obs = nrow(window_data),
breaks_detected = 0,
harvest_detected = FALSE
)
})
return(result)
}
# ============================================================================
# ROLLING WINDOW TEST
# ============================================================================
cat("============================================================================\n")
cat("RUNNING ROLLING WINDOW TEST\n")
cat("============================================================================\n\n")
test_dates <- seq.Date(
from = harvest_date - CONFIG$days_before,
to = harvest_date + CONFIG$days_after,
by = "1 day"
)
cat("Testing", length(test_dates), "different end dates...\n\n")
results_df <- data.frame()
pb <- txtProgressBar(min = 0, max = length(test_dates), style = 3)
for (i in 1:length(test_dates)) {
test_date <- test_dates[i]
days_from_harvest <- as.numeric(test_date - harvest_date)
result <- run_bfast_on_window(field_ts, test_date, harvest_date, CONFIG)
results_df <- bind_rows(results_df, data.frame(
test_date = test_date,
days_from_harvest = days_from_harvest,
n_obs = result$n_obs,
success = result$success,
breaks_detected = result$breaks_detected,
harvest_detected = ifelse(is.null(result$harvest_detected), FALSE, result$harvest_detected),
closest_bp_offset = ifelse(is.null(result$closest_bp_offset), NA, result$closest_bp_offset),
reason = ifelse(is.null(result$reason), "ok", result$reason)
))
setTxtProgressBar(pb, i)
}
close(pb)
# ============================================================================
# RESULTS SUMMARY
# ============================================================================
cat("\n\n============================================================================\n")
cat("RESULTS SUMMARY\n")
cat("============================================================================\n\n")
cat("Total tests run:", nrow(results_df), "\n")
cat("Successful bfast runs:", sum(results_df$success), "\n")
cat("Failed runs:", sum(!results_df$success), "\n\n")
if (any(!results_df$success)) {
cat("Failure reasons:\n")
failure_summary <- results_df %>%
filter(!success) %>%
count(reason) %>%
arrange(desc(n))
print(failure_summary, row.names = FALSE)
cat("\n")
}
# When was harvest first detected?
first_detection <- results_df %>%
filter(harvest_detected == TRUE) %>%
arrange(days_from_harvest) %>%
slice(1)
if (nrow(first_detection) > 0) {
cat("🎯 FIRST HARVEST DETECTION:\n")
cat(" Test date:", format(first_detection$test_date, "%Y-%m-%d"), "\n")
cat(" Days from actual harvest:", first_detection$days_from_harvest, "\n")
cat(" Observations used:", first_detection$n_obs, "\n")
cat(" Total breaks detected:", first_detection$breaks_detected, "\n")
cat(" Closest breakpoint offset:", first_detection$closest_bp_offset, "days\n\n")
if (first_detection$days_from_harvest < 0) {
cat(" ✓ Detected", abs(first_detection$days_from_harvest), "days BEFORE actual harvest\n")
} else if (first_detection$days_from_harvest == 0) {
cat(" ✓ Detected ON harvest day\n")
} else {
cat(" Detected", first_detection$days_from_harvest, "days AFTER actual harvest\n")
}
cat("\n")
} else {
cat("❌ Harvest NOT detected in any test window\n\n")
}
# Show detection pattern over time
cat("============================================================================\n")
cat("DETECTION PATTERN DAY-BY-DAY\n")
cat("============================================================================\n\n")
detection_summary <- results_df %>%
mutate(
status = case_when(
!success ~ paste0("FAIL: ", reason),
harvest_detected ~ paste0("✓ DETECTED (", breaks_detected, " breaks)"),
breaks_detected > 0 ~ paste0("○ ", breaks_detected, " breaks (not harvest)"),
TRUE ~ "○ No breaks"
)
) %>%
select(
Date = test_date,
Days_from_harvest = days_from_harvest,
N_obs = n_obs,
Status = status,
BP_offset = closest_bp_offset
)
print(as.data.frame(detection_summary), row.names = FALSE)
# ============================================================================
# VISUALIZATION
# ============================================================================
cat("\n\n============================================================================\n")
cat("GENERATING VISUALIZATION\n")
cat("============================================================================\n\n")
# Plot 1: CI time series with harvest date and detection windows
p1 <- ggplot(field_ts, aes(x = date, y = mean_ci)) +
geom_line(color = "darkgreen", linewidth = 1) +
geom_vline(xintercept = harvest_date, color = "red", linetype = "dashed", linewidth = 1) +
geom_vline(xintercept = harvest_date - CONFIG$days_before, color = "blue", linetype = "dotted") +
geom_vline(xintercept = harvest_date + CONFIG$days_after, color = "blue", linetype = "dotted") +
annotate("text", x = harvest_date, y = max(field_ts$mean_ci, na.rm = TRUE),
label = "Actual Harvest", vjust = -0.5, color = "red") +
labs(
title = paste0("Field ", CONFIG$test_field, " - CI Time Series"),
subtitle = paste0("Season: ", format(season_start, "%Y-%m-%d"), " to ",
format(harvest_date + CONFIG$days_after, "%Y-%m-%d")),
x = "Date",
y = "CI Value"
) +
theme_minimal() +
theme(plot.title = element_text(face = "bold"))
# Plot 2: Detection success over test window
p2 <- ggplot(results_df %>% filter(success),
aes(x = days_from_harvest, y = as.numeric(harvest_detected))) +
geom_line(color = "blue", linewidth = 1) +
geom_point(aes(color = harvest_detected), size = 2) +
geom_vline(xintercept = 0, color = "red", linetype = "dashed") +
scale_color_manual(values = c("FALSE" = "gray", "TRUE" = "green")) +
labs(
title = "bfast Harvest Detection Over Time",
subtitle = paste0("Field ", CONFIG$test_field, " - Harvest ", CONFIG$test_harvest_index),
x = "Days from Actual Harvest",
y = "Harvest Detected",
color = "Detection"
) +
theme_minimal() +
theme(plot.title = element_text(face = "bold"))
# Plot 3: Number of breaks detected
p3 <- ggplot(results_df %>% filter(success),
aes(x = days_from_harvest, y = breaks_detected)) +
geom_line(color = "purple", linewidth = 1) +
geom_point(aes(color = harvest_detected), size = 2) +
geom_vline(xintercept = 0, color = "red", linetype = "dashed") +
scale_color_manual(values = c("FALSE" = "gray", "TRUE" = "green")) +
labs(
title = "Number of Breakpoints Detected",
x = "Days from Actual Harvest",
y = "Breaks Detected",
color = "Harvest\nDetected"
) +
theme_minimal() +
theme(plot.title = element_text(face = "bold"))
# Save plots
output_dir <- here("r_app/experiments/harvest_prediction")
ggsave(file.path(output_dir, "bfast_ci_timeseries.png"), p1, width = 10, height = 6, dpi = 300)
ggsave(file.path(output_dir, "bfast_detection_pattern.png"), p2, width = 10, height = 6, dpi = 300)
ggsave(file.path(output_dir, "bfast_breaks_count.png"), p3, width = 10, height = 6, dpi = 300)
cat("Plots saved to:", output_dir, "\n")
cat(" - bfast_ci_timeseries.png\n")
cat(" - bfast_detection_pattern.png\n")
cat(" - bfast_breaks_count.png\n\n")
# ============================================================================
# SAVE RESULTS
# ============================================================================
output_file <- file.path(output_dir, "bfast_rolling_results.rds")
saveRDS(list(
config = CONFIG,
field = CONFIG$test_field,
harvest_date = harvest_date,
season_start = season_start,
results = results_df,
field_ts = field_ts
), output_file)
cat("Results saved to:", output_file, "\n\n")
cat("============================================================================\n")
cat("ANALYSIS COMPLETE\n")
cat("============================================================================\n")