SmartCane/r_app/experiments/harvest_prediction/test_bfastmonitor.R
2026-01-06 14:17:37 +01:00

393 lines
13 KiB
R

# ============================================================================
# BFAST MONITOR TEST - Real-time Harvest Detection
# ============================================================================
# Use bfastmonitor() which is designed for:
# - A stable historical baseline period
# - Monitoring recent period for breaks
# - Real-time change detection
# ============================================================================
suppressPackageStartupMessages({
library(readxl)
library(dplyr)
library(tidyr)
library(lubridate)
library(here)
library(bfast)
library(zoo)
library(ggplot2)
})
# Set project directory
project_dir <- "esa"
assign("project_dir", project_dir, envir = .GlobalEnv)
if (basename(getwd()) == "harvest_prediction") {
setwd("../../..")
}
source(here("r_app", "parameters_project.R"))
# ============================================================================
# CONFIGURATION
# ============================================================================
CONFIG <- list(
# FIELD SELECTION - Change these to test different fields
test_field = "00302", # Try: 00007, 00104, 00119, 00120, etc.
test_harvest_index = 3, # Which harvest event (1 = first, 2 = second, etc.)
# HISTORY/MONITORING SPLIT
history_days = 300, # Fixed: use first 300 days as baseline history
# Analysis extends to this many days after harvest
days_after_harvest = 20,
# SMOOTHING - Test different values
smoothing_windows = c(1, 7, 14), # 1 = no smoothing, 7 = weekly, 14 = biweekly
# BFASTMONITOR PARAMETERS (tweakable!)
# 1. Formula: how to model the baseline
# - response ~ trend : simple linear trend
# - response ~ trend + harmon : add seasonal harmonics
formula = response ~ trend,
# 2. Order: if using harmon, how many harmonics (1-3 typical)
order = 1,
# 3. Type: type of monitoring process
# - "OLS-MOSUM" (default): moving sum (good for gradual changes)
# - "OLS-CUSUM": cumulative sum (good for abrupt changes)
# - "RE": recursive estimates
# - "ME": moving estimates
type = "OLS-MOSUM",
# 4. h: bandwidth for moving/recursive estimates (0.15-0.5 typical)
# smaller = more sensitive to recent changes
h = 0.25,
# 5. level: significance level for break detection (0.01-0.1)
# lower = stricter (fewer false positives)
level = 0.05
)
cat("============================================================================\n")
cat("BFAST MONITOR - REAL-TIME HARVEST DETECTION\n")
cat("============================================================================\n\n")
# ============================================================================
# LOAD DATA
# ============================================================================
ci_rds_file <- here("laravel_app/storage/app", project_dir,
"Data/extracted_ci/cumulative_vals/All_pivots_Cumulative_CI_quadrant_year_v2.rds")
ci_data_raw <- readRDS(ci_rds_file) %>% ungroup()
time_series_daily <- ci_data_raw %>%
mutate(date = as.Date(Date)) %>%
select(field_id = field, date, mean_ci = FitData) %>%
filter(!is.na(mean_ci), !is.na(date), !is.na(field_id)) %>%
arrange(field_id, date)
harvest_data <- read_excel('laravel_app/storage/app/esa/Data/harvest.xlsx') %>%
mutate(
season_start = as.Date(season_start),
season_end = as.Date(season_end)
) %>%
filter(!is.na(season_end))
# Check which fields have CI data
fields_with_ci <- unique(time_series_daily$field_id)
cat("Fields with CI data:", length(fields_with_ci), "\n")
cat("Sample fields:", paste(head(fields_with_ci, 20), collapse = ", "), "\n\n")
# Filter harvest data to only fields that have CI data
harvest_data_with_ci <- harvest_data %>%
filter(field %in% fields_with_ci)
cat("Available fields with BOTH harvest data AND CI data:\n")
field_summary <- harvest_data_with_ci %>%
group_by(field) %>%
summarise(
n_harvests = n(),
first_harvest = min(season_end),
last_harvest = max(season_end),
.groups = "drop"
) %>%
arrange(field)
print(field_summary, n = 50)
cat("\n")
# Get test field and harvest
field_harvests <- harvest_data_with_ci %>%
filter(field == CONFIG$test_field) %>%
arrange(season_end)
if (nrow(field_harvests) == 0) {
stop("Field ", CONFIG$test_field, " does not have both CI data and harvest records. ",
"Please choose from the fields listed above.")
}
if (nrow(field_harvests) < CONFIG$test_harvest_index) {
stop("Field ", CONFIG$test_field, " only has ", nrow(field_harvests),
" harvest events, but you requested harvest #", CONFIG$test_harvest_index)
}
test_harvest <- field_harvests[CONFIG$test_harvest_index, ]
harvest_date <- test_harvest$season_end
if (CONFIG$test_harvest_index == 1) {
season_start <- test_harvest$season_start
if (is.na(season_start)) {
season_start <- min(time_series_daily$date[time_series_daily$field_id == CONFIG$test_field])
}
} else {
season_start <- field_harvests$season_end[CONFIG$test_harvest_index - 1]
}
# Prepare field time series
end_date <- harvest_date + CONFIG$days_after_harvest
field_ts <- time_series_daily %>%
filter(field_id == CONFIG$test_field,
date >= season_start,
date <= end_date) %>%
arrange(date)
season_length <- as.numeric(harvest_date - season_start)
history_days <- CONFIG$history_days # Use fixed 300 days
history_end <- season_start + history_days
cat("Field:", CONFIG$test_field, "\n")
cat("Season start:", format(season_start, "%Y-%m-%d"), "\n")
cat("Harvest date:", format(harvest_date, "%Y-%m-%d"), "\n")
cat("Analysis end:", format(end_date, "%Y-%m-%d"), "\n")
cat("Season length:", season_length, "days\n\n")
cat("History period (baseline):", format(season_start, "%Y-%m-%d"), "to",
format(history_end, "%Y-%m-%d"), "(", history_days, "days)\n")
cat("Monitoring period:", format(history_end + 1, "%Y-%m-%d"), "to",
format(end_date, "%Y-%m-%d"), "\n\n")
# ============================================================================
# PREPARE TIME SERIES
# ============================================================================
# Create regular time series
date_seq <- seq.Date(min(field_ts$date), max(field_ts$date), by = "1 day")
ts_regular <- data.frame(date = date_seq) %>%
left_join(field_ts, by = "date")
# Interpolate missing values
ts_regular$mean_ci_interp <- na.approx(ts_regular$mean_ci, rule = 2)
# ============================================================================
# TEST DIFFERENT SMOOTHING WINDOWS
# ============================================================================
cat("============================================================================\n")
cat("TESTING DIFFERENT SMOOTHING WINDOWS\n")
cat("============================================================================\n\n")
all_results <- list()
for (smooth_window in CONFIG$smoothing_windows) {
cat("----------------------------------------------------------------------------\n")
cat("SMOOTHING WINDOW:", smooth_window, "days\n")
cat("----------------------------------------------------------------------------\n\n")
# Apply smoothing
if (smooth_window == 1) {
ts_regular$mean_ci_smooth <- ts_regular$mean_ci_interp
smooth_label <- "No smoothing"
} else {
ts_regular$mean_ci_smooth <- rollmean(ts_regular$mean_ci_interp,
k = smooth_window,
fill = NA,
align = "center")
# Fill NAs at edges
ts_regular$mean_ci_smooth <- na.approx(ts_regular$mean_ci_smooth, rule = 2)
smooth_label <- paste0(smooth_window, "-day moving average")
}
# Convert to ts object
start_year <- as.numeric(format(min(ts_regular$date), "%Y"))
start_doy <- as.numeric(format(min(ts_regular$date), "%j"))
ts_obj <- ts(ts_regular$mean_ci_smooth,
start = c(start_year, start_doy),
frequency = 365)
# Calculate start point for monitoring period
history_start_decimal <- as.numeric(start_year) + (start_doy - 1) / 365
history_end_decimal <- history_start_decimal + (history_days / 365)
cat("Time series with", smooth_label, "\n")
cat(" Length:", length(ts_obj), "observations\n\n")
# ============================================================================
# RUN BFASTMONITOR
# ============================================================================
tryCatch({
# Run bfastmonitor
bfm_result <- bfastmonitor(
data = ts_obj,
start = history_end_decimal,
formula = CONFIG$formula,
order = CONFIG$order,
type = CONFIG$type,
h = CONFIG$h,
level = CONFIG$level
)
cat("bfastmonitor completed successfully\n\n")
# Check if break was detected
if (!is.na(bfm_result$breakpoint)) {
# Convert breakpoint back to date
bp_decimal <- bfm_result$breakpoint
bp_year <- floor(bp_decimal)
bp_doy <- round((bp_decimal - bp_year) * 365) + 1
bp_date <- as.Date(paste0(bp_year, "-01-01")) + bp_doy - 1
days_from_harvest <- as.numeric(bp_date - harvest_date)
cat("✓ BREAKPOINT DETECTED\n")
cat(" Break date:", format(bp_date, "%Y-%m-%d"), "\n")
cat(" Days from harvest:", days_from_harvest, "\n")
if (abs(days_from_harvest) <= 7) {
cat(" >>> ✓✓✓ HARVEST DETECTED WITHIN 7 DAYS! ✓✓✓ <<<\n")
} else if (abs(days_from_harvest) <= 14) {
cat(" >>> ✓ HARVEST DETECTED WITHIN 14 DAYS <<<\n")
} else if (days_from_harvest < 0) {
cat(" Break occurred", abs(days_from_harvest), "days BEFORE harvest\n")
} else {
cat(" Break occurred", days_from_harvest, "days AFTER harvest\n")
}
cat(" Break magnitude:", round(bfm_result$magnitude, 3), "\n\n")
# Store results
all_results[[as.character(smooth_window)]] <- list(
smooth_window = smooth_window,
smooth_label = smooth_label,
break_detected = TRUE,
break_date = bp_date,
days_from_harvest = days_from_harvest,
magnitude = bfm_result$magnitude,
bfm_result = bfm_result
)
} else {
cat("❌ No breakpoint detected in monitoring period\n\n")
all_results[[as.character(smooth_window)]] <- list(
smooth_window = smooth_window,
smooth_label = smooth_label,
break_detected = FALSE
)
}
}, error = function(e) {
cat("ERROR:", e$message, "\n\n")
all_results[[as.character(smooth_window)]] <- list(
smooth_window = smooth_window,
smooth_label = smooth_label,
error = e$message
)
})
cat("\n")
}
# ============================================================================
# SUMMARY OF ALL SMOOTHING TESTS
# ============================================================================
cat("============================================================================\n")
cat("SUMMARY - EFFECT OF SMOOTHING ON HARVEST DETECTION\n")
cat("============================================================================\n\n")
summary_df <- data.frame()
for (sw in names(all_results)) {
result <- all_results[[sw]]
if (!is.null(result$error)) {
summary_df <- rbind(summary_df, data.frame(
smoothing_window = result$smooth_window,
label = result$smooth_label,
break_detected = "ERROR",
break_date = NA,
days_from_harvest = NA,
magnitude = NA
))
} else if (result$break_detected) {
summary_df <- rbind(summary_df, data.frame(
smoothing_window = result$smooth_window,
label = result$smooth_label,
break_detected = "YES",
break_date = format(result$break_date, "%Y-%m-%d"),
days_from_harvest = result$days_from_harvest,
magnitude = round(result$magnitude, 3)
))
} else {
summary_df <- rbind(summary_df, data.frame(
smoothing_window = result$smooth_window,
label = result$smooth_label,
break_detected = "NO",
break_date = NA,
days_from_harvest = NA,
magnitude = NA
))
}
}
print(summary_df, row.names = FALSE)
cat("\n")
# Find best result (closest to harvest)
best_result <- NULL
min_days <- Inf
for (result in all_results) {
if (!is.null(result$days_from_harvest) && !is.na(result$days_from_harvest)) {
if (abs(result$days_from_harvest) < min_days) {
min_days <- abs(result$days_from_harvest)
best_result <- result
}
}
}
if (!is.null(best_result)) {
cat("BEST RESULT:\n")
cat(" Smoothing:", best_result$smooth_label, "\n")
cat(" Break date:", format(best_result$break_date, "%Y-%m-%d"), "\n")
cat(" Days from harvest:", best_result$days_from_harvest, "\n")
cat(" Magnitude:", round(best_result$magnitude, 3), "\n\n")
# Save plot for best result
output_dir <- here("r_app/experiments/harvest_prediction")
png(file.path(output_dir, "bfastmonitor_best_smoothing.png"),
width = 12, height = 8, units = "in", res = 300)
plot(best_result$bfm_result,
main = paste0("bfastmonitor - Field ", CONFIG$test_field,
" (", best_result$smooth_label, ")"))
abline(v = decimal_date(harvest_date), col = "red", lty = 2, lwd = 2)
legend("topleft", legend = c("Actual Harvest"), col = "red", lty = 2, lwd = 2)
dev.off()
cat("Saved best result plot: bfastmonitor_best_smoothing.png\n")
}
cat("\n============================================================================\n")
cat("ANALYSIS COMPLETE\n")
cat("============================================================================\n")