SmartCane/r_app/experiments/harvest_prediction/detect_harvest_retrospective_bfast.R
2026-01-06 14:17:37 +01:00

613 lines
21 KiB
R

# ============================================================================
# RETROSPECTIVE HARVEST DETECTION USING BFAST
# ============================================================================
# Purpose: Detect ALL historical harvest dates across complete CI time series
#
# Approach:
# - Use full BFAST algorithm on complete time series (not real-time monitoring)
# - Detect structural breaks = harvest events
# - No need for immediate detection - can wait weeks for confirmation
# - Output: Historical harvest database for field age calculation
#
# Key difference from real-time approaches:
# - THEN: "Did this field harvest yesterday?" (hard, incomplete data)
# - NOW: "When did this field harvest in the past?" (easier, full data)
#
# Usage: Rscript detect_harvest_retrospective_bfast.R [--validate]
# --validate: Optional flag to compare against harvest.xlsx (testing only)
# ============================================================================
suppressPackageStartupMessages({
library(readxl)
library(dplyr)
library(tidyr)
library(lubridate)
library(here)
library(bfast)
library(zoo)
library(ggplot2)
})
# ============================================================================
# CONFIGURATION
# ============================================================================
CONFIG <- list(
# DATA SOURCE
project_dir = "esa", # ESA WorldCover data (Planet imagery)
# BFAST PARAMETERS (tuned for sugarcane harvest detection)
h = 0.15, # Minimum segment size (15% of data = ~2 harvests/year possible)
season = "harmonic", # Model seasonal growth patterns
max_iter = 10, # Iterative refinement
breaks = NULL, # Auto-detect number of breaks
# PREPROCESSING
smoothing_window = 7, # Days for rolling median (reduce noise)
min_observations = 200, # Minimum data points needed per field
# HARVEST CLASSIFICATION (post-processing breakpoints)
min_harvest_interval = 180, # Days between harvests (6 months minimum)
ci_drop_threshold = -0.5, # Minimum CI drop to consider as harvest
magnitude_threshold = 0.3, # Minimum break magnitude
# OUTPUT
save_plots = TRUE,
max_plots = 10, # Limit number of diagnostic plots saved
# VALIDATION (optional - only for testing)
validate = TRUE # Set to TRUE to compare against harvest.xlsx
)
# Process command line arguments
args <- commandArgs(trailingOnly = TRUE)
if ("--validate" %in% args) {
CONFIG$validate <- TRUE
cat("Validation mode enabled - will compare against harvest.xlsx\n\n")
}
# Set project directory globally
project_dir <- CONFIG$project_dir
assign("project_dir", project_dir, envir = .GlobalEnv)
# Navigate to project root if in experiments folder
if (basename(getwd()) == "harvest_prediction") {
setwd("../../..")
}
source(here("r_app", "parameters_project.R"))
cat("============================================================================\n")
cat("RETROSPECTIVE HARVEST DETECTION USING BFAST\n")
cat("============================================================================\n\n")
cat("Configuration:\n")
cat(" Data source:", CONFIG$project_dir, "\n")
cat(" BFAST h parameter:", CONFIG$h, "\n")
cat(" Seasonal model:", CONFIG$season, "\n")
cat(" Smoothing window:", CONFIG$smoothing_window, "days\n")
cat(" Min harvest interval:", CONFIG$min_harvest_interval, "days\n\n")
# ============================================================================
# LOAD DATA
# ============================================================================
cat("=== LOADING CI DATA ===\n\n")
ci_rds_file <- here("laravel_app/storage/app", project_dir,
"Data/extracted_ci/cumulative_vals/All_pivots_Cumulative_CI_quadrant_year_v2.rds")
if (!file.exists(ci_rds_file)) {
stop("CI data file not found: ", ci_rds_file)
}
ci_data_raw <- readRDS(ci_rds_file) %>% ungroup()
time_series_daily <- ci_data_raw %>%
mutate(date = as.Date(Date)) %>%
select(field_id = field, date, mean_ci = FitData) %>%
filter(!is.na(mean_ci), !is.na(date), !is.na(field_id)) %>%
arrange(field_id, date)
cat("Loaded", nrow(time_series_daily), "daily observations\n")
cat("Fields:", length(unique(time_series_daily$field_id)), "\n")
cat("Date range:", format(min(time_series_daily$date), "%Y-%m-%d"), "to",
format(max(time_series_daily$date), "%Y-%m-%d"), "\n\n")
# Field summary
field_summary <- time_series_daily %>%
group_by(field_id) %>%
summarise(
n_obs = n(),
start_date = min(date),
end_date = max(date),
duration_days = as.numeric(end_date - start_date),
mean_ci = mean(mean_ci, na.rm = TRUE),
.groups = "drop"
) %>%
arrange(desc(n_obs))
cat("Field data summary:\n")
cat(" Fields with >", CONFIG$min_observations, "obs:",
sum(field_summary$n_obs >= CONFIG$min_observations), "/", nrow(field_summary), "\n")
cat(" Mean observations per field:", round(mean(field_summary$n_obs)), "\n")
cat(" Mean duration:", round(mean(field_summary$duration_days)), "days\n\n")
# Filter to fields with sufficient data
valid_fields <- field_summary %>%
filter(n_obs >= CONFIG$min_observations) %>%
pull(field_id)
cat("Processing", length(valid_fields), "fields with sufficient data\n\n")
# ============================================================================
# BFAST HARVEST DETECTION FUNCTION
# ============================================================================
detect_harvests_bfast <- function(field_data, field_id, config = CONFIG) {
# Prepare time series
field_ts <- field_data %>%
arrange(date) %>%
mutate(
# Apply smoothing to reduce noise
ci_smooth = if (config$smoothing_window > 1) {
rollmedian(mean_ci, k = config$smoothing_window, fill = NA, align = "center")
} else {
mean_ci
}
)
# Fill NA values from smoothing
field_ts$ci_smooth <- na.approx(field_ts$ci_smooth, rule = 2)
# Create regular daily time series
date_seq <- seq.Date(min(field_ts$date), max(field_ts$date), by = "1 day")
ts_regular <- data.frame(date = date_seq) %>%
left_join(field_ts %>% select(date, ci_smooth), by = "date")
# Interpolate missing days
ts_regular$ci_smooth <- na.approx(ts_regular$ci_smooth, rule = 2)
# Convert to ts object (yearly frequency)
start_year <- as.numeric(format(min(ts_regular$date), "%Y"))
start_doy <- as.numeric(format(min(ts_regular$date), "%j"))
ts_obj <- ts(ts_regular$ci_smooth,
start = c(start_year, start_doy),
frequency = 365)
# Run BFAST
bfast_result <- tryCatch({
bfast(ts_obj,
h = config$h,
season = config$season,
max.iter = config$max_iter,
breaks = config$breaks)
}, error = function(e) {
warning("BFAST failed for field ", field_id, ": ", e$message)
return(NULL)
})
if (is.null(bfast_result)) {
return(list(
success = FALSE,
field_id = field_id,
harvests = tibble()
))
}
# Extract breakpoints from trend component
bp_component <- bfast_result$output[[1]]$bp.Vt
if (is.null(bp_component) || length(bp_component$breakpoints) == 0) {
# No breaks detected
return(list(
success = TRUE,
field_id = field_id,
n_breaks = 0,
harvests = tibble(),
bfast_result = bfast_result
))
}
# Get breakpoint indices (remove NAs)
bp_indices <- bp_component$breakpoints
bp_indices <- bp_indices[!is.na(bp_indices)]
if (length(bp_indices) == 0) {
return(list(
success = TRUE,
field_id = field_id,
n_breaks = 0,
harvests = tibble(),
bfast_result = bfast_result
))
}
# Convert indices to dates
bp_dates <- ts_regular$date[bp_indices]
# Get CI values before and after breaks
ci_before <- ts_regular$ci_smooth[pmax(1, bp_indices - 7)] # 7 days before
ci_after <- ts_regular$ci_smooth[pmin(nrow(ts_regular), bp_indices + 7)] # 7 days after
ci_change <- ci_after - ci_before
# Get magnitude from BFAST
magnitudes <- if (!is.null(bp_component$magnitude)) {
abs(bp_component$magnitude)
} else {
rep(NA, length(bp_indices))
}
# Create breaks dataframe
breaks_df <- tibble(
break_date = bp_dates,
break_index = bp_indices,
ci_before = ci_before,
ci_after = ci_after,
ci_change = ci_change,
magnitude = magnitudes
) %>%
arrange(break_date)
# Filter for harvest-like breaks (downward, significant)
harvest_breaks <- breaks_df %>%
filter(
ci_change < config$ci_drop_threshold, # CI dropped
(is.na(magnitude) | magnitude > config$magnitude_threshold) # Significant break
)
# Remove breaks that are too close together (keep first in cluster)
if (nrow(harvest_breaks) > 1) {
harvest_breaks <- harvest_breaks %>%
mutate(
days_since_prev = c(Inf, diff(break_date)),
keep = days_since_prev >= config$min_harvest_interval
) %>%
filter(keep) %>%
select(-days_since_prev, -keep)
}
# Format as harvest detections
harvests <- harvest_breaks %>%
mutate(
field_id = field_id,
harvest_date = break_date,
harvest_week = isoweek(harvest_date),
harvest_year = isoyear(harvest_date),
ci_at_harvest = ci_after,
detection_method = "bfast_retrospective"
) %>%
select(field_id, harvest_date, harvest_week, harvest_year,
ci_at_harvest, ci_change, magnitude, detection_method)
return(list(
success = TRUE,
field_id = field_id,
n_breaks_total = nrow(breaks_df),
n_breaks_harvest = nrow(harvests),
harvests = harvests,
all_breaks = breaks_df,
bfast_result = bfast_result,
ts_data = ts_regular
))
}
# ============================================================================
# PROCESS ALL FIELDS
# ============================================================================
cat("============================================================================\n")
cat("PROCESSING FIELDS\n")
cat("============================================================================\n\n")
all_results <- list()
all_harvests <- tibble()
pb <- txtProgressBar(min = 0, max = length(valid_fields), style = 3)
for (i in seq_along(valid_fields)) {
field_id <- valid_fields[i]
field_data <- time_series_daily %>%
filter(field_id == !!field_id)
result <- detect_harvests_bfast(field_data, field_id, CONFIG)
all_results[[field_id]] <- result
if (result$success && nrow(result$harvests) > 0) {
all_harvests <- bind_rows(all_harvests, result$harvests)
}
setTxtProgressBar(pb, i)
}
close(pb)
cat("\n\n")
# ============================================================================
# SUMMARY STATISTICS
# ============================================================================
cat("============================================================================\n")
cat("DETECTION SUMMARY\n")
cat("============================================================================\n\n")
successful <- sum(sapply(all_results, function(x) x$success))
failed <- length(all_results) - successful
cat("Fields processed:", length(all_results), "\n")
cat(" Successful:", successful, "\n")
cat(" Failed:", failed, "\n\n")
total_breaks <- sum(sapply(all_results, function(x) ifelse(x$success, x$n_breaks_total, 0)))
total_harvests <- sum(sapply(all_results, function(x) ifelse(x$success, x$n_breaks_harvest, 0)))
cat("Breakpoints detected:\n")
cat(" Total breaks:", total_breaks, "\n")
cat(" Classified as harvests:", total_harvests, "\n")
cat(" Filtered out:", total_breaks - total_harvests, "\n\n")
cat("Harvest detections:\n")
cat(" Total harvest events:", nrow(all_harvests), "\n")
cat(" Fields with harvests:", length(unique(all_harvests$field_id)), "\n")
cat(" Mean harvests per field:", round(nrow(all_harvests) / length(unique(all_harvests$field_id)), 2), "\n\n")
if (nrow(all_harvests) > 0) {
harvest_summary <- all_harvests %>%
group_by(field_id) %>%
summarise(
n_harvests = n(),
first_harvest = min(harvest_date),
last_harvest = max(harvest_date),
mean_ci_at_harvest = mean(ci_at_harvest, na.rm = TRUE),
.groups = "drop"
)
cat("Distribution of harvests per field:\n")
harvest_counts <- table(harvest_summary$n_harvests)
for (nh in names(harvest_counts)) {
cat(" ", nh, "harvest(s):", harvest_counts[nh], "fields\n")
}
cat("\n")
cat("CI values at harvest:\n")
cat(" Mean:", round(mean(all_harvests$ci_at_harvest, na.rm = TRUE), 2), "\n")
cat(" Median:", round(median(all_harvests$ci_at_harvest, na.rm = TRUE), 2), "\n")
cat(" Range:", round(min(all_harvests$ci_at_harvest, na.rm = TRUE), 2), "-",
round(max(all_harvests$ci_at_harvest, na.rm = TRUE), 2), "\n\n")
}
# ============================================================================
# VALIDATION (OPTIONAL - TESTING ONLY)
# ============================================================================
if (CONFIG$validate) {
cat("============================================================================\n")
cat("VALIDATION AGAINST HARVEST.XLSX (TESTING ONLY)\n")
cat("============================================================================\n\n")
harvest_file <- here("laravel_app/storage/app", project_dir, "Data/harvest.xlsx")
if (file.exists(harvest_file)) {
harvest_actual <- read_excel(harvest_file) %>%
mutate(
season_end = as.Date(season_end)
) %>%
filter(!is.na(season_end)) %>%
select(field_id = field, actual_harvest = season_end) %>%
mutate(
actual_week = isoweek(actual_harvest),
actual_year = isoyear(actual_harvest)
)
cat("Loaded", nrow(harvest_actual), "actual harvest records\n\n")
# Match detected to actual
validation <- harvest_actual %>%
left_join(
all_harvests %>%
select(field_id, detected_harvest = harvest_date, detected_week = harvest_week,
detected_year = harvest_year, ci_at_harvest),
by = c("field_id", "actual_year" = "detected_year")
) %>%
mutate(
week_diff = detected_week - actual_week,
days_diff = as.numeric(detected_harvest - actual_harvest),
match_status = case_when(
is.na(detected_harvest) ~ "MISSED",
abs(week_diff) <= 2 ~ "MATCHED (±2w)",
abs(week_diff) <= 4 ~ "MATCHED (±4w)",
TRUE ~ paste0("MISMATCH (", ifelse(week_diff > 0, "+", ""), week_diff, "w)")
)
)
# Summary
cat("Validation results:\n")
match_summary <- table(validation$match_status)
for (status in names(match_summary)) {
cat(" ", status, ":", match_summary[status], "\n")
}
cat("\n")
matched_2w <- sum(validation$match_status == "MATCHED (±2w)", na.rm = TRUE)
matched_4w <- sum(validation$match_status == "MATCHED (±4w)", na.rm = TRUE)
missed <- sum(validation$match_status == "MISSED", na.rm = TRUE)
cat("Detection rate:", round(100 * (nrow(harvest_actual) - missed) / nrow(harvest_actual), 1), "%\n")
cat("Accuracy (±2 weeks):", round(100 * matched_2w / nrow(harvest_actual), 1), "%\n")
cat("Accuracy (±4 weeks):", round(100 * (matched_2w + matched_4w) / nrow(harvest_actual), 1), "%\n\n")
# Check for false positives
false_positives <- all_harvests %>%
anti_join(harvest_actual, by = c("field_id", "harvest_year" = "actual_year"))
cat("False positives:", nrow(false_positives),
"(detected harvests not in harvest.xlsx)\n\n")
# Show detailed comparison for fields with mismatches
mismatches <- validation %>%
filter(grepl("MISMATCH", match_status)) %>%
select(field_id, actual_harvest, detected_harvest, days_diff, match_status)
if (nrow(mismatches) > 0) {
cat("Mismatched detections (sample):\n")
print(head(mismatches, 20))
cat("\n")
}
} else {
cat("Validation file not found:", harvest_file, "\n")
cat("Skipping validation (not needed for operational use)\n\n")
}
}
# ============================================================================
# SAVE RESULTS
# ============================================================================
cat("============================================================================\n")
cat("SAVING RESULTS\n")
cat("============================================================================\n\n")
output_dir <- here("r_app/experiments/harvest_prediction")
# Save main results
output_file <- file.path(output_dir, "detected_harvests_bfast.rds")
saveRDS(list(
config = CONFIG,
detection_date = Sys.time(),
harvests = all_harvests,
field_summary = harvest_summary,
all_results = all_results
), output_file)
cat("✓ Saved harvest detections:", output_file, "\n")
# Save CSV for easy viewing
csv_file <- file.path(output_dir, "detected_harvests_bfast.csv")
write.csv(all_harvests, csv_file, row.names = FALSE)
cat("✓ Saved CSV:", csv_file, "\n\n")
# ============================================================================
# GENERATE DIAGNOSTIC PLOTS (SAMPLE)
# ============================================================================
if (CONFIG$save_plots && nrow(all_harvests) > 0) {
cat("============================================================================\n")
cat("GENERATING DIAGNOSTIC PLOTS\n")
cat("============================================================================\n\n")
# Get fields with harvests
fields_with_harvests <- unique(all_harvests$field_id)
plot_fields <- head(fields_with_harvests, CONFIG$max_plots)
cat("Creating plots for", length(plot_fields), "fields...\n")
for (field_id in plot_fields) {
result <- all_results[[field_id]]
if (result$success && nrow(result$harvests) > 0) {
# Create plot
p <- ggplot(result$ts_data, aes(x = date, y = ci_smooth)) +
geom_line(color = "darkgreen", linewidth = 0.8) +
geom_vline(data = result$harvests,
aes(xintercept = harvest_date),
color = "red", linetype = "dashed", linewidth = 1) +
labs(
title = paste0("Field ", field_id, " - BFAST Harvest Detection"),
subtitle = paste0(nrow(result$harvests), " harvest(s) detected"),
x = "Date",
y = "CI (smoothed)"
) +
theme_minimal() +
theme(plot.title = element_text(face = "bold"))
# Add labels for harvest dates
harvest_labels <- result$harvests %>%
mutate(label = format(harvest_date, "%Y-%m-%d"))
p <- p + geom_text(data = harvest_labels,
aes(x = harvest_date,
y = max(result$ts_data$ci_smooth, na.rm = TRUE),
label = label),
angle = 90, vjust = -0.5, size = 3, color = "red")
# Save plot
plot_file <- file.path(output_dir, paste0("harvest_detection_", field_id, ".png"))
ggsave(plot_file, p, width = 12, height = 6, dpi = 150)
}
}
cat("✓ Saved", length(plot_fields), "diagnostic plots\n\n")
}
# ============================================================================
# CALCULATE CURRENT FIELD AGE
# ============================================================================
cat("============================================================================\n")
cat("CALCULATING CURRENT FIELD AGE\n")
cat("============================================================================\n\n")
if (nrow(all_harvests) > 0) {
# Get most recent harvest for each field
latest_harvests <- all_harvests %>%
group_by(field_id) %>%
slice_max(harvest_date, n = 1, with_ties = FALSE) %>%
ungroup() %>%
mutate(
days_since_harvest = as.numeric(Sys.Date() - harvest_date),
months_since_harvest = days_since_harvest / 30.44,
age_category = case_when(
months_since_harvest < 3 ~ "Young (0-3 months)",
months_since_harvest < 9 ~ "Mature (3-9 months)",
months_since_harvest < 12 ~ "Pre-harvest (9-12 months)",
TRUE ~ "Overdue (12+ months)"
)
) %>%
select(field_id, last_harvest = harvest_date, days_since_harvest,
months_since_harvest, age_category)
cat("Field age summary:\n")
age_counts <- table(latest_harvests$age_category)
for (cat_name in names(age_counts)) {
cat(" ", cat_name, ":", age_counts[cat_name], "fields\n")
}
cat("\n")
# Save field age report
age_file <- file.path(output_dir, "field_age_report.csv")
write.csv(latest_harvests, age_file, row.names = FALSE)
cat("✓ Saved field age report:", age_file, "\n\n")
cat("Sample of current field ages:\n")
print(head(latest_harvests %>% arrange(desc(days_since_harvest)), 10))
cat("\n")
}
# ============================================================================
# COMPLETION
# ============================================================================
cat("============================================================================\n")
cat("RETROSPECTIVE HARVEST DETECTION COMPLETE\n")
cat("============================================================================\n\n")
cat("Summary:\n")
cat(" Fields processed:", length(all_results), "\n")
cat(" Harvests detected:", nrow(all_harvests), "\n")
cat(" Output files saved to:", output_dir, "\n\n")
cat("Next steps:\n")
cat(" 1. Review detected_harvests_bfast.csv for quality\n")
cat(" 2. Check diagnostic plots for sample fields\n")
cat(" 3. Use field_age_report.csv for operational monitoring\n")
cat(" 4. Integrate harvest dates into crop messaging/KPI workflows\n\n")
if (!CONFIG$validate) {
cat("Note: Run with --validate flag to compare against harvest.xlsx (testing only)\n\n")
}