# ============================================================================
# RETROSPECTIVE HARVEST DETECTION USING BFAST
# ============================================================================
# Purpose: Detect ALL historical harvest dates across complete CI time series
#
# Approach:
#   - Use full BFAST algorithm on complete time series (not real-time monitoring)
#   - Detect structural breaks = harvest events
#   - No need for immediate detection - can wait weeks for confirmation
#   - Output: Historical harvest database for field age calculation
#
# Key difference from real-time approaches:
#   - THEN: "Did this field harvest yesterday?" (hard, incomplete data)
#   - NOW:  "When did this field harvest in the past?" (easier, full data)
#
# Usage: Rscript detect_harvest_retrospective_bfast.R [--validate]
#   --validate: Optional flag to compare against harvest.xlsx (testing only)
# ============================================================================

suppressPackageStartupMessages({
  library(readxl)
  library(dplyr)
  library(tidyr)
  library(lubridate)
  library(here)
  library(bfast)
  library(zoo)
  library(ggplot2)
})

# ============================================================================
# CONFIGURATION
# ============================================================================

CONFIG <- list(
  # DATA SOURCE
  project_dir = "esa",          # ESA WorldCover data (Planet imagery)

  # BFAST PARAMETERS (tuned for sugarcane harvest detection)
  h = 0.15,                     # Minimum segment size (15% of data = ~2 harvests/year possible)
  season = "harmonic",          # Model seasonal growth patterns
  max_iter = 10,                # Iterative refinement
  breaks = NULL,                # Auto-detect number of breaks

  # PREPROCESSING
  smoothing_window = 7,         # Days for rolling median (reduce noise)
  min_observations = 200,       # Minimum data points needed per field

  # HARVEST CLASSIFICATION (post-processing breakpoints)
  min_harvest_interval = 180,   # Days between harvests (6 months minimum)
  ci_drop_threshold = -0.5,     # Minimum CI drop to consider as harvest
  magnitude_threshold = 0.3,    # Minimum break magnitude

  # OUTPUT
  save_plots = TRUE,
  max_plots = 10,               # Limit number of diagnostic plots saved

  # VALIDATION (optional - only for testing)
  # FIX: default is FALSE so that the --validate command line flag (and the
  # closing "run with --validate" hint) actually have an effect, matching
  # the usage notes in the file header.
  validate = FALSE
)

# Process command line arguments
args <- commandArgs(trailingOnly = TRUE)
if ("--validate" %in% args) {
  CONFIG$validate <- TRUE
  cat("Validation mode enabled - will compare against harvest.xlsx\n\n")
}

# Set project directory globally (parameters_project.R presumably reads it --
# NOTE(review): confirm against that script; the assign() is redundant at
# top level but kept for safety when this file is source()d from a function)
project_dir <- CONFIG$project_dir
assign("project_dir", project_dir, envir = .GlobalEnv)

# Navigate to project root if launched from the experiments folder
if (basename(getwd()) == "harvest_prediction") {
  setwd("../../..")
}

source(here("r_app", "parameters_project.R"))

cat("============================================================================\n")
cat("RETROSPECTIVE HARVEST DETECTION USING BFAST\n")
cat("============================================================================\n\n")
cat("Configuration:\n")
cat("  Data source:", CONFIG$project_dir, "\n")
cat("  BFAST h parameter:", CONFIG$h, "\n")
cat("  Seasonal model:", CONFIG$season, "\n")
cat("  Smoothing window:", CONFIG$smoothing_window, "days\n")
cat("  Min harvest interval:", CONFIG$min_harvest_interval, "days\n\n")

# ============================================================================
# LOAD DATA
# ============================================================================

cat("=== LOADING CI DATA ===\n\n")

ci_rds_file <- here("laravel_app/storage/app", project_dir,
                    "Data/extracted_ci/cumulative_vals/All_pivots_Cumulative_CI_quadrant_year_v2.rds")

if (!file.exists(ci_rds_file)) {
  stop("CI data file not found: ", ci_rds_file)
}

ci_data_raw <- readRDS(ci_rds_file) %>% ungroup()

# One row per field per day: field id, date, fitted CI value
time_series_daily <- ci_data_raw %>%
  mutate(date = as.Date(Date)) %>%
  select(field_id = field, date, mean_ci = FitData) %>%
  filter(!is.na(mean_ci), !is.na(date), !is.na(field_id)) %>%
  arrange(field_id, date)

cat("Loaded", nrow(time_series_daily), "daily observations\n")
cat("Fields:", length(unique(time_series_daily$field_id)), "\n")
cat("Date range:", format(min(time_series_daily$date), "%Y-%m-%d"), "to",
    format(max(time_series_daily$date), "%Y-%m-%d"), "\n\n")

# Field summary: observation counts and coverage per field
field_summary <- time_series_daily %>%
  group_by(field_id) %>%
  summarise(
    n_obs = n(),
    start_date = min(date),
    end_date = max(date),
    duration_days = as.numeric(end_date - start_date),
    mean_ci = mean(mean_ci, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  arrange(desc(n_obs))

cat("Field data summary:\n")
cat("  Fields with >", CONFIG$min_observations, "obs:",
    sum(field_summary$n_obs >= CONFIG$min_observations), "/", nrow(field_summary), "\n")
cat("  Mean observations per field:", round(mean(field_summary$n_obs)), "\n")
cat("  Mean duration:", round(mean(field_summary$duration_days)), "days\n\n")

# Filter to fields with sufficient data for BFAST to be meaningful
valid_fields <- field_summary %>%
  filter(n_obs >= CONFIG$min_observations) %>%
  pull(field_id)

cat("Processing", length(valid_fields), "fields with sufficient data\n\n")

# ============================================================================
# BFAST HARVEST DETECTION FUNCTION
# ============================================================================

# Detect historical harvest events for a single field using BFAST breakpoints.
#
# field_data: data frame with columns `date` (Date) and `mean_ci` (numeric)
#             for one field.
# field_id:   identifier used in warnings and carried into the result rows.
# config:     list of tuning parameters (see CONFIG at the top of the file).
#
# Returns a list that always contains `success`, `field_id` and a `harvests`
# tibble; on successful BFAST runs it also carries the raw breakpoints table
# (`all_breaks`), the fitted bfast object and the regularised daily series
# (`ts_data`) used later for plotting.
detect_harvests_bfast <- function(field_data, field_id, config = CONFIG) {
  # --- 1. Smooth the raw CI series to suppress day-to-day noise -------------
  field_ts <- field_data %>%
    arrange(date) %>%
    mutate(
      ci_smooth = if (config$smoothing_window > 1) {
        rollmedian(mean_ci, k = config$smoothing_window, fill = NA, align = "center")
      } else {
        mean_ci
      }
    )

  # Fill the NAs the centered rolling median leaves at both edges
  field_ts$ci_smooth <- na.approx(field_ts$ci_smooth, rule = 2)

  # --- 2. Regularise to a strict daily grid (BFAST needs a regular ts) ------
  date_seq <- seq.Date(min(field_ts$date), max(field_ts$date), by = "1 day")
  ts_regular <- data.frame(date = date_seq) %>%
    left_join(field_ts %>% select(date, ci_smooth), by = "date")
  ts_regular$ci_smooth <- na.approx(ts_regular$ci_smooth, rule = 2)

  # Convert to a ts object with yearly (365-day) frequency
  start_year <- as.numeric(format(min(ts_regular$date), "%Y"))
  start_doy <- as.numeric(format(min(ts_regular$date), "%j"))
  ts_obj <- ts(ts_regular$ci_smooth, start = c(start_year, start_doy), frequency = 365)

  # --- 3. Run BFAST (can fail on short or degenerate series) ----------------
  bfast_result <- tryCatch({
    bfast(ts_obj,
          h = config$h,
          season = config$season,
          max.iter = config$max_iter,
          breaks = config$breaks)
  }, error = function(e) {
    warning("BFAST failed for field ", field_id, ": ", e$message)
    NULL
  })

  if (is.null(bfast_result)) {
    return(list(success = FALSE, field_id = field_id, harvests = tibble()))
  }

  # --- 4. Extract breakpoints from the trend component ----------------------
  bp_component <- bfast_result$output[[1]]$bp.Vt

  # FIX: the zero-break returns now also carry n_breaks_total /
  # n_breaks_harvest; previously they only had `n_breaks`, which made the
  # downstream summary (`ifelse(x$success, x$n_breaks_total, 0)`) fail on
  # NULL for any field where BFAST found no breaks.
  no_breaks_result <- list(
    success = TRUE,
    field_id = field_id,
    n_breaks = 0,
    n_breaks_total = 0,
    n_breaks_harvest = 0,
    harvests = tibble(),
    bfast_result = bfast_result
  )

  if (is.null(bp_component) || length(bp_component$breakpoints) == 0) {
    return(no_breaks_result)
  }

  bp_indices <- bp_component$breakpoints
  bp_indices <- bp_indices[!is.na(bp_indices)]
  if (length(bp_indices) == 0) {
    return(no_breaks_result)
  }

  # Convert breakpoint indices to calendar dates
  bp_dates <- ts_regular$date[bp_indices]

  # CI level shortly before/after each break (7-day offset, clamped to range)
  ci_before <- ts_regular$ci_smooth[pmax(1, bp_indices - 7)]
  ci_after <- ts_regular$ci_smooth[pmin(nrow(ts_regular), bp_indices + 7)]
  ci_change <- ci_after - ci_before

  # Break magnitude as reported by BFAST (may be absent)
  magnitudes <- if (!is.null(bp_component$magnitude)) {
    abs(bp_component$magnitude)
  } else {
    rep(NA, length(bp_indices))
  }

  breaks_df <- tibble(
    break_date = bp_dates,
    break_index = bp_indices,
    ci_before = ci_before,
    ci_after = ci_after,
    ci_change = ci_change,
    magnitude = magnitudes
  ) %>%
    arrange(break_date)

  # --- 5. Keep only harvest-like breaks (significant downward drop) ---------
  harvest_breaks <- breaks_df %>%
    filter(
      ci_change < config$ci_drop_threshold,            # CI dropped
      (is.na(magnitude) | magnitude > config$magnitude_threshold)  # significant
    )

  # Remove breaks that are too close together, keeping the first of each
  # cluster. FIX: the gap is measured against the last *kept* break rather
  # than the immediately preceding row -- the old vectorised
  # `diff(break_date)` filter compared each break to its raw predecessor,
  # so once one break in a chain was rejected, the following breaks were
  # judged against the wrong reference date.
  if (nrow(harvest_breaks) > 1) {
    keep <- rep(TRUE, nrow(harvest_breaks))
    last_kept <- harvest_breaks$break_date[1]
    for (j in 2:nrow(harvest_breaks)) {
      gap_days <- as.numeric(harvest_breaks$break_date[j] - last_kept)
      if (gap_days < config$min_harvest_interval) {
        keep[j] <- FALSE
      } else {
        last_kept <- harvest_breaks$break_date[j]
      }
    }
    harvest_breaks <- harvest_breaks[keep, ]
  }

  # --- 6. Format surviving breaks as harvest detections ---------------------
  harvests <- harvest_breaks %>%
    mutate(
      field_id = field_id,
      harvest_date = break_date,
      harvest_week = isoweek(harvest_date),
      harvest_year = isoyear(harvest_date),
      ci_at_harvest = ci_after,
      detection_method = "bfast_retrospective"
    ) %>%
    select(field_id, harvest_date, harvest_week, harvest_year,
           ci_at_harvest, ci_change, magnitude, detection_method)

  list(
    success = TRUE,
    field_id = field_id,
    n_breaks_total = nrow(breaks_df),
    n_breaks_harvest = nrow(harvests),
    harvests = harvests,
    all_breaks = breaks_df,
    bfast_result = bfast_result,
    ts_data = ts_regular
  )
}

# ============================================================================
# PROCESS ALL FIELDS
# ============================================================================

cat("============================================================================\n")
cat("PROCESSING FIELDS\n")
cat("============================================================================\n\n")

all_results <- list()
# FIX: collect per-field harvest tibbles in a preallocated list and bind once
# after the loop instead of bind_rows() on every iteration (O(n^2) growth).
harvest_list <- vector("list", length(valid_fields))

pb <- txtProgressBar(min = 0, max = length(valid_fields), style = 3)
for (i in seq_along(valid_fields)) {
  field_id <- valid_fields[i]
  field_data <- time_series_daily %>% filter(field_id == !!field_id)

  result <- detect_harvests_bfast(field_data, field_id, CONFIG)
  # NOTE(review): assumes field_id is a character key; if it were numeric,
  # [[ ]] would index by position and leave gaps -- confirm upstream.
  all_results[[field_id]] <- result

  if (result$success && nrow(result$harvests) > 0) {
    harvest_list[[i]] <- result$harvests
  }

  setTxtProgressBar(pb, i)
}
close(pb)

# NULL entries are dropped by bind_rows(); empty input yields an empty tibble
all_harvests <- bind_rows(harvest_list)

cat("\n\n")
# ============================================================================
# SUMMARY STATISTICS
# ============================================================================

cat("============================================================================\n")
cat("DETECTION SUMMARY\n")
cat("============================================================================\n\n")

# vapply() instead of sapply(): type-stable even when all_results is empty
successful <- sum(vapply(all_results, function(x) isTRUE(x$success), logical(1)))
failed <- length(all_results) - successful

cat("Fields processed:", length(all_results), "\n")
cat("  Successful:", successful, "\n")
cat("  Failed:", failed, "\n\n")

# FIX: safe counter extraction. Successful results with zero breakpoints may
# not carry the n_breaks_* fields (NULL), and `ifelse(TRUE, NULL, 0)` errors;
# treat a missing field as 0 instead of crashing the summary.
count_field <- function(x, field) {
  if (isTRUE(x$success) && !is.null(x[[field]])) as.numeric(x[[field]]) else 0
}
total_breaks <- sum(vapply(all_results, count_field, numeric(1), field = "n_breaks_total"))
total_harvests <- sum(vapply(all_results, count_field, numeric(1), field = "n_breaks_harvest"))

cat("Breakpoints detected:\n")
cat("  Total breaks:", total_breaks, "\n")
cat("  Classified as harvests:", total_harvests, "\n")
cat("  Filtered out:", total_breaks - total_harvests, "\n\n")

n_fields_with_harvests <- length(unique(all_harvests$field_id))

cat("Harvest detections:\n")
cat("  Total harvest events:", nrow(all_harvests), "\n")
cat("  Fields with harvests:", n_fields_with_harvests, "\n")
# FIX: guard against 0/0 = NaN when nothing was detected
cat("  Mean harvests per field:",
    if (n_fields_with_harvests > 0) {
      round(nrow(all_harvests) / n_fields_with_harvests, 2)
    } else {
      0
    },
    "\n\n")

if (nrow(all_harvests) > 0) {
  # Per-field roll-up used again later when saving results
  harvest_summary <- all_harvests %>%
    group_by(field_id) %>%
    summarise(
      n_harvests = n(),
      first_harvest = min(harvest_date),
      last_harvest = max(harvest_date),
      mean_ci_at_harvest = mean(ci_at_harvest, na.rm = TRUE),
      .groups = "drop"
    )

  cat("Distribution of harvests per field:\n")
  harvest_counts <- table(harvest_summary$n_harvests)
  for (nh in names(harvest_counts)) {
    cat("  ", nh, "harvest(s):", harvest_counts[nh], "fields\n")
  }
  cat("\n")

  cat("CI values at harvest:\n")
  cat("  Mean:", round(mean(all_harvests$ci_at_harvest, na.rm = TRUE), 2), "\n")
  cat("  Median:", round(median(all_harvests$ci_at_harvest, na.rm = TRUE), 2), "\n")
  cat("  Range:", round(min(all_harvests$ci_at_harvest, na.rm = TRUE), 2), "-",
      round(max(all_harvests$ci_at_harvest, na.rm = TRUE), 2), "\n\n")
}
# ============================================================================
# VALIDATION (OPTIONAL - TESTING ONLY)
# ============================================================================

if (CONFIG$validate) {
  cat("============================================================================\n")
  cat("VALIDATION AGAINST HARVEST.XLSX (TESTING ONLY)\n")
  cat("============================================================================\n\n")

  harvest_file <- here("laravel_app/storage/app", project_dir, "Data/harvest.xlsx")

  if (file.exists(harvest_file)) {
    # Ground-truth harvest dates, one row per field/season
    harvest_actual <- read_excel(harvest_file) %>%
      mutate(season_end = as.Date(season_end)) %>%
      filter(!is.na(season_end)) %>%
      select(field_id = field, actual_harvest = season_end) %>%
      mutate(
        actual_week = isoweek(actual_harvest),
        actual_year = isoyear(actual_harvest)
      )

    cat("Loaded", nrow(harvest_actual), "actual harvest records\n\n")

    # Match detected harvests to actual ones by field and ISO year.
    # NOTE(review): if a field has more than one detected harvest in the
    # same ISO year this left_join fans out to multiple rows per actual
    # record, inflating the match counts -- confirm whether that is intended.
    validation <- harvest_actual %>%
      left_join(
        all_harvests %>%
          select(field_id,
                 detected_harvest = harvest_date,
                 detected_week = harvest_week,
                 detected_year = harvest_year,
                 ci_at_harvest),
        by = c("field_id", "actual_year" = "detected_year")
      ) %>%
      mutate(
        week_diff = detected_week - actual_week,
        days_diff = as.numeric(detected_harvest - actual_harvest),
        match_status = case_when(
          is.na(detected_harvest) ~ "MISSED",
          abs(week_diff) <= 2 ~ "MATCHED (±2w)",
          abs(week_diff) <= 4 ~ "MATCHED (±4w)",
          TRUE ~ paste0("MISMATCH (", ifelse(week_diff > 0, "+", ""), week_diff, "w)")
        )
      )

    # Summary of match statuses
    cat("Validation results:\n")
    match_summary <- table(validation$match_status)
    for (status in names(match_summary)) {
      cat("  ", status, ":", match_summary[status], "\n")
    }
    cat("\n")

    matched_2w <- sum(validation$match_status == "MATCHED (±2w)", na.rm = TRUE)
    matched_4w <- sum(validation$match_status == "MATCHED (±4w)", na.rm = TRUE)
    missed <- sum(validation$match_status == "MISSED", na.rm = TRUE)

    cat("Detection rate:",
        round(100 * (nrow(harvest_actual) - missed) / nrow(harvest_actual), 1), "%\n")
    cat("Accuracy (±2 weeks):",
        round(100 * matched_2w / nrow(harvest_actual), 1), "%\n")
    cat("Accuracy (±4 weeks):",
        round(100 * (matched_2w + matched_4w) / nrow(harvest_actual), 1), "%\n\n")

    # Detected harvests with no matching ground-truth record in the same year
    false_positives <- all_harvests %>%
      anti_join(harvest_actual, by = c("field_id", "harvest_year" = "actual_year"))
    cat("False positives:", nrow(false_positives),
        "(detected harvests not in harvest.xlsx)\n\n")

    # Detailed comparison for mismatched detections
    mismatches <- validation %>%
      filter(grepl("MISMATCH", match_status)) %>%
      select(field_id, actual_harvest, detected_harvest, days_diff, match_status)

    if (nrow(mismatches) > 0) {
      cat("Mismatched detections (sample):\n")
      print(head(mismatches, 20))
      cat("\n")
    }
  } else {
    cat("Validation file not found:", harvest_file, "\n")
    cat("Skipping validation (not needed for operational use)\n\n")
  }
}

# ============================================================================
# SAVE RESULTS
# ============================================================================

cat("============================================================================\n")
cat("SAVING RESULTS\n")
cat("============================================================================\n\n")

output_dir <- here("r_app/experiments/harvest_prediction")
# FIX: make sure the output directory exists before writing into it
dir.create(output_dir, recursive = TRUE, showWarnings = FALSE)

# Save main results bundle (config + detections + raw per-field results)
output_file <- file.path(output_dir, "detected_harvests_bfast.rds")
saveRDS(list(
  config = CONFIG,
  detection_date = Sys.time(),
  harvests = all_harvests,
  # FIX: harvest_summary is only created when at least one harvest was
  # detected; referencing it unconditionally crashed the save on empty runs.
  field_summary = if (exists("harvest_summary")) harvest_summary else NULL,
  all_results = all_results
), output_file)

cat("✓ Saved harvest detections:", output_file, "\n")

# Save CSV for easy viewing
csv_file <- file.path(output_dir, "detected_harvests_bfast.csv")
write.csv(all_harvests, csv_file, row.names = FALSE)
cat("✓ Saved CSV:", csv_file, "\n\n")

# ============================================================================
# GENERATE DIAGNOSTIC PLOTS (SAMPLE)
# ============================================================================

if (CONFIG$save_plots && nrow(all_harvests) > 0) {
cat("============================================================================\n") cat("GENERATING DIAGNOSTIC PLOTS\n") cat("============================================================================\n\n") # Get fields with harvests fields_with_harvests <- unique(all_harvests$field_id) plot_fields <- head(fields_with_harvests, CONFIG$max_plots) cat("Creating plots for", length(plot_fields), "fields...\n") for (field_id in plot_fields) { result <- all_results[[field_id]] if (result$success && nrow(result$harvests) > 0) { # Create plot p <- ggplot(result$ts_data, aes(x = date, y = ci_smooth)) + geom_line(color = "darkgreen", linewidth = 0.8) + geom_vline(data = result$harvests, aes(xintercept = harvest_date), color = "red", linetype = "dashed", linewidth = 1) + labs( title = paste0("Field ", field_id, " - BFAST Harvest Detection"), subtitle = paste0(nrow(result$harvests), " harvest(s) detected"), x = "Date", y = "CI (smoothed)" ) + theme_minimal() + theme(plot.title = element_text(face = "bold")) # Add labels for harvest dates harvest_labels <- result$harvests %>% mutate(label = format(harvest_date, "%Y-%m-%d")) p <- p + geom_text(data = harvest_labels, aes(x = harvest_date, y = max(result$ts_data$ci_smooth, na.rm = TRUE), label = label), angle = 90, vjust = -0.5, size = 3, color = "red") # Save plot plot_file <- file.path(output_dir, paste0("harvest_detection_", field_id, ".png")) ggsave(plot_file, p, width = 12, height = 6, dpi = 150) } } cat("✓ Saved", length(plot_fields), "diagnostic plots\n\n") } # ============================================================================ # CALCULATE CURRENT FIELD AGE # ============================================================================ cat("============================================================================\n") cat("CALCULATING CURRENT FIELD AGE\n") cat("============================================================================\n\n") if (nrow(all_harvests) > 0) { # Get most recent harvest for each 
field latest_harvests <- all_harvests %>% group_by(field_id) %>% slice_max(harvest_date, n = 1, with_ties = FALSE) %>% ungroup() %>% mutate( days_since_harvest = as.numeric(Sys.Date() - harvest_date), months_since_harvest = days_since_harvest / 30.44, age_category = case_when( months_since_harvest < 3 ~ "Young (0-3 months)", months_since_harvest < 9 ~ "Mature (3-9 months)", months_since_harvest < 12 ~ "Pre-harvest (9-12 months)", TRUE ~ "Overdue (12+ months)" ) ) %>% select(field_id, last_harvest = harvest_date, days_since_harvest, months_since_harvest, age_category) cat("Field age summary:\n") age_counts <- table(latest_harvests$age_category) for (cat_name in names(age_counts)) { cat(" ", cat_name, ":", age_counts[cat_name], "fields\n") } cat("\n") # Save field age report age_file <- file.path(output_dir, "field_age_report.csv") write.csv(latest_harvests, age_file, row.names = FALSE) cat("✓ Saved field age report:", age_file, "\n\n") cat("Sample of current field ages:\n") print(head(latest_harvests %>% arrange(desc(days_since_harvest)), 10)) cat("\n") } # ============================================================================ # COMPLETION # ============================================================================ cat("============================================================================\n") cat("RETROSPECTIVE HARVEST DETECTION COMPLETE\n") cat("============================================================================\n\n") cat("Summary:\n") cat(" Fields processed:", length(all_results), "\n") cat(" Harvests detected:", nrow(all_harvests), "\n") cat(" Output files saved to:", output_dir, "\n\n") cat("Next steps:\n") cat(" 1. Review detected_harvests_bfast.csv for quality\n") cat(" 2. Check diagnostic plots for sample fields\n") cat(" 3. Use field_age_report.csv for operational monitoring\n") cat(" 4. 
Integrate harvest dates into crop messaging/KPI workflows\n\n") if (!CONFIG$validate) { cat("Note: Run with --validate flag to compare against harvest.xlsx (testing only)\n\n") }