# ============================================================================ # ANALYZE BFAST HARVEST DETECTION RESULTS # ============================================================================ # Diagnose why BFAST detection rate is low and visualize specific examples # ============================================================================ suppressPackageStartupMessages({ library(readxl) library(dplyr) library(tidyr) library(lubridate) library(here) library(bfast) library(zoo) library(ggplot2) }) # Set project directory project_dir <- "esa" assign("project_dir", project_dir, envir = .GlobalEnv) if (basename(getwd()) == "harvest_prediction") { setwd("../../..") } source(here("r_app", "parameters_project.R")) cat("============================================================================\n") cat("ANALYZING BFAST RESULTS\n") cat("============================================================================\n\n") # Load BFAST results results_file <- here("r_app/experiments/harvest_prediction/detected_harvests_bfast.rds") if (!file.exists(results_file)) { stop("BFAST results not found. Run detect_harvest_retrospective_bfast.R first.") } bfast_data <- readRDS(results_file) all_results <- bfast_data$all_results all_harvests <- bfast_data$harvests cat("Loaded BFAST results:\n") cat(" Total fields:", length(all_results), "\n") cat(" Harvests detected:", nrow(all_harvests), "\n\n") # Load actual harvest data for comparison harvest_file <- here("laravel_app/storage/app", project_dir, "Data/harvest.xlsx") harvest_actual <- read_excel(harvest_file) %>% mutate(season_end = as.Date(season_end)) %>% filter(!is.na(season_end)) cat("Actual harvest records:", nrow(harvest_actual), "\n\n") # ============================================================================ # ANALYZE MATCHED VS MISSED FIELDS # ============================================================================ cat("============================================================================\n") cat("PATTERN ANALYSIS: MATCHED VS MISSED\n") cat("============================================================================\n\n") # Get fields with successful matches (±4 weeks) matched_fields <- harvest_actual %>% inner_join( all_harvests %>% select(field_id, detected_harvest = harvest_date, detected_year = harvest_year), by = c("field" = "field_id") ) %>% mutate( week_diff = abs(isoweek(detected_harvest) - isoweek(season_end)), match_quality = case_when( week_diff <= 2 ~ "Good (±2w)", week_diff <= 4 ~ "Acceptable (±4w)", TRUE ~ "Poor" ) ) %>% filter(match_quality %in% c("Good (±2w)", "Acceptable (±4w)")) # Get fields that were completely missed missed_fields <- harvest_actual %>% anti_join(all_harvests, by = c("field" = "field_id", "season_end" = "harvest_date")) cat("Matched fields (±4w):", nrow(matched_fields), "\n") cat("Missed harvests:", nrow(harvest_actual) - nrow(matched_fields), "\n\n") # Sample fields for detailed visualization if (nrow(matched_fields) > 0) { sample_matched <- matched_fields %>% head(3) %>% select(field, season_end, detected_harvest = detected_harvest, week_diff) cat("Sample MATCHED detections:\n") print(sample_matched) cat("\n") } # Sample missed fields sample_missed <- harvest_actual %>% filter(!(paste(field, season_end) %in% paste(matched_fields$field, matched_fields$season_end))) %>% head(5) %>% select(field, season_end, season_start) cat("Sample MISSED harvests:\n") print(sample_missed) cat("\n") # ============================================================================ # VISUALIZE SPECIFIC EXAMPLES # ============================================================================ cat("============================================================================\n") cat("GENERATING DIAGNOSTIC VISUALIZATIONS\n") cat("============================================================================\n\n") # Load CI data ci_rds_file <- here("laravel_app/storage/app", project_dir, "Data/extracted_ci/cumulative_vals/All_pivots_Cumulative_CI_quadrant_year_v2.rds") ci_data_raw <- readRDS(ci_rds_file) %>% ungroup() time_series_daily <- ci_data_raw %>% mutate(date = as.Date(Date)) %>% select(field_id = field, date, mean_ci = FitData) %>% filter(!is.na(mean_ci), !is.na(date), !is.na(field_id)) %>% arrange(field_id, date) output_dir <- here("r_app/experiments/harvest_prediction") # Function to create detailed diagnostic plot create_diagnostic_plot <- function(field_id, harvest_dates, result, title_suffix = "") { if (is.null(result$ts_data)) { cat("No time series data for field:", field_id, "\n") return(NULL) } ts_data <- result$ts_data # Create plot p <- ggplot(ts_data, aes(x = date, y = ci_smooth)) + geom_line(color = "darkgreen", linewidth = 0.8, alpha = 0.7) + theme_minimal() + theme( plot.title = element_text(face = "bold", size = 14), plot.subtitle = element_text(size = 10), axis.text = element_text(size = 9), legend.position = "bottom" ) # Add actual harvest dates (from harvest.xlsx) if (!is.null(harvest_dates) && nrow(harvest_dates) > 0) { p <- p + geom_vline(data = harvest_dates, aes(xintercept = season_end), color = "red", linetype = "dashed", linewidth = 1.2, alpha = 0.8) + geom_text(data = harvest_dates, aes(x = season_end, y = max(ts_data$ci_smooth, na.rm = TRUE), label = format(season_end, "%Y-%m-%d")), angle = 90, vjust = -0.5, hjust = 1, size = 3, color = "red", fontface = "bold") } # Add detected breaks (from BFAST) if (!is.null(result$all_breaks) && nrow(result$all_breaks) > 0) { p <- p + geom_vline(data = result$all_breaks, aes(xintercept = break_date), color = "blue", linetype = "dotted", linewidth = 1, alpha = 0.6) + geom_text(data = result$all_breaks, aes(x = break_date, y = min(ts_data$ci_smooth, na.rm = TRUE), label = format(break_date, "%Y-%m-%d")), angle = 90, vjust = 1.2, hjust = 0, size = 2.5, color = "blue") } # Add detected harvests (filtered breaks) if (!is.null(result$harvests) && nrow(result$harvests) > 0) { p <- p + geom_vline(data = result$harvests, aes(xintercept = harvest_date), color = "darkblue", linetype = "solid", linewidth = 1.5, alpha = 0.9) } # Labels and title breaks_info <- if (!is.null(result$all_breaks)) nrow(result$all_breaks) else 0 harvests_info <- if (!is.null(result$harvests)) nrow(result$harvests) else 0 actual_info <- if (!is.null(harvest_dates)) nrow(harvest_dates) else 0 p <- p + labs( title = paste0("Field ", field_id, " - BFAST Analysis ", title_suffix), subtitle = paste0( "Red dashed = Actual harvest (", actual_info, ") | ", "Blue dotted = All breaks (", breaks_info, ") | ", "Dark blue solid = Detected harvests (", harvests_info, ")" ), x = "Date", y = "CI (7-day smoothed)", caption = "Actual harvests from harvest.xlsx | BFAST-detected breaks shown" ) return(p) } # ============================================================================ # EXAMPLE 1: MATCHED FIELD (if any) # ============================================================================ if (nrow(matched_fields) > 0) { cat("Creating plot for MATCHED field...\n") matched_field <- matched_fields$field[1] matched_harvests <- harvest_actual %>% filter(field == matched_field) result <- all_results[[matched_field]] if (!is.null(result)) { p1 <- create_diagnostic_plot(matched_field, matched_harvests, result, "(MATCHED)") if (!is.null(p1)) { ggsave( file.path(output_dir, "bfast_example_MATCHED.png"), p1, width = 14, height = 7, dpi = 300 ) cat("✓ Saved: bfast_example_MATCHED.png\n") } } } # ============================================================================ # EXAMPLE 2: MISSED FIELD # ============================================================================ cat("\nCreating plot for MISSED field...\n") missed_field <- sample_missed$field[1] missed_harvests <- harvest_actual %>% filter(field == missed_field) result_missed <- all_results[[missed_field]] if (!is.null(result_missed)) { p2 <- create_diagnostic_plot(missed_field, missed_harvests, result_missed, "(MISSED)") if (!is.null(p2)) { ggsave( file.path(output_dir, "bfast_example_MISSED.png"), p2, width = 14, height = 7, dpi = 300 ) cat("✓ Saved: bfast_example_MISSED.png\n") } } # ============================================================================ # EXAMPLE 3: FIELD WITH MISMATCHES # ============================================================================ # Find a field with both actual and detected harvests but poor timing mismatch_candidates <- harvest_actual %>% inner_join( all_harvests %>% select(field_id, detected_harvest = harvest_date), by = c("field" = "field_id") ) %>% mutate( days_diff = abs(as.numeric(detected_harvest - season_end)), week_diff = days_diff / 7 ) %>% filter(week_diff > 5) %>% # Significant mismatch arrange(desc(week_diff)) if (nrow(mismatch_candidates) > 0) { cat("\nCreating plot for MISMATCHED field...\n") mismatch_field <- mismatch_candidates$field[1] mismatch_harvests <- harvest_actual %>% filter(field == mismatch_field) result_mismatch <- all_results[[mismatch_field]] if (!is.null(result_mismatch)) { p3 <- create_diagnostic_plot( mismatch_field, mismatch_harvests, result_mismatch, paste0("(MISMATCH: ", round(mismatch_candidates$week_diff[1], 1), " weeks off)") ) if (!is.null(p3)) { ggsave( file.path(output_dir, "bfast_example_MISMATCH.png"), p3, width = 14, height = 7, dpi = 300 ) cat("✓ Saved: bfast_example_MISMATCH.png\n") } } } # ============================================================================ # ANALYZE WHY BFAST IS STRUGGLING # ============================================================================ cat("\n============================================================================\n") cat("DIAGNOSTIC ANALYSIS: WHY LOW DETECTION RATE?\n") cat("============================================================================\n\n") # 1. Check data availability around harvest dates cat("1. DATA AVAILABILITY ANALYSIS\n") cat("Checking if CI data exists around actual harvest dates...\n\n") harvest_data_check <- harvest_actual %>% head(20) %>% rowwise() %>% mutate( ci_at_harvest = { field_ci <- time_series_daily %>% filter(field_id == field, date >= season_end - 14, date <= season_end + 14) if (nrow(field_ci) > 0) { paste0(nrow(field_ci), " obs, CI range: ", round(min(field_ci$mean_ci, na.rm = TRUE), 2), "-", round(max(field_ci$mean_ci, na.rm = TRUE), 2)) } else { "NO DATA" } } ) %>% select(field, season_end, ci_at_harvest) print(harvest_data_check) # 2. Check break detection statistics cat("\n\n2. BREAK DETECTION STATISTICS\n") break_stats <- data.frame( total_fields = length(all_results), fields_with_breaks = sum(sapply(all_results, function(x) !is.null(x$all_breaks) && nrow(x$all_breaks) > 0)), fields_with_harvest_classified = sum(sapply(all_results, function(x) !is.null(x$harvests) && nrow(x$harvests) > 0)), total_breaks = sum(sapply(all_results, function(x) ifelse(!is.null(x$all_breaks), nrow(x$all_breaks), 0))), total_harvest_breaks = sum(sapply(all_results, function(x) ifelse(!is.null(x$harvests), nrow(x$harvests), 0))) ) print(break_stats) cat("\n\n3. CI DROP CHARACTERISTICS AT ACTUAL HARVEST\n") cat("Analyzing CI behavior at known harvest dates...\n\n") # Analyze CI patterns at actual harvests harvest_ci_patterns <- harvest_actual %>% head(50) %>% # Sample for speed rowwise() %>% mutate( ci_change = { field_ci <- time_series_daily %>% filter(field_id == field) %>% arrange(date) if (nrow(field_ci) > 0) { # Find closest dates before and after harvest before_harvest <- field_ci %>% filter(date <= season_end) %>% tail(5) after_harvest <- field_ci %>% filter(date > season_end) %>% head(5) if (nrow(before_harvest) > 0 && nrow(after_harvest) > 0) { ci_before <- mean(before_harvest$mean_ci, na.rm = TRUE) ci_after <- mean(after_harvest$mean_ci, na.rm = TRUE) round(ci_after - ci_before, 2) } else { NA_real_ } } else { NA_real_ } } ) %>% filter(!is.na(ci_change)) if (nrow(harvest_ci_patterns) > 0) { cat("CI change at harvest (sample of", nrow(harvest_ci_patterns), "events):\n") cat(" Mean CI change:", round(mean(harvest_ci_patterns$ci_change, na.rm = TRUE), 2), "\n") cat(" Median CI change:", round(median(harvest_ci_patterns$ci_change, na.rm = TRUE), 2), "\n") cat(" Min CI change:", round(min(harvest_ci_patterns$ci_change, na.rm = TRUE), 2), "\n") cat(" Max CI change:", round(max(harvest_ci_patterns$ci_change, na.rm = TRUE), 2), "\n") cat(" # with CI drop < -0.5:", sum(harvest_ci_patterns$ci_change < -0.5, na.rm = TRUE), "\n") cat(" # with CI increase:", sum(harvest_ci_patterns$ci_change > 0, na.rm = TRUE), "\n") } # ============================================================================ # RECOMMENDATIONS # ============================================================================ cat("\n\n============================================================================\n") cat("RECOMMENDATIONS FOR IMPROVEMENT\n") cat("============================================================================\n\n") cat("Based on the analysis:\n\n") cat("1. DETECTION RATE: ", round(100 * nrow(matched_fields) / nrow(harvest_actual), 1), "%\n") if (nrow(matched_fields) / nrow(harvest_actual) < 0.20) { cat(" → VERY LOW - BFAST may not be suitable for this data\n\n") } else if (nrow(matched_fields) / nrow(harvest_actual) < 0.50) { cat(" → LOW - Parameter tuning may help\n\n") } cat("2. POSSIBLE ISSUES:\n") cat(" - Harvest signal may not cause abrupt CI drops\n") cat(" - Gradual harvest over weeks (not single-day event)\n") cat(" - Regrowth happens quickly (obscures harvest signal)\n") cat(" - BFAST expects abrupt structural breaks\n\n") cat("3. ALTERNATIVE APPROACHES TO CONSIDER:\n") cat(" - Rolling minimum detection (find sustained low CI periods)\n") cat(" - Change point detection with smoother transitions\n") cat(" - Threshold-based approach (CI < 2.5 for 2+ weeks)\n") cat(" - Combine with SAR data for better harvest detection\n") cat(" - Use crop age + CI trajectory modeling\n\n") cat("4. BFAST PARAMETER TUNING (if continuing):\n") cat(" - Try different h values (currently 0.15)\n") cat(" - Test 'none' for season (remove seasonal model)\n") cat(" - Adjust ci_drop_threshold (currently -0.5)\n") cat(" - Relax magnitude_threshold (currently 0.3)\n\n") cat("============================================================================\n") cat("ANALYSIS COMPLETE\n") cat("============================================================================\n\n") cat("Review generated plots:\n") cat(" - bfast_example_MATCHED.png (if available)\n") cat(" - bfast_example_MISSED.png\n") cat(" - bfast_example_MISMATCH.png (if available)\n\n")