SmartCane/r_app/experiments/harvest_prediction/analyze_bfast_results.R

# ============================================================================
# ANALYZE BFAST HARVEST DETECTION RESULTS
# ============================================================================
# Diagnose why BFAST detection rate is low and visualize specific examples
# ============================================================================

suppressPackageStartupMessages({
  library(readxl)
  library(dplyr)
  library(tidyr)
  library(lubridate)
  library(here)
  library(bfast)
  library(zoo)
  library(ggplot2)
})

# Set project directory
project_dir <- "esa"
assign("project_dir", project_dir, envir = .GlobalEnv)

if (basename(getwd()) == "harvest_prediction") {
  setwd("../../..")
}

source(here("r_app", "parameters_project.R"))

cat("============================================================================\n")
cat("ANALYZING BFAST RESULTS\n")
cat("============================================================================\n\n")

# Load BFAST results
results_file <- here("r_app/experiments/harvest_prediction/detected_harvests_bfast.rds")
if (!file.exists(results_file)) {
  stop("BFAST results not found. Run detect_harvest_retrospective_bfast.R first.")
}

bfast_data <- readRDS(results_file)
all_results <- bfast_data$all_results
all_harvests <- bfast_data$harvests

cat("Loaded BFAST results:\n")
cat("  Total fields:", length(all_results), "\n")
cat("  Harvests detected:", nrow(all_harvests), "\n\n")

# Load actual harvest data for comparison
harvest_file <- here("laravel_app/storage/app", project_dir, "Data/harvest.xlsx")
harvest_actual <- read_excel(harvest_file) %>%
  mutate(season_end = as.Date(season_end)) %>%
  filter(!is.na(season_end))

cat("Actual harvest records:", nrow(harvest_actual), "\n\n")

# ============================================================================
# ANALYZE MATCHED VS MISSED FIELDS
# ============================================================================

cat("============================================================================\n")
cat("PATTERN ANALYSIS: MATCHED VS MISSED\n")
cat("============================================================================\n\n")

# Get fields with successful matches (±4 weeks)
matched_fields <- harvest_actual %>%
  inner_join(
    all_harvests %>%
      select(field_id, detected_harvest = harvest_date, detected_year = harvest_year),
    by = c("field" = "field_id")
  ) %>%
  mutate(
    week_diff = abs(isoweek(detected_harvest) - isoweek(season_end)),
    match_quality = case_when(
      week_diff <= 2 ~ "Good (±2w)",
      week_diff <= 4 ~ "Acceptable (±4w)",
      TRUE ~ "Poor"
    )
  ) %>%
  filter(match_quality %in% c("Good (±2w)", "Acceptable (±4w)"))

# Get fields that were completely missed
missed_fields <- harvest_actual %>%
  anti_join(all_harvests, by = c("field" = "field_id", "season_end" = "harvest_date"))

cat("Matched fields (±4w):", nrow(matched_fields), "\n")
cat("Missed harvests:", nrow(harvest_actual) - nrow(matched_fields), "\n\n")

# Sample fields for detailed visualization
if (nrow(matched_fields) > 0) {
  sample_matched <- matched_fields %>%
    head(3) %>%
    select(field, season_end, detected_harvest = detected_harvest, week_diff)

  cat("Sample MATCHED detections:\n")
  print(sample_matched)
  cat("\n")
}

# Sample missed fields
sample_missed <- harvest_actual %>%
  filter(!(paste(field, season_end) %in% paste(matched_fields$field, matched_fields$season_end))) %>%
  head(5) %>%
  select(field, season_end, season_start)

cat("Sample MISSED harvests:\n")
print(sample_missed)
cat("\n")

# ============================================================================
# VISUALIZE SPECIFIC EXAMPLES
# ============================================================================

cat("============================================================================\n")
cat("GENERATING DIAGNOSTIC VISUALIZATIONS\n")
cat("============================================================================\n\n")

# Load CI data
ci_rds_file <- here("laravel_app/storage/app", project_dir,
                    "Data/extracted_ci/cumulative_vals/All_pivots_Cumulative_CI_quadrant_year_v2.rds")
ci_data_raw <- readRDS(ci_rds_file) %>% ungroup()

time_series_daily <- ci_data_raw %>%
  mutate(date = as.Date(Date)) %>%
  select(field_id = field, date, mean_ci = FitData) %>%
  filter(!is.na(mean_ci), !is.na(date), !is.na(field_id)) %>%
  arrange(field_id, date)

output_dir <- here("r_app/experiments/harvest_prediction")

# Function to create detailed diagnostic plot
create_diagnostic_plot <- function(field_id, harvest_dates, result, title_suffix = "") {

  if (is.null(result$ts_data)) {
    cat("No time series data for field:", field_id, "\n")
    return(NULL)
  }

  ts_data <- result$ts_data

  # Create plot
  p <- ggplot(ts_data, aes(x = date, y = ci_smooth)) +
    geom_line(color = "darkgreen", linewidth = 0.8, alpha = 0.7) +
    theme_minimal() +
    theme(
      plot.title = element_text(face = "bold", size = 14),
      plot.subtitle = element_text(size = 10),
      axis.text = element_text(size = 9),
      legend.position = "bottom"
    )

  # Add actual harvest dates (from harvest.xlsx)
  if (!is.null(harvest_dates) && nrow(harvest_dates) > 0) {
    p <- p +
      geom_vline(data = harvest_dates,
                 aes(xintercept = season_end),
                 color = "red", linetype = "dashed", linewidth = 1.2, alpha = 0.8) +
      geom_text(data = harvest_dates,
                aes(x = season_end, y = max(ts_data$ci_smooth, na.rm = TRUE),
                    label = format(season_end, "%Y-%m-%d")),
                angle = 90, vjust = -0.5, hjust = 1, size = 3, color = "red", fontface = "bold")
  }

  # Add detected breaks (from BFAST)
  if (!is.null(result$all_breaks) && nrow(result$all_breaks) > 0) {
    p <- p +
      geom_vline(data = result$all_breaks,
                 aes(xintercept = break_date),
                 color = "blue", linetype = "dotted", linewidth = 1, alpha = 0.6) +
      geom_text(data = result$all_breaks,
                aes(x = break_date, y = min(ts_data$ci_smooth, na.rm = TRUE),
                    label = format(break_date, "%Y-%m-%d")),
                angle = 90, vjust = 1.2, hjust = 0, size = 2.5, color = "blue")
  }

  # Add detected harvests (filtered breaks)
  if (!is.null(result$harvests) && nrow(result$harvests) > 0) {
    p <- p +
      geom_vline(data = result$harvests,
                 aes(xintercept = harvest_date),
                 color = "darkblue", linetype = "solid", linewidth = 1.5, alpha = 0.9)
  }

  # Labels and title
  breaks_info <- if (!is.null(result$all_breaks)) nrow(result$all_breaks) else 0
  harvests_info <- if (!is.null(result$harvests)) nrow(result$harvests) else 0
  actual_info <- if (!is.null(harvest_dates)) nrow(harvest_dates) else 0

  p <- p +
    labs(
      title = paste0("Field ", field_id, " - BFAST Analysis ", title_suffix),
      subtitle = paste0(
        "Red dashed = Actual harvest (", actual_info, ") | ",
        "Blue dotted = All breaks (", breaks_info, ") | ",
        "Dark blue solid = Detected harvests (", harvests_info, ")"
      ),
      x = "Date",
      y = "CI (7-day smoothed)",
      caption = "Actual harvests from harvest.xlsx | BFAST-detected breaks shown"
    )

  return(p)
}

# ============================================================================
# EXAMPLE 1: MATCHED FIELD (if any)
# ============================================================================

if (nrow(matched_fields) > 0) {
  cat("Creating plot for MATCHED field...\n")

  matched_field <- matched_fields$field[1]
  matched_harvests <- harvest_actual %>%
    filter(field == matched_field)

  result <- all_results[[matched_field]]

  if (!is.null(result)) {
    p1 <- create_diagnostic_plot(matched_field, matched_harvests, result, "(MATCHED)")

    if (!is.null(p1)) {
      ggsave(
        file.path(output_dir, "bfast_example_MATCHED.png"),
        p1, width = 14, height = 7, dpi = 300
      )
      cat("✓ Saved: bfast_example_MATCHED.png\n")
    }
  }
}

# ============================================================================
# EXAMPLE 2: MISSED FIELD
# ============================================================================

cat("\nCreating plot for MISSED field...\n")

missed_field <- sample_missed$field[1]
missed_harvests <- harvest_actual %>%
  filter(field == missed_field)

result_missed <- all_results[[missed_field]]

if (!is.null(result_missed)) {
  p2 <- create_diagnostic_plot(missed_field, missed_harvests, result_missed, "(MISSED)")

  if (!is.null(p2)) {
    ggsave(
      file.path(output_dir, "bfast_example_MISSED.png"),
      p2, width = 14, height = 7, dpi = 300
    )
    cat("✓ Saved: bfast_example_MISSED.png\n")
  }
}

# ============================================================================
# EXAMPLE 3: FIELD WITH MISMATCHES
# ============================================================================

# Find a field with both actual and detected harvests but poor timing
mismatch_candidates <- harvest_actual %>%
  inner_join(
    all_harvests %>% select(field_id, detected_harvest = harvest_date),
    by = c("field" = "field_id")
  ) %>%
  mutate(
    days_diff = abs(as.numeric(detected_harvest - season_end)),
    week_diff = days_diff / 7
  ) %>%
  filter(week_diff > 5) %>%  # Significant mismatch
  arrange(desc(week_diff))

if (nrow(mismatch_candidates) > 0) {
  cat("\nCreating plot for MISMATCHED field...\n")

  mismatch_field <- mismatch_candidates$field[1]
  mismatch_harvests <- harvest_actual %>%
    filter(field == mismatch_field)

  result_mismatch <- all_results[[mismatch_field]]

  if (!is.null(result_mismatch)) {
    p3 <- create_diagnostic_plot(
      mismatch_field,
      mismatch_harvests,
      result_mismatch,
      paste0("(MISMATCH: ", round(mismatch_candidates$week_diff[1], 1), " weeks off)")
    )

    if (!is.null(p3)) {
      ggsave(
        file.path(output_dir, "bfast_example_MISMATCH.png"),
        p3, width = 14, height = 7, dpi = 300
      )
      cat("✓ Saved: bfast_example_MISMATCH.png\n")
    }
  }
}

# ============================================================================
# ANALYZE WHY BFAST IS STRUGGLING
# ============================================================================

cat("\n============================================================================\n")
cat("DIAGNOSTIC ANALYSIS: WHY LOW DETECTION RATE?\n")
cat("============================================================================\n\n")

# 1. Check data availability around harvest dates
cat("1. DATA AVAILABILITY ANALYSIS\n")
cat("Checking if CI data exists around actual harvest dates...\n\n")

harvest_data_check <- harvest_actual %>%
  head(20) %>%
  rowwise() %>%
  mutate(
    ci_at_harvest = {
      field_ci <- time_series_daily %>%
        filter(field_id == field,
               date >= season_end - 14,
               date <= season_end + 14)

      if (nrow(field_ci) > 0) {
        paste0(nrow(field_ci), " obs, CI range: ",
               round(min(field_ci$mean_ci, na.rm = TRUE), 2), "-",
               round(max(field_ci$mean_ci, na.rm = TRUE), 2))
      } else {
        "NO DATA"
      }
    }
  ) %>%
  select(field, season_end, ci_at_harvest)

print(harvest_data_check)

# 2. Check break detection statistics
cat("\n\n2. BREAK DETECTION STATISTICS\n")

break_stats <- data.frame(
  total_fields = length(all_results),
  fields_with_breaks = sum(sapply(all_results, function(x)
    !is.null(x$all_breaks) && nrow(x$all_breaks) > 0)),
  fields_with_harvest_classified = sum(sapply(all_results, function(x)
    !is.null(x$harvests) && nrow(x$harvests) > 0)),
  total_breaks = sum(sapply(all_results, function(x)
    ifelse(!is.null(x$all_breaks), nrow(x$all_breaks), 0))),
  total_harvest_breaks = sum(sapply(all_results, function(x)
    ifelse(!is.null(x$harvests), nrow(x$harvests), 0)))
)

print(break_stats)

cat("\n\n3. CI DROP CHARACTERISTICS AT ACTUAL HARVEST\n")
cat("Analyzing CI behavior at known harvest dates...\n\n")

# Analyze CI patterns at actual harvests
harvest_ci_patterns <- harvest_actual %>%
  head(50) %>%  # Sample for speed
  rowwise() %>%
  mutate(
    ci_change = {
      field_ci <- time_series_daily %>%
        filter(field_id == field) %>%
        arrange(date)

      if (nrow(field_ci) > 0) {
        # Find closest dates before and after harvest
        before_harvest <- field_ci %>%
          filter(date <= season_end) %>%
          tail(5)
        after_harvest <- field_ci %>%
          filter(date > season_end) %>%
          head(5)

        if (nrow(before_harvest) > 0 && nrow(after_harvest) > 0) {
          ci_before <- mean(before_harvest$mean_ci, na.rm = TRUE)
          ci_after <- mean(after_harvest$mean_ci, na.rm = TRUE)
          round(ci_after - ci_before, 2)
        } else {
          NA_real_
        }
      } else {
        NA_real_
      }
    }
  ) %>%
  filter(!is.na(ci_change))

if (nrow(harvest_ci_patterns) > 0) {
  cat("CI change at harvest (sample of", nrow(harvest_ci_patterns), "events):\n")
  cat("  Mean CI change:", round(mean(harvest_ci_patterns$ci_change, na.rm = TRUE), 2), "\n")
  cat("  Median CI change:", round(median(harvest_ci_patterns$ci_change, na.rm = TRUE), 2), "\n")
  cat("  Min CI change:", round(min(harvest_ci_patterns$ci_change, na.rm = TRUE), 2), "\n")
  cat("  Max CI change:", round(max(harvest_ci_patterns$ci_change, na.rm = TRUE), 2), "\n")
  cat("  # with CI drop < -0.5:", sum(harvest_ci_patterns$ci_change < -0.5, na.rm = TRUE), "\n")
  cat("  # with CI increase:", sum(harvest_ci_patterns$ci_change > 0, na.rm = TRUE), "\n")
}

# ============================================================================
# RECOMMENDATIONS
# ============================================================================

cat("\n\n============================================================================\n")
cat("RECOMMENDATIONS FOR IMPROVEMENT\n")
cat("============================================================================\n\n")

cat("Based on the analysis:\n\n")

cat("1. DETECTION RATE: ", round(100 * nrow(matched_fields) / nrow(harvest_actual), 1), "%\n")
if (nrow(matched_fields) / nrow(harvest_actual) < 0.20) {
  cat("   → VERY LOW - BFAST may not be suitable for this data\n\n")
} else if (nrow(matched_fields) / nrow(harvest_actual) < 0.50) {
  cat("   → LOW - Parameter tuning may help\n\n")
}

cat("2. POSSIBLE ISSUES:\n")
cat("   - Harvest signal may not cause abrupt CI drops\n")
cat("   - Gradual harvest over weeks (not single-day event)\n")
cat("   - Regrowth happens quickly (obscures harvest signal)\n")
cat("   - BFAST expects abrupt structural breaks\n\n")

cat("3. ALTERNATIVE APPROACHES TO CONSIDER:\n")
cat("   - Rolling minimum detection (find sustained low CI periods)\n")
cat("   - Change point detection with smoother transitions\n")
cat("   - Threshold-based approach (CI < 2.5 for 2+ weeks)\n")
cat("   - Combine with SAR data for better harvest detection\n")
cat("   - Use crop age + CI trajectory modeling\n\n")

cat("4. BFAST PARAMETER TUNING (if continuing):\n")
cat("   - Try different h values (currently 0.15)\n")
cat("   - Test 'none' for season (remove seasonal model)\n")
cat("   - Adjust ci_drop_threshold (currently -0.5)\n")
cat("   - Relax magnitude_threshold (currently 0.3)\n\n")

cat("============================================================================\n")
cat("ANALYSIS COMPLETE\n")
cat("============================================================================\n\n")

cat("Review generated plots:\n")
cat("  - bfast_example_MATCHED.png (if available)\n")
cat("  - bfast_example_MISSED.png\n")
cat("  - bfast_example_MISMATCH.png (if available)\n\n")