SmartCane/r_app/experiments/harvest_prediction/detect_harvest_retrospective_bfast.R

# ============================================================================
# RETROSPECTIVE HARVEST DETECTION USING BFAST
# ============================================================================
# Purpose: Detect ALL historical harvest dates across complete CI time series
#
# Approach:
#   - Use full BFAST algorithm on complete time series (not real-time monitoring)
#   - Detect structural breaks = harvest events
#   - No need for immediate detection - can wait weeks for confirmation
#   - Output: Historical harvest database for field age calculation
#
# Key difference from real-time approaches:
#   - THEN: "Did this field harvest yesterday?" (hard, incomplete data)
#   - NOW: "When did this field harvest in the past?" (easier, full data)
#
# Usage: Rscript detect_harvest_retrospective_bfast.R [--validate]
#   --validate: Optional flag to compare against harvest.xlsx (testing only)
# ============================================================================

suppressPackageStartupMessages({
  library(readxl)
  library(dplyr)
  library(tidyr)
  library(lubridate)
  library(here)
  library(bfast)
  library(zoo)
  library(ggplot2)
})

# ============================================================================
# CONFIGURATION
# ============================================================================

CONFIG <- list(
  # DATA SOURCE
  project_dir = "esa",  # ESA WorldCover data (Planet imagery)

  # BFAST PARAMETERS (tuned for sugarcane harvest detection)
  h = 0.15,              # Minimum segment size (15% of data = ~2 harvests/year possible)
  season = "harmonic",   # Model seasonal growth patterns
  max_iter = 10,         # Iterative refinement
  breaks = NULL,         # Auto-detect number of breaks

  # PREPROCESSING
  smoothing_window = 7,  # Days for rolling median (reduce noise)
  min_observations = 200, # Minimum data points needed per field

  # HARVEST CLASSIFICATION (post-processing breakpoints)
  min_harvest_interval = 180,  # Days between harvests (6 months minimum)
  ci_drop_threshold = -0.5,    # Minimum CI drop to consider as harvest
  magnitude_threshold = 0.3,   # Minimum break magnitude

  # OUTPUT
  save_plots = TRUE,
  max_plots = 10,  # Limit number of diagnostic plots saved

  # VALIDATION (optional - only for testing)
  validate = TRUE  # Set to TRUE to compare against harvest.xlsx
)

# Process command line arguments
args <- commandArgs(trailingOnly = TRUE)
if ("--validate" %in% args) {
  CONFIG$validate <- TRUE
  cat("Validation mode enabled - will compare against harvest.xlsx\n\n")
}

# Set project directory globally
project_dir <- CONFIG$project_dir
assign("project_dir", project_dir, envir = .GlobalEnv)

# Navigate to project root if in experiments folder
if (basename(getwd()) == "harvest_prediction") {
  setwd("../../..")
}

source(here("r_app", "parameters_project.R"))

cat("============================================================================\n")
cat("RETROSPECTIVE HARVEST DETECTION USING BFAST\n")
cat("============================================================================\n\n")

cat("Configuration:\n")
cat("  Data source:", CONFIG$project_dir, "\n")
cat("  BFAST h parameter:", CONFIG$h, "\n")
cat("  Seasonal model:", CONFIG$season, "\n")
cat("  Smoothing window:", CONFIG$smoothing_window, "days\n")
cat("  Min harvest interval:", CONFIG$min_harvest_interval, "days\n\n")

# ============================================================================
# LOAD DATA
# ============================================================================

cat("=== LOADING CI DATA ===\n\n")

ci_rds_file <- here("laravel_app/storage/app", project_dir,
                    "Data/extracted_ci/cumulative_vals/All_pivots_Cumulative_CI_quadrant_year_v2.rds")

if (!file.exists(ci_rds_file)) {
  stop("CI data file not found: ", ci_rds_file)
}

ci_data_raw <- readRDS(ci_rds_file) %>% ungroup()

time_series_daily <- ci_data_raw %>%
  mutate(date = as.Date(Date)) %>%
  select(field_id = field, date, mean_ci = FitData) %>%
  filter(!is.na(mean_ci), !is.na(date), !is.na(field_id)) %>%
  arrange(field_id, date)

cat("Loaded", nrow(time_series_daily), "daily observations\n")
cat("Fields:", length(unique(time_series_daily$field_id)), "\n")
cat("Date range:", format(min(time_series_daily$date), "%Y-%m-%d"), "to",
    format(max(time_series_daily$date), "%Y-%m-%d"), "\n\n")

# Field summary
field_summary <- time_series_daily %>%
  group_by(field_id) %>%
  summarise(
    n_obs = n(),
    start_date = min(date),
    end_date = max(date),
    duration_days = as.numeric(end_date - start_date),
    mean_ci = mean(mean_ci, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  arrange(desc(n_obs))

cat("Field data summary:\n")
cat("  Fields with >", CONFIG$min_observations, "obs:",
    sum(field_summary$n_obs >= CONFIG$min_observations), "/", nrow(field_summary), "\n")
cat("  Mean observations per field:", round(mean(field_summary$n_obs)), "\n")
cat("  Mean duration:", round(mean(field_summary$duration_days)), "days\n\n")

# Filter to fields with sufficient data
valid_fields <- field_summary %>%
  filter(n_obs >= CONFIG$min_observations) %>%
  pull(field_id)

cat("Processing", length(valid_fields), "fields with sufficient data\n\n")

# ============================================================================
# BFAST HARVEST DETECTION FUNCTION
# ============================================================================

detect_harvests_bfast <- function(field_data, field_id, config = CONFIG) {

  # Prepare time series
  field_ts <- field_data %>%
    arrange(date) %>%
    mutate(
      # Apply smoothing to reduce noise
      ci_smooth = if (config$smoothing_window > 1) {
        rollmedian(mean_ci, k = config$smoothing_window, fill = NA, align = "center")
      } else {
        mean_ci
      }
    )

  # Fill NA values from smoothing
  field_ts$ci_smooth <- na.approx(field_ts$ci_smooth, rule = 2)

  # Create regular daily time series
  date_seq <- seq.Date(min(field_ts$date), max(field_ts$date), by = "1 day")
  ts_regular <- data.frame(date = date_seq) %>%
    left_join(field_ts %>% select(date, ci_smooth), by = "date")

  # Interpolate missing days
  ts_regular$ci_smooth <- na.approx(ts_regular$ci_smooth, rule = 2)

  # Convert to ts object (yearly frequency)
  start_year <- as.numeric(format(min(ts_regular$date), "%Y"))
  start_doy <- as.numeric(format(min(ts_regular$date), "%j"))

  ts_obj <- ts(ts_regular$ci_smooth,
               start = c(start_year, start_doy),
               frequency = 365)

  # Run BFAST
  bfast_result <- tryCatch({
    bfast(ts_obj,
          h = config$h,
          season = config$season,
          max.iter = config$max_iter,
          breaks = config$breaks)
  }, error = function(e) {
    warning("BFAST failed for field ", field_id, ": ", e$message)
    return(NULL)
  })

  if (is.null(bfast_result)) {
    return(list(
      success = FALSE,
      field_id = field_id,
      harvests = tibble()
    ))
  }

  # Extract breakpoints from trend component
  bp_component <- bfast_result$output[[1]]$bp.Vt

  if (is.null(bp_component) || length(bp_component$breakpoints) == 0) {
    # No breaks detected
    return(list(
      success = TRUE,
      field_id = field_id,
      n_breaks = 0,
      harvests = tibble(),
      bfast_result = bfast_result
    ))
  }

  # Get breakpoint indices (remove NAs)
  bp_indices <- bp_component$breakpoints
  bp_indices <- bp_indices[!is.na(bp_indices)]

  if (length(bp_indices) == 0) {
    return(list(
      success = TRUE,
      field_id = field_id,
      n_breaks = 0,
      harvests = tibble(),
      bfast_result = bfast_result
    ))
  }

  # Convert indices to dates
  bp_dates <- ts_regular$date[bp_indices]

  # Get CI values before and after breaks
  ci_before <- ts_regular$ci_smooth[pmax(1, bp_indices - 7)]  # 7 days before
  ci_after <- ts_regular$ci_smooth[pmin(nrow(ts_regular), bp_indices + 7)]  # 7 days after
  ci_change <- ci_after - ci_before

  # Get magnitude from BFAST
  magnitudes <- if (!is.null(bp_component$magnitude)) {
    abs(bp_component$magnitude)
  } else {
    rep(NA, length(bp_indices))
  }

  # Create breaks dataframe
  breaks_df <- tibble(
    break_date = bp_dates,
    break_index = bp_indices,
    ci_before = ci_before,
    ci_after = ci_after,
    ci_change = ci_change,
    magnitude = magnitudes
  ) %>%
    arrange(break_date)

  # Filter for harvest-like breaks (downward, significant)
  harvest_breaks <- breaks_df %>%
    filter(
      ci_change < config$ci_drop_threshold,  # CI dropped
      (is.na(magnitude) | magnitude > config$magnitude_threshold)  # Significant break
    )

  # Remove breaks that are too close together (keep first in cluster)
  if (nrow(harvest_breaks) > 1) {
    harvest_breaks <- harvest_breaks %>%
      mutate(
        days_since_prev = c(Inf, diff(break_date)),
        keep = days_since_prev >= config$min_harvest_interval
      ) %>%
      filter(keep) %>%
      select(-days_since_prev, -keep)
  }

  # Format as harvest detections
  harvests <- harvest_breaks %>%
    mutate(
      field_id = field_id,
      harvest_date = break_date,
      harvest_week = isoweek(harvest_date),
      harvest_year = isoyear(harvest_date),
      ci_at_harvest = ci_after,
      detection_method = "bfast_retrospective"
    ) %>%
    select(field_id, harvest_date, harvest_week, harvest_year,
           ci_at_harvest, ci_change, magnitude, detection_method)

  return(list(
    success = TRUE,
    field_id = field_id,
    n_breaks_total = nrow(breaks_df),
    n_breaks_harvest = nrow(harvests),
    harvests = harvests,
    all_breaks = breaks_df,
    bfast_result = bfast_result,
    ts_data = ts_regular
  ))
}

# ============================================================================
# PROCESS ALL FIELDS
# ============================================================================

cat("============================================================================\n")
cat("PROCESSING FIELDS\n")
cat("============================================================================\n\n")

all_results <- list()
all_harvests <- tibble()

pb <- txtProgressBar(min = 0, max = length(valid_fields), style = 3)

for (i in seq_along(valid_fields)) {
  field_id <- valid_fields[i]

  field_data <- time_series_daily %>%
    filter(field_id == !!field_id)

  result <- detect_harvests_bfast(field_data, field_id, CONFIG)

  all_results[[field_id]] <- result

  if (result$success && nrow(result$harvests) > 0) {
    all_harvests <- bind_rows(all_harvests, result$harvests)
  }

  setTxtProgressBar(pb, i)
}

close(pb)

cat("\n\n")

# ============================================================================
# SUMMARY STATISTICS
# ============================================================================

cat("============================================================================\n")
cat("DETECTION SUMMARY\n")
cat("============================================================================\n\n")

successful <- sum(sapply(all_results, function(x) x$success))
failed <- length(all_results) - successful

cat("Fields processed:", length(all_results), "\n")
cat("  Successful:", successful, "\n")
cat("  Failed:", failed, "\n\n")

total_breaks <- sum(sapply(all_results, function(x) ifelse(x$success, x$n_breaks_total, 0)))
total_harvests <- sum(sapply(all_results, function(x) ifelse(x$success, x$n_breaks_harvest, 0)))

cat("Breakpoints detected:\n")
cat("  Total breaks:", total_breaks, "\n")
cat("  Classified as harvests:", total_harvests, "\n")
cat("  Filtered out:", total_breaks - total_harvests, "\n\n")

cat("Harvest detections:\n")
cat("  Total harvest events:", nrow(all_harvests), "\n")
cat("  Fields with harvests:", length(unique(all_harvests$field_id)), "\n")
cat("  Mean harvests per field:", round(nrow(all_harvests) / length(unique(all_harvests$field_id)), 2), "\n\n")

if (nrow(all_harvests) > 0) {
  harvest_summary <- all_harvests %>%
    group_by(field_id) %>%
    summarise(
      n_harvests = n(),
      first_harvest = min(harvest_date),
      last_harvest = max(harvest_date),
      mean_ci_at_harvest = mean(ci_at_harvest, na.rm = TRUE),
      .groups = "drop"
    )

  cat("Distribution of harvests per field:\n")
  harvest_counts <- table(harvest_summary$n_harvests)
  for (nh in names(harvest_counts)) {
    cat("  ", nh, "harvest(s):", harvest_counts[nh], "fields\n")
  }
  cat("\n")

  cat("CI values at harvest:\n")
  cat("  Mean:", round(mean(all_harvests$ci_at_harvest, na.rm = TRUE), 2), "\n")
  cat("  Median:", round(median(all_harvests$ci_at_harvest, na.rm = TRUE), 2), "\n")
  cat("  Range:", round(min(all_harvests$ci_at_harvest, na.rm = TRUE), 2), "-",
      round(max(all_harvests$ci_at_harvest, na.rm = TRUE), 2), "\n\n")
}

# ============================================================================
# VALIDATION (OPTIONAL - TESTING ONLY)
# ============================================================================

if (CONFIG$validate) {
  cat("============================================================================\n")
  cat("VALIDATION AGAINST HARVEST.XLSX (TESTING ONLY)\n")
  cat("============================================================================\n\n")

  harvest_file <- here("laravel_app/storage/app", project_dir, "Data/harvest.xlsx")

  if (file.exists(harvest_file)) {
    harvest_actual <- read_excel(harvest_file) %>%
      mutate(
        season_end = as.Date(season_end)
      ) %>%
      filter(!is.na(season_end)) %>%
      select(field_id = field, actual_harvest = season_end) %>%
      mutate(
        actual_week = isoweek(actual_harvest),
        actual_year = isoyear(actual_harvest)
      )

    cat("Loaded", nrow(harvest_actual), "actual harvest records\n\n")

    # Match detected to actual
    validation <- harvest_actual %>%
      left_join(
        all_harvests %>%
          select(field_id, detected_harvest = harvest_date, detected_week = harvest_week,
                 detected_year = harvest_year, ci_at_harvest),
        by = c("field_id", "actual_year" = "detected_year")
      ) %>%
      mutate(
        week_diff = detected_week - actual_week,
        days_diff = as.numeric(detected_harvest - actual_harvest),
        match_status = case_when(
          is.na(detected_harvest) ~ "MISSED",
          abs(week_diff) <= 2 ~ "MATCHED (±2w)",
          abs(week_diff) <= 4 ~ "MATCHED (±4w)",
          TRUE ~ paste0("MISMATCH (", ifelse(week_diff > 0, "+", ""), week_diff, "w)")
        )
      )

    # Summary
    cat("Validation results:\n")
    match_summary <- table(validation$match_status)
    for (status in names(match_summary)) {
      cat("  ", status, ":", match_summary[status], "\n")
    }
    cat("\n")

    matched_2w <- sum(validation$match_status == "MATCHED (±2w)", na.rm = TRUE)
    matched_4w <- sum(validation$match_status == "MATCHED (±4w)", na.rm = TRUE)
    missed <- sum(validation$match_status == "MISSED", na.rm = TRUE)

    cat("Detection rate:", round(100 * (nrow(harvest_actual) - missed) / nrow(harvest_actual), 1), "%\n")
    cat("Accuracy (±2 weeks):", round(100 * matched_2w / nrow(harvest_actual), 1), "%\n")
    cat("Accuracy (±4 weeks):", round(100 * (matched_2w + matched_4w) / nrow(harvest_actual), 1), "%\n\n")

    # Check for false positives
    false_positives <- all_harvests %>%
      anti_join(harvest_actual, by = c("field_id", "harvest_year" = "actual_year"))

    cat("False positives:", nrow(false_positives),
        "(detected harvests not in harvest.xlsx)\n\n")

    # Show detailed comparison for fields with mismatches
    mismatches <- validation %>%
      filter(grepl("MISMATCH", match_status)) %>%
      select(field_id, actual_harvest, detected_harvest, days_diff, match_status)

    if (nrow(mismatches) > 0) {
      cat("Mismatched detections (sample):\n")
      print(head(mismatches, 20))
      cat("\n")
    }

  } else {
    cat("Validation file not found:", harvest_file, "\n")
    cat("Skipping validation (not needed for operational use)\n\n")
  }
}

# ============================================================================
# SAVE RESULTS
# ============================================================================

cat("============================================================================\n")
cat("SAVING RESULTS\n")
cat("============================================================================\n\n")

output_dir <- here("r_app/experiments/harvest_prediction")

# Save main results
output_file <- file.path(output_dir, "detected_harvests_bfast.rds")
saveRDS(list(
  config = CONFIG,
  detection_date = Sys.time(),
  harvests = all_harvests,
  field_summary = harvest_summary,
  all_results = all_results
), output_file)
cat("✓ Saved harvest detections:", output_file, "\n")

# Save CSV for easy viewing
csv_file <- file.path(output_dir, "detected_harvests_bfast.csv")
write.csv(all_harvests, csv_file, row.names = FALSE)
cat("✓ Saved CSV:", csv_file, "\n\n")

# ============================================================================
# GENERATE DIAGNOSTIC PLOTS (SAMPLE)
# ============================================================================

if (CONFIG$save_plots && nrow(all_harvests) > 0) {
  cat("============================================================================\n")
  cat("GENERATING DIAGNOSTIC PLOTS\n")
  cat("============================================================================\n\n")

  # Get fields with harvests
  fields_with_harvests <- unique(all_harvests$field_id)
  plot_fields <- head(fields_with_harvests, CONFIG$max_plots)

  cat("Creating plots for", length(plot_fields), "fields...\n")

  for (field_id in plot_fields) {
    result <- all_results[[field_id]]

    if (result$success && nrow(result$harvests) > 0) {
      # Create plot
      p <- ggplot(result$ts_data, aes(x = date, y = ci_smooth)) +
        geom_line(color = "darkgreen", linewidth = 0.8) +
        geom_vline(data = result$harvests,
                   aes(xintercept = harvest_date),
                   color = "red", linetype = "dashed", linewidth = 1) +
        labs(
          title = paste0("Field ", field_id, " - BFAST Harvest Detection"),
          subtitle = paste0(nrow(result$harvests), " harvest(s) detected"),
          x = "Date",
          y = "CI (smoothed)"
        ) +
        theme_minimal() +
        theme(plot.title = element_text(face = "bold"))

      # Add labels for harvest dates
      harvest_labels <- result$harvests %>%
        mutate(label = format(harvest_date, "%Y-%m-%d"))

      p <- p + geom_text(data = harvest_labels,
                         aes(x = harvest_date,
                             y = max(result$ts_data$ci_smooth, na.rm = TRUE),
                             label = label),
                         angle = 90, vjust = -0.5, size = 3, color = "red")

      # Save plot
      plot_file <- file.path(output_dir, paste0("harvest_detection_", field_id, ".png"))
      ggsave(plot_file, p, width = 12, height = 6, dpi = 150)
    }
  }

  cat("✓ Saved", length(plot_fields), "diagnostic plots\n\n")
}

# ============================================================================
# CALCULATE CURRENT FIELD AGE
# ============================================================================

cat("============================================================================\n")
cat("CALCULATING CURRENT FIELD AGE\n")
cat("============================================================================\n\n")

if (nrow(all_harvests) > 0) {
  # Get most recent harvest for each field
  latest_harvests <- all_harvests %>%
    group_by(field_id) %>%
    slice_max(harvest_date, n = 1, with_ties = FALSE) %>%
    ungroup() %>%
    mutate(
      days_since_harvest = as.numeric(Sys.Date() - harvest_date),
      months_since_harvest = days_since_harvest / 30.44,
      age_category = case_when(
        months_since_harvest < 3 ~ "Young (0-3 months)",
        months_since_harvest < 9 ~ "Mature (3-9 months)",
        months_since_harvest < 12 ~ "Pre-harvest (9-12 months)",
        TRUE ~ "Overdue (12+ months)"
      )
    ) %>%
    select(field_id, last_harvest = harvest_date, days_since_harvest,
           months_since_harvest, age_category)

  cat("Field age summary:\n")
  age_counts <- table(latest_harvests$age_category)
  for (cat_name in names(age_counts)) {
    cat("  ", cat_name, ":", age_counts[cat_name], "fields\n")
  }
  cat("\n")

  # Save field age report
  age_file <- file.path(output_dir, "field_age_report.csv")
  write.csv(latest_harvests, age_file, row.names = FALSE)
  cat("✓ Saved field age report:", age_file, "\n\n")

  cat("Sample of current field ages:\n")
  print(head(latest_harvests %>% arrange(desc(days_since_harvest)), 10))
  cat("\n")
}

# ============================================================================
# COMPLETION
# ============================================================================

cat("============================================================================\n")
cat("RETROSPECTIVE HARVEST DETECTION COMPLETE\n")
cat("============================================================================\n\n")

cat("Summary:\n")
cat("  Fields processed:", length(all_results), "\n")
cat("  Harvests detected:", nrow(all_harvests), "\n")
cat("  Output files saved to:", output_dir, "\n\n")

cat("Next steps:\n")
cat("  1. Review detected_harvests_bfast.csv for quality\n")
cat("  2. Check diagnostic plots for sample fields\n")
cat("  3. Use field_age_report.csv for operational monitoring\n")
cat("  4. Integrate harvest dates into crop messaging/KPI workflows\n\n")

if (!CONFIG$validate) {
  cat("Note: Run with --validate flag to compare against harvest.xlsx (testing only)\n\n")
}