SmartCane/r_app/experiments/harvest_prediction/analyze_bfast_results.R
2026-01-06 14:17:37 +01:00

438 lines
15 KiB
R

# ============================================================================
# ANALYZE BFAST HARVEST DETECTION RESULTS
# ============================================================================
# Diagnose why BFAST detection rate is low and visualize specific examples
# ============================================================================
suppressPackageStartupMessages({
library(readxl)
library(dplyr)
library(tidyr)
library(lubridate)
library(here)
library(bfast)
library(zoo)
library(ggplot2)
})
# Set project directory
project_dir <- "esa"
assign("project_dir", project_dir, envir = .GlobalEnv)
if (basename(getwd()) == "harvest_prediction") {
setwd("../../..")
}
source(here("r_app", "parameters_project.R"))
cat("============================================================================\n")
cat("ANALYZING BFAST RESULTS\n")
cat("============================================================================\n\n")
# Load BFAST results
results_file <- here("r_app/experiments/harvest_prediction/detected_harvests_bfast.rds")
if (!file.exists(results_file)) {
stop("BFAST results not found. Run detect_harvest_retrospective_bfast.R first.")
}
bfast_data <- readRDS(results_file)
all_results <- bfast_data$all_results
all_harvests <- bfast_data$harvests
cat("Loaded BFAST results:\n")
cat(" Total fields:", length(all_results), "\n")
cat(" Harvests detected:", nrow(all_harvests), "\n\n")
# Load actual harvest data for comparison
harvest_file <- here("laravel_app/storage/app", project_dir, "Data/harvest.xlsx")
harvest_actual <- read_excel(harvest_file) %>%
mutate(season_end = as.Date(season_end)) %>%
filter(!is.na(season_end))
cat("Actual harvest records:", nrow(harvest_actual), "\n\n")
# ============================================================================
# ANALYZE MATCHED VS MISSED FIELDS
# ============================================================================
cat("============================================================================\n")
cat("PATTERN ANALYSIS: MATCHED VS MISSED\n")
cat("============================================================================\n\n")
# Get fields with successful matches (±4 weeks)
matched_fields <- harvest_actual %>%
inner_join(
all_harvests %>%
select(field_id, detected_harvest = harvest_date, detected_year = harvest_year),
by = c("field" = "field_id")
) %>%
mutate(
week_diff = abs(isoweek(detected_harvest) - isoweek(season_end)),
match_quality = case_when(
week_diff <= 2 ~ "Good (±2w)",
week_diff <= 4 ~ "Acceptable (±4w)",
TRUE ~ "Poor"
)
) %>%
filter(match_quality %in% c("Good (±2w)", "Acceptable (±4w)"))
# Get fields that were completely missed
missed_fields <- harvest_actual %>%
anti_join(all_harvests, by = c("field" = "field_id", "season_end" = "harvest_date"))
cat("Matched fields (±4w):", nrow(matched_fields), "\n")
cat("Missed harvests:", nrow(harvest_actual) - nrow(matched_fields), "\n\n")
# Sample fields for detailed visualization
if (nrow(matched_fields) > 0) {
sample_matched <- matched_fields %>%
head(3) %>%
select(field, season_end, detected_harvest = detected_harvest, week_diff)
cat("Sample MATCHED detections:\n")
print(sample_matched)
cat("\n")
}
# Sample missed fields
sample_missed <- harvest_actual %>%
filter(!(paste(field, season_end) %in% paste(matched_fields$field, matched_fields$season_end))) %>%
head(5) %>%
select(field, season_end, season_start)
cat("Sample MISSED harvests:\n")
print(sample_missed)
cat("\n")
# ============================================================================
# VISUALIZE SPECIFIC EXAMPLES
# ============================================================================
cat("============================================================================\n")
cat("GENERATING DIAGNOSTIC VISUALIZATIONS\n")
cat("============================================================================\n\n")
# Load CI data
ci_rds_file <- here("laravel_app/storage/app", project_dir,
"Data/extracted_ci/cumulative_vals/All_pivots_Cumulative_CI_quadrant_year_v2.rds")
ci_data_raw <- readRDS(ci_rds_file) %>% ungroup()
time_series_daily <- ci_data_raw %>%
mutate(date = as.Date(Date)) %>%
select(field_id = field, date, mean_ci = FitData) %>%
filter(!is.na(mean_ci), !is.na(date), !is.na(field_id)) %>%
arrange(field_id, date)
output_dir <- here("r_app/experiments/harvest_prediction")
# Function to create detailed diagnostic plot
create_diagnostic_plot <- function(field_id, harvest_dates, result, title_suffix = "") {
if (is.null(result$ts_data)) {
cat("No time series data for field:", field_id, "\n")
return(NULL)
}
ts_data <- result$ts_data
# Create plot
p <- ggplot(ts_data, aes(x = date, y = ci_smooth)) +
geom_line(color = "darkgreen", linewidth = 0.8, alpha = 0.7) +
theme_minimal() +
theme(
plot.title = element_text(face = "bold", size = 14),
plot.subtitle = element_text(size = 10),
axis.text = element_text(size = 9),
legend.position = "bottom"
)
# Add actual harvest dates (from harvest.xlsx)
if (!is.null(harvest_dates) && nrow(harvest_dates) > 0) {
p <- p +
geom_vline(data = harvest_dates,
aes(xintercept = season_end),
color = "red", linetype = "dashed", linewidth = 1.2, alpha = 0.8) +
geom_text(data = harvest_dates,
aes(x = season_end, y = max(ts_data$ci_smooth, na.rm = TRUE),
label = format(season_end, "%Y-%m-%d")),
angle = 90, vjust = -0.5, hjust = 1, size = 3, color = "red", fontface = "bold")
}
# Add detected breaks (from BFAST)
if (!is.null(result$all_breaks) && nrow(result$all_breaks) > 0) {
p <- p +
geom_vline(data = result$all_breaks,
aes(xintercept = break_date),
color = "blue", linetype = "dotted", linewidth = 1, alpha = 0.6) +
geom_text(data = result$all_breaks,
aes(x = break_date, y = min(ts_data$ci_smooth, na.rm = TRUE),
label = format(break_date, "%Y-%m-%d")),
angle = 90, vjust = 1.2, hjust = 0, size = 2.5, color = "blue")
}
# Add detected harvests (filtered breaks)
if (!is.null(result$harvests) && nrow(result$harvests) > 0) {
p <- p +
geom_vline(data = result$harvests,
aes(xintercept = harvest_date),
color = "darkblue", linetype = "solid", linewidth = 1.5, alpha = 0.9)
}
# Labels and title
breaks_info <- if (!is.null(result$all_breaks)) nrow(result$all_breaks) else 0
harvests_info <- if (!is.null(result$harvests)) nrow(result$harvests) else 0
actual_info <- if (!is.null(harvest_dates)) nrow(harvest_dates) else 0
p <- p +
labs(
title = paste0("Field ", field_id, " - BFAST Analysis ", title_suffix),
subtitle = paste0(
"Red dashed = Actual harvest (", actual_info, ") | ",
"Blue dotted = All breaks (", breaks_info, ") | ",
"Dark blue solid = Detected harvests (", harvests_info, ")"
),
x = "Date",
y = "CI (7-day smoothed)",
caption = "Actual harvests from harvest.xlsx | BFAST-detected breaks shown"
)
return(p)
}
# ============================================================================
# EXAMPLE 1: MATCHED FIELD (if any)
# ============================================================================
if (nrow(matched_fields) > 0) {
cat("Creating plot for MATCHED field...\n")
matched_field <- matched_fields$field[1]
matched_harvests <- harvest_actual %>%
filter(field == matched_field)
result <- all_results[[matched_field]]
if (!is.null(result)) {
p1 <- create_diagnostic_plot(matched_field, matched_harvests, result, "(MATCHED)")
if (!is.null(p1)) {
ggsave(
file.path(output_dir, "bfast_example_MATCHED.png"),
p1, width = 14, height = 7, dpi = 300
)
cat("✓ Saved: bfast_example_MATCHED.png\n")
}
}
}
# ============================================================================
# EXAMPLE 2: MISSED FIELD
# ============================================================================
cat("\nCreating plot for MISSED field...\n")
missed_field <- sample_missed$field[1]
missed_harvests <- harvest_actual %>%
filter(field == missed_field)
result_missed <- all_results[[missed_field]]
if (!is.null(result_missed)) {
p2 <- create_diagnostic_plot(missed_field, missed_harvests, result_missed, "(MISSED)")
if (!is.null(p2)) {
ggsave(
file.path(output_dir, "bfast_example_MISSED.png"),
p2, width = 14, height = 7, dpi = 300
)
cat("✓ Saved: bfast_example_MISSED.png\n")
}
}
# ============================================================================
# EXAMPLE 3: FIELD WITH MISMATCHES
# ============================================================================
# Find a field with both actual and detected harvests but poor timing
mismatch_candidates <- harvest_actual %>%
inner_join(
all_harvests %>% select(field_id, detected_harvest = harvest_date),
by = c("field" = "field_id")
) %>%
mutate(
days_diff = abs(as.numeric(detected_harvest - season_end)),
week_diff = days_diff / 7
) %>%
filter(week_diff > 5) %>% # Significant mismatch
arrange(desc(week_diff))
if (nrow(mismatch_candidates) > 0) {
cat("\nCreating plot for MISMATCHED field...\n")
mismatch_field <- mismatch_candidates$field[1]
mismatch_harvests <- harvest_actual %>%
filter(field == mismatch_field)
result_mismatch <- all_results[[mismatch_field]]
if (!is.null(result_mismatch)) {
p3 <- create_diagnostic_plot(
mismatch_field,
mismatch_harvests,
result_mismatch,
paste0("(MISMATCH: ", round(mismatch_candidates$week_diff[1], 1), " weeks off)")
)
if (!is.null(p3)) {
ggsave(
file.path(output_dir, "bfast_example_MISMATCH.png"),
p3, width = 14, height = 7, dpi = 300
)
cat("✓ Saved: bfast_example_MISMATCH.png\n")
}
}
}
# ============================================================================
# ANALYZE WHY BFAST IS STRUGGLING
# ============================================================================
cat("\n============================================================================\n")
cat("DIAGNOSTIC ANALYSIS: WHY LOW DETECTION RATE?\n")
cat("============================================================================\n\n")
# 1. Check data availability around harvest dates
cat("1. DATA AVAILABILITY ANALYSIS\n")
cat("Checking if CI data exists around actual harvest dates...\n\n")
harvest_data_check <- harvest_actual %>%
head(20) %>%
rowwise() %>%
mutate(
ci_at_harvest = {
field_ci <- time_series_daily %>%
filter(field_id == field,
date >= season_end - 14,
date <= season_end + 14)
if (nrow(field_ci) > 0) {
paste0(nrow(field_ci), " obs, CI range: ",
round(min(field_ci$mean_ci, na.rm = TRUE), 2), "-",
round(max(field_ci$mean_ci, na.rm = TRUE), 2))
} else {
"NO DATA"
}
}
) %>%
select(field, season_end, ci_at_harvest)
print(harvest_data_check)
# 2. Check break detection statistics
cat("\n\n2. BREAK DETECTION STATISTICS\n")
break_stats <- data.frame(
total_fields = length(all_results),
fields_with_breaks = sum(sapply(all_results, function(x)
!is.null(x$all_breaks) && nrow(x$all_breaks) > 0)),
fields_with_harvest_classified = sum(sapply(all_results, function(x)
!is.null(x$harvests) && nrow(x$harvests) > 0)),
total_breaks = sum(sapply(all_results, function(x)
ifelse(!is.null(x$all_breaks), nrow(x$all_breaks), 0))),
total_harvest_breaks = sum(sapply(all_results, function(x)
ifelse(!is.null(x$harvests), nrow(x$harvests), 0)))
)
print(break_stats)
cat("\n\n3. CI DROP CHARACTERISTICS AT ACTUAL HARVEST\n")
cat("Analyzing CI behavior at known harvest dates...\n\n")
# Analyze CI patterns at actual harvests
harvest_ci_patterns <- harvest_actual %>%
head(50) %>% # Sample for speed
rowwise() %>%
mutate(
ci_change = {
field_ci <- time_series_daily %>%
filter(field_id == field) %>%
arrange(date)
if (nrow(field_ci) > 0) {
# Find closest dates before and after harvest
before_harvest <- field_ci %>%
filter(date <= season_end) %>%
tail(5)
after_harvest <- field_ci %>%
filter(date > season_end) %>%
head(5)
if (nrow(before_harvest) > 0 && nrow(after_harvest) > 0) {
ci_before <- mean(before_harvest$mean_ci, na.rm = TRUE)
ci_after <- mean(after_harvest$mean_ci, na.rm = TRUE)
round(ci_after - ci_before, 2)
} else {
NA_real_
}
} else {
NA_real_
}
}
) %>%
filter(!is.na(ci_change))
if (nrow(harvest_ci_patterns) > 0) {
cat("CI change at harvest (sample of", nrow(harvest_ci_patterns), "events):\n")
cat(" Mean CI change:", round(mean(harvest_ci_patterns$ci_change, na.rm = TRUE), 2), "\n")
cat(" Median CI change:", round(median(harvest_ci_patterns$ci_change, na.rm = TRUE), 2), "\n")
cat(" Min CI change:", round(min(harvest_ci_patterns$ci_change, na.rm = TRUE), 2), "\n")
cat(" Max CI change:", round(max(harvest_ci_patterns$ci_change, na.rm = TRUE), 2), "\n")
cat(" # with CI drop < -0.5:", sum(harvest_ci_patterns$ci_change < -0.5, na.rm = TRUE), "\n")
cat(" # with CI increase:", sum(harvest_ci_patterns$ci_change > 0, na.rm = TRUE), "\n")
}
# ============================================================================
# RECOMMENDATIONS
# ============================================================================
cat("\n\n============================================================================\n")
cat("RECOMMENDATIONS FOR IMPROVEMENT\n")
cat("============================================================================\n\n")
cat("Based on the analysis:\n\n")
cat("1. DETECTION RATE: ", round(100 * nrow(matched_fields) / nrow(harvest_actual), 1), "%\n")
if (nrow(matched_fields) / nrow(harvest_actual) < 0.20) {
cat(" → VERY LOW - BFAST may not be suitable for this data\n\n")
} else if (nrow(matched_fields) / nrow(harvest_actual) < 0.50) {
cat(" → LOW - Parameter tuning may help\n\n")
}
cat("2. POSSIBLE ISSUES:\n")
cat(" - Harvest signal may not cause abrupt CI drops\n")
cat(" - Gradual harvest over weeks (not single-day event)\n")
cat(" - Regrowth happens quickly (obscures harvest signal)\n")
cat(" - BFAST expects abrupt structural breaks\n\n")
cat("3. ALTERNATIVE APPROACHES TO CONSIDER:\n")
cat(" - Rolling minimum detection (find sustained low CI periods)\n")
cat(" - Change point detection with smoother transitions\n")
cat(" - Threshold-based approach (CI < 2.5 for 2+ weeks)\n")
cat(" - Combine with SAR data for better harvest detection\n")
cat(" - Use crop age + CI trajectory modeling\n\n")
cat("4. BFAST PARAMETER TUNING (if continuing):\n")
cat(" - Try different h values (currently 0.15)\n")
cat(" - Test 'none' for season (remove seasonal model)\n")
cat(" - Adjust ci_drop_threshold (currently -0.5)\n")
cat(" - Relax magnitude_threshold (currently 0.3)\n\n")
cat("============================================================================\n")
cat("ANALYSIS COMPLETE\n")
cat("============================================================================\n\n")
cat("Review generated plots:\n")
cat(" - bfast_example_MATCHED.png (if available)\n")
cat(" - bfast_example_MISSED.png\n")
cat(" - bfast_example_MISMATCH.png (if available)\n\n")