438 lines
15 KiB
R
438 lines
15 KiB
R
# ============================================================================
|
|
# ANALYZE BFAST HARVEST DETECTION RESULTS
|
|
# ============================================================================
|
|
# Diagnose why BFAST detection rate is low and visualize specific examples
|
|
# ============================================================================
|
|
|
|
suppressPackageStartupMessages({
|
|
library(readxl)
|
|
library(dplyr)
|
|
library(tidyr)
|
|
library(lubridate)
|
|
library(here)
|
|
library(bfast)
|
|
library(zoo)
|
|
library(ggplot2)
|
|
})
|
|
|
|
# Set project directory
|
|
project_dir <- "esa"
|
|
assign("project_dir", project_dir, envir = .GlobalEnv)
|
|
|
|
if (basename(getwd()) == "harvest_prediction") {
|
|
setwd("../../..")
|
|
}
|
|
|
|
source(here("r_app", "parameters_project.R"))
|
|
|
|
cat("============================================================================\n")
|
|
cat("ANALYZING BFAST RESULTS\n")
|
|
cat("============================================================================\n\n")
|
|
|
|
# Load BFAST results
|
|
results_file <- here("r_app/experiments/harvest_prediction/detected_harvests_bfast.rds")
|
|
if (!file.exists(results_file)) {
|
|
stop("BFAST results not found. Run detect_harvest_retrospective_bfast.R first.")
|
|
}
|
|
|
|
bfast_data <- readRDS(results_file)
|
|
all_results <- bfast_data$all_results
|
|
all_harvests <- bfast_data$harvests
|
|
|
|
cat("Loaded BFAST results:\n")
|
|
cat(" Total fields:", length(all_results), "\n")
|
|
cat(" Harvests detected:", nrow(all_harvests), "\n\n")
|
|
|
|
# Load actual harvest data for comparison
|
|
harvest_file <- here("laravel_app/storage/app", project_dir, "Data/harvest.xlsx")
|
|
harvest_actual <- read_excel(harvest_file) %>%
|
|
mutate(season_end = as.Date(season_end)) %>%
|
|
filter(!is.na(season_end))
|
|
|
|
cat("Actual harvest records:", nrow(harvest_actual), "\n\n")
|
|
|
|
# ============================================================================
|
|
# ANALYZE MATCHED VS MISSED FIELDS
|
|
# ============================================================================
|
|
|
|
cat("============================================================================\n")
|
|
cat("PATTERN ANALYSIS: MATCHED VS MISSED\n")
|
|
cat("============================================================================\n\n")
|
|
|
|
# Get fields with successful matches (±4 weeks)
|
|
matched_fields <- harvest_actual %>%
|
|
inner_join(
|
|
all_harvests %>%
|
|
select(field_id, detected_harvest = harvest_date, detected_year = harvest_year),
|
|
by = c("field" = "field_id")
|
|
) %>%
|
|
mutate(
|
|
week_diff = abs(isoweek(detected_harvest) - isoweek(season_end)),
|
|
match_quality = case_when(
|
|
week_diff <= 2 ~ "Good (±2w)",
|
|
week_diff <= 4 ~ "Acceptable (±4w)",
|
|
TRUE ~ "Poor"
|
|
)
|
|
) %>%
|
|
filter(match_quality %in% c("Good (±2w)", "Acceptable (±4w)"))
|
|
|
|
# Get fields that were completely missed
|
|
missed_fields <- harvest_actual %>%
|
|
anti_join(all_harvests, by = c("field" = "field_id", "season_end" = "harvest_date"))
|
|
|
|
cat("Matched fields (±4w):", nrow(matched_fields), "\n")
|
|
cat("Missed harvests:", nrow(harvest_actual) - nrow(matched_fields), "\n\n")
|
|
|
|
# Sample fields for detailed visualization
|
|
if (nrow(matched_fields) > 0) {
|
|
sample_matched <- matched_fields %>%
|
|
head(3) %>%
|
|
select(field, season_end, detected_harvest = detected_harvest, week_diff)
|
|
|
|
cat("Sample MATCHED detections:\n")
|
|
print(sample_matched)
|
|
cat("\n")
|
|
}
|
|
|
|
# Sample missed fields
|
|
sample_missed <- harvest_actual %>%
|
|
filter(!(paste(field, season_end) %in% paste(matched_fields$field, matched_fields$season_end))) %>%
|
|
head(5) %>%
|
|
select(field, season_end, season_start)
|
|
|
|
cat("Sample MISSED harvests:\n")
|
|
print(sample_missed)
|
|
cat("\n")
|
|
|
|
# ============================================================================
|
|
# VISUALIZE SPECIFIC EXAMPLES
|
|
# ============================================================================
|
|
|
|
cat("============================================================================\n")
|
|
cat("GENERATING DIAGNOSTIC VISUALIZATIONS\n")
|
|
cat("============================================================================\n\n")
|
|
|
|
# Load CI data
|
|
ci_rds_file <- here("laravel_app/storage/app", project_dir,
|
|
"Data/extracted_ci/cumulative_vals/All_pivots_Cumulative_CI_quadrant_year_v2.rds")
|
|
ci_data_raw <- readRDS(ci_rds_file) %>% ungroup()
|
|
|
|
time_series_daily <- ci_data_raw %>%
|
|
mutate(date = as.Date(Date)) %>%
|
|
select(field_id = field, date, mean_ci = FitData) %>%
|
|
filter(!is.na(mean_ci), !is.na(date), !is.na(field_id)) %>%
|
|
arrange(field_id, date)
|
|
|
|
output_dir <- here("r_app/experiments/harvest_prediction")
|
|
|
|
# Function to create detailed diagnostic plot
|
|
create_diagnostic_plot <- function(field_id, harvest_dates, result, title_suffix = "") {
|
|
|
|
if (is.null(result$ts_data)) {
|
|
cat("No time series data for field:", field_id, "\n")
|
|
return(NULL)
|
|
}
|
|
|
|
ts_data <- result$ts_data
|
|
|
|
# Create plot
|
|
p <- ggplot(ts_data, aes(x = date, y = ci_smooth)) +
|
|
geom_line(color = "darkgreen", linewidth = 0.8, alpha = 0.7) +
|
|
theme_minimal() +
|
|
theme(
|
|
plot.title = element_text(face = "bold", size = 14),
|
|
plot.subtitle = element_text(size = 10),
|
|
axis.text = element_text(size = 9),
|
|
legend.position = "bottom"
|
|
)
|
|
|
|
# Add actual harvest dates (from harvest.xlsx)
|
|
if (!is.null(harvest_dates) && nrow(harvest_dates) > 0) {
|
|
p <- p +
|
|
geom_vline(data = harvest_dates,
|
|
aes(xintercept = season_end),
|
|
color = "red", linetype = "dashed", linewidth = 1.2, alpha = 0.8) +
|
|
geom_text(data = harvest_dates,
|
|
aes(x = season_end, y = max(ts_data$ci_smooth, na.rm = TRUE),
|
|
label = format(season_end, "%Y-%m-%d")),
|
|
angle = 90, vjust = -0.5, hjust = 1, size = 3, color = "red", fontface = "bold")
|
|
}
|
|
|
|
# Add detected breaks (from BFAST)
|
|
if (!is.null(result$all_breaks) && nrow(result$all_breaks) > 0) {
|
|
p <- p +
|
|
geom_vline(data = result$all_breaks,
|
|
aes(xintercept = break_date),
|
|
color = "blue", linetype = "dotted", linewidth = 1, alpha = 0.6) +
|
|
geom_text(data = result$all_breaks,
|
|
aes(x = break_date, y = min(ts_data$ci_smooth, na.rm = TRUE),
|
|
label = format(break_date, "%Y-%m-%d")),
|
|
angle = 90, vjust = 1.2, hjust = 0, size = 2.5, color = "blue")
|
|
}
|
|
|
|
# Add detected harvests (filtered breaks)
|
|
if (!is.null(result$harvests) && nrow(result$harvests) > 0) {
|
|
p <- p +
|
|
geom_vline(data = result$harvests,
|
|
aes(xintercept = harvest_date),
|
|
color = "darkblue", linetype = "solid", linewidth = 1.5, alpha = 0.9)
|
|
}
|
|
|
|
# Labels and title
|
|
breaks_info <- if (!is.null(result$all_breaks)) nrow(result$all_breaks) else 0
|
|
harvests_info <- if (!is.null(result$harvests)) nrow(result$harvests) else 0
|
|
actual_info <- if (!is.null(harvest_dates)) nrow(harvest_dates) else 0
|
|
|
|
p <- p +
|
|
labs(
|
|
title = paste0("Field ", field_id, " - BFAST Analysis ", title_suffix),
|
|
subtitle = paste0(
|
|
"Red dashed = Actual harvest (", actual_info, ") | ",
|
|
"Blue dotted = All breaks (", breaks_info, ") | ",
|
|
"Dark blue solid = Detected harvests (", harvests_info, ")"
|
|
),
|
|
x = "Date",
|
|
y = "CI (7-day smoothed)",
|
|
caption = "Actual harvests from harvest.xlsx | BFAST-detected breaks shown"
|
|
)
|
|
|
|
return(p)
|
|
}
|
|
|
|
# ============================================================================
|
|
# EXAMPLE 1: MATCHED FIELD (if any)
|
|
# ============================================================================
|
|
|
|
if (nrow(matched_fields) > 0) {
|
|
cat("Creating plot for MATCHED field...\n")
|
|
|
|
matched_field <- matched_fields$field[1]
|
|
matched_harvests <- harvest_actual %>%
|
|
filter(field == matched_field)
|
|
|
|
result <- all_results[[matched_field]]
|
|
|
|
if (!is.null(result)) {
|
|
p1 <- create_diagnostic_plot(matched_field, matched_harvests, result, "(MATCHED)")
|
|
|
|
if (!is.null(p1)) {
|
|
ggsave(
|
|
file.path(output_dir, "bfast_example_MATCHED.png"),
|
|
p1, width = 14, height = 7, dpi = 300
|
|
)
|
|
cat("✓ Saved: bfast_example_MATCHED.png\n")
|
|
}
|
|
}
|
|
}
|
|
|
|
# ============================================================================
|
|
# EXAMPLE 2: MISSED FIELD
|
|
# ============================================================================
|
|
|
|
cat("\nCreating plot for MISSED field...\n")
|
|
|
|
missed_field <- sample_missed$field[1]
|
|
missed_harvests <- harvest_actual %>%
|
|
filter(field == missed_field)
|
|
|
|
result_missed <- all_results[[missed_field]]
|
|
|
|
if (!is.null(result_missed)) {
|
|
p2 <- create_diagnostic_plot(missed_field, missed_harvests, result_missed, "(MISSED)")
|
|
|
|
if (!is.null(p2)) {
|
|
ggsave(
|
|
file.path(output_dir, "bfast_example_MISSED.png"),
|
|
p2, width = 14, height = 7, dpi = 300
|
|
)
|
|
cat("✓ Saved: bfast_example_MISSED.png\n")
|
|
}
|
|
}
|
|
|
|
# ============================================================================
|
|
# EXAMPLE 3: FIELD WITH MISMATCHES
|
|
# ============================================================================
|
|
|
|
# Find a field with both actual and detected harvests but poor timing
|
|
mismatch_candidates <- harvest_actual %>%
|
|
inner_join(
|
|
all_harvests %>% select(field_id, detected_harvest = harvest_date),
|
|
by = c("field" = "field_id")
|
|
) %>%
|
|
mutate(
|
|
days_diff = abs(as.numeric(detected_harvest - season_end)),
|
|
week_diff = days_diff / 7
|
|
) %>%
|
|
filter(week_diff > 5) %>% # Significant mismatch
|
|
arrange(desc(week_diff))
|
|
|
|
if (nrow(mismatch_candidates) > 0) {
|
|
cat("\nCreating plot for MISMATCHED field...\n")
|
|
|
|
mismatch_field <- mismatch_candidates$field[1]
|
|
mismatch_harvests <- harvest_actual %>%
|
|
filter(field == mismatch_field)
|
|
|
|
result_mismatch <- all_results[[mismatch_field]]
|
|
|
|
if (!is.null(result_mismatch)) {
|
|
p3 <- create_diagnostic_plot(
|
|
mismatch_field,
|
|
mismatch_harvests,
|
|
result_mismatch,
|
|
paste0("(MISMATCH: ", round(mismatch_candidates$week_diff[1], 1), " weeks off)")
|
|
)
|
|
|
|
if (!is.null(p3)) {
|
|
ggsave(
|
|
file.path(output_dir, "bfast_example_MISMATCH.png"),
|
|
p3, width = 14, height = 7, dpi = 300
|
|
)
|
|
cat("✓ Saved: bfast_example_MISMATCH.png\n")
|
|
}
|
|
}
|
|
}
|
|
|
|
# ============================================================================
|
|
# ANALYZE WHY BFAST IS STRUGGLING
|
|
# ============================================================================
|
|
|
|
cat("\n============================================================================\n")
|
|
cat("DIAGNOSTIC ANALYSIS: WHY LOW DETECTION RATE?\n")
|
|
cat("============================================================================\n\n")
|
|
|
|
# 1. Check data availability around harvest dates
|
|
cat("1. DATA AVAILABILITY ANALYSIS\n")
|
|
cat("Checking if CI data exists around actual harvest dates...\n\n")
|
|
|
|
harvest_data_check <- harvest_actual %>%
|
|
head(20) %>%
|
|
rowwise() %>%
|
|
mutate(
|
|
ci_at_harvest = {
|
|
field_ci <- time_series_daily %>%
|
|
filter(field_id == field,
|
|
date >= season_end - 14,
|
|
date <= season_end + 14)
|
|
|
|
if (nrow(field_ci) > 0) {
|
|
paste0(nrow(field_ci), " obs, CI range: ",
|
|
round(min(field_ci$mean_ci, na.rm = TRUE), 2), "-",
|
|
round(max(field_ci$mean_ci, na.rm = TRUE), 2))
|
|
} else {
|
|
"NO DATA"
|
|
}
|
|
}
|
|
) %>%
|
|
select(field, season_end, ci_at_harvest)
|
|
|
|
print(harvest_data_check)
|
|
|
|
# 2. Check break detection statistics
|
|
cat("\n\n2. BREAK DETECTION STATISTICS\n")
|
|
|
|
break_stats <- data.frame(
|
|
total_fields = length(all_results),
|
|
fields_with_breaks = sum(sapply(all_results, function(x)
|
|
!is.null(x$all_breaks) && nrow(x$all_breaks) > 0)),
|
|
fields_with_harvest_classified = sum(sapply(all_results, function(x)
|
|
!is.null(x$harvests) && nrow(x$harvests) > 0)),
|
|
total_breaks = sum(sapply(all_results, function(x)
|
|
ifelse(!is.null(x$all_breaks), nrow(x$all_breaks), 0))),
|
|
total_harvest_breaks = sum(sapply(all_results, function(x)
|
|
ifelse(!is.null(x$harvests), nrow(x$harvests), 0)))
|
|
)
|
|
|
|
print(break_stats)
|
|
|
|
cat("\n\n3. CI DROP CHARACTERISTICS AT ACTUAL HARVEST\n")
|
|
cat("Analyzing CI behavior at known harvest dates...\n\n")
|
|
|
|
# Analyze CI patterns at actual harvests
|
|
harvest_ci_patterns <- harvest_actual %>%
|
|
head(50) %>% # Sample for speed
|
|
rowwise() %>%
|
|
mutate(
|
|
ci_change = {
|
|
field_ci <- time_series_daily %>%
|
|
filter(field_id == field) %>%
|
|
arrange(date)
|
|
|
|
if (nrow(field_ci) > 0) {
|
|
# Find closest dates before and after harvest
|
|
before_harvest <- field_ci %>%
|
|
filter(date <= season_end) %>%
|
|
tail(5)
|
|
after_harvest <- field_ci %>%
|
|
filter(date > season_end) %>%
|
|
head(5)
|
|
|
|
if (nrow(before_harvest) > 0 && nrow(after_harvest) > 0) {
|
|
ci_before <- mean(before_harvest$mean_ci, na.rm = TRUE)
|
|
ci_after <- mean(after_harvest$mean_ci, na.rm = TRUE)
|
|
round(ci_after - ci_before, 2)
|
|
} else {
|
|
NA_real_
|
|
}
|
|
} else {
|
|
NA_real_
|
|
}
|
|
}
|
|
) %>%
|
|
filter(!is.na(ci_change))
|
|
|
|
if (nrow(harvest_ci_patterns) > 0) {
|
|
cat("CI change at harvest (sample of", nrow(harvest_ci_patterns), "events):\n")
|
|
cat(" Mean CI change:", round(mean(harvest_ci_patterns$ci_change, na.rm = TRUE), 2), "\n")
|
|
cat(" Median CI change:", round(median(harvest_ci_patterns$ci_change, na.rm = TRUE), 2), "\n")
|
|
cat(" Min CI change:", round(min(harvest_ci_patterns$ci_change, na.rm = TRUE), 2), "\n")
|
|
cat(" Max CI change:", round(max(harvest_ci_patterns$ci_change, na.rm = TRUE), 2), "\n")
|
|
cat(" # with CI drop < -0.5:", sum(harvest_ci_patterns$ci_change < -0.5, na.rm = TRUE), "\n")
|
|
cat(" # with CI increase:", sum(harvest_ci_patterns$ci_change > 0, na.rm = TRUE), "\n")
|
|
}
|
|
|
|
# ============================================================================
|
|
# RECOMMENDATIONS
|
|
# ============================================================================
|
|
|
|
cat("\n\n============================================================================\n")
|
|
cat("RECOMMENDATIONS FOR IMPROVEMENT\n")
|
|
cat("============================================================================\n\n")
|
|
|
|
cat("Based on the analysis:\n\n")
|
|
|
|
cat("1. DETECTION RATE: ", round(100 * nrow(matched_fields) / nrow(harvest_actual), 1), "%\n")
|
|
if (nrow(matched_fields) / nrow(harvest_actual) < 0.20) {
|
|
cat(" → VERY LOW - BFAST may not be suitable for this data\n\n")
|
|
} else if (nrow(matched_fields) / nrow(harvest_actual) < 0.50) {
|
|
cat(" → LOW - Parameter tuning may help\n\n")
|
|
}
|
|
|
|
cat("2. POSSIBLE ISSUES:\n")
|
|
cat(" - Harvest signal may not cause abrupt CI drops\n")
|
|
cat(" - Gradual harvest over weeks (not single-day event)\n")
|
|
cat(" - Regrowth happens quickly (obscures harvest signal)\n")
|
|
cat(" - BFAST expects abrupt structural breaks\n\n")
|
|
|
|
cat("3. ALTERNATIVE APPROACHES TO CONSIDER:\n")
|
|
cat(" - Rolling minimum detection (find sustained low CI periods)\n")
|
|
cat(" - Change point detection with smoother transitions\n")
|
|
cat(" - Threshold-based approach (CI < 2.5 for 2+ weeks)\n")
|
|
cat(" - Combine with SAR data for better harvest detection\n")
|
|
cat(" - Use crop age + CI trajectory modeling\n\n")
|
|
|
|
cat("4. BFAST PARAMETER TUNING (if continuing):\n")
|
|
cat(" - Try different h values (currently 0.15)\n")
|
|
cat(" - Test 'none' for season (remove seasonal model)\n")
|
|
cat(" - Adjust ci_drop_threshold (currently -0.5)\n")
|
|
cat(" - Relax magnitude_threshold (currently 0.3)\n\n")
|
|
|
|
cat("============================================================================\n")
|
|
cat("ANALYSIS COMPLETE\n")
|
|
cat("============================================================================\n\n")
|
|
|
|
cat("Review generated plots:\n")
|
|
cat(" - bfast_example_MATCHED.png (if available)\n")
|
|
cat(" - bfast_example_MISSED.png\n")
|
|
cat(" - bfast_example_MISMATCH.png (if available)\n\n")
|