448 lines
14 KiB
R
448 lines
14 KiB
R
# ============================================================================
|
|
# OPERATIONAL HARVEST PREDICTION
|
|
# Analyze current season growth curves to predict harvest timing
|
|
# ============================================================================
|
|
|
|
suppressPackageStartupMessages({
|
|
library(readxl)
|
|
library(dplyr)
|
|
library(tidyr)
|
|
library(lubridate)
|
|
library(terra)
|
|
library(sf)
|
|
library(here)
|
|
library(ggplot2)
|
|
})
|
|
|
|
# Set project directory
|
|
project_dir <- "esa"
|
|
assign("project_dir", project_dir, envir = .GlobalEnv)
|
|
|
|
source(here("r_app", "parameters_project.R"))
|
|
|
|
# ============================================================================
|
|
# STEP 1: LOAD DATA
|
|
# ============================================================================
|
|
|
|
cat("=== LOADING DATA ===\n\n")
|
|
|
|
# Load CI time series
|
|
ci_rds_file <- here("laravel_app/storage/app", project_dir, "Data/extracted_ci/cumulative_vals/All_pivots_Cumulative_CI_quadrant_year_v2.rds")
|
|
ci_data_raw <- readRDS(ci_rds_file) %>% ungroup()
|
|
|
|
time_series_daily <- ci_data_raw %>%
|
|
mutate(
|
|
date = as.Date(Date),
|
|
week = isoweek(date),
|
|
year = isoyear(date)
|
|
) %>%
|
|
select(
|
|
field_id = field,
|
|
date,
|
|
week,
|
|
year,
|
|
mean_ci = FitData
|
|
) %>%
|
|
filter(!is.na(mean_ci), !is.na(date), !is.na(field_id)) %>%
|
|
arrange(field_id, date)
|
|
|
|
# Load harvest data
|
|
harvest_data <- read_excel('laravel_app/storage/app/esa/Data/harvest.xlsx') %>%
|
|
mutate(
|
|
season_start = as.Date(season_start),
|
|
season_end = as.Date(season_end)
|
|
) %>%
|
|
filter(!is.na(season_end))
|
|
|
|
fields_with_ci <- unique(time_series_daily$field_id)
|
|
harvest_data_filtered <- harvest_data %>%
|
|
filter(field %in% fields_with_ci) %>%
|
|
arrange(field, season_end)
|
|
|
|
cat("Loaded CI data for", length(fields_with_ci), "fields\n")
|
|
cat("Loaded harvest data for", length(unique(harvest_data_filtered$field)), "fields\n\n")
|
|
|
|
# ============================================================================
|
|
# STEP 2: SEGMENT TIME SERIES BY SEASON
|
|
# ============================================================================
|
|
|
|
cat("=== SEGMENTING TIME SERIES INTO INDIVIDUAL SEASONS ===\n\n")
|
|
|
|
# For each field, create seasons based on harvest dates
|
|
# Season starts day after previous harvest, ends at next harvest
|
|
create_seasons <- function(field_name, ci_ts, harvest_df) {
|
|
# Get CI data for this field
|
|
field_ci <- ci_ts %>%
|
|
filter(field_id == field_name) %>%
|
|
arrange(date)
|
|
|
|
# Get harvest dates for this field
|
|
field_harvests <- harvest_df %>%
|
|
filter(field == field_name) %>%
|
|
arrange(season_end) %>%
|
|
mutate(season_id = row_number())
|
|
|
|
if (nrow(field_harvests) == 0) {
|
|
return(NULL)
|
|
}
|
|
|
|
# Create season segments
|
|
seasons_list <- list()
|
|
|
|
for (i in 1:nrow(field_harvests)) {
|
|
# Season start: day after previous harvest (or start of data if first season)
|
|
if (i == 1) {
|
|
season_start <- min(field_ci$date)
|
|
} else {
|
|
season_start <- field_harvests$season_end[i-1] + 1
|
|
}
|
|
|
|
# Season end: current harvest date
|
|
season_end <- field_harvests$season_end[i]
|
|
|
|
# Extract CI data for this season
|
|
season_ci <- field_ci %>%
|
|
filter(date >= season_start, date <= season_end)
|
|
|
|
if (nrow(season_ci) > 0) {
|
|
season_ci$season_id <- i
|
|
season_ci$season_start_date <- season_start
|
|
season_ci$season_end_date <- season_end
|
|
season_ci$days_in_season <- as.numeric(season_end - season_start)
|
|
season_ci$days_since_start <- as.numeric(season_ci$date - season_start)
|
|
season_ci$days_until_harvest <- as.numeric(season_end - season_ci$date)
|
|
|
|
seasons_list[[i]] <- season_ci
|
|
}
|
|
}
|
|
|
|
# Add current ongoing season (after last harvest)
|
|
if (nrow(field_harvests) > 0) {
|
|
last_harvest <- field_harvests$season_end[nrow(field_harvests)]
|
|
current_season_start <- last_harvest + 1
|
|
|
|
current_season_ci <- field_ci %>%
|
|
filter(date >= current_season_start)
|
|
|
|
if (nrow(current_season_ci) > 0) {
|
|
current_season_ci$season_id <- nrow(field_harvests) + 1
|
|
current_season_ci$season_start_date <- current_season_start
|
|
current_season_ci$season_end_date <- NA # Unknown - this is what we're predicting
|
|
current_season_ci$days_in_season <- NA
|
|
current_season_ci$days_since_start <- as.numeric(current_season_ci$date - current_season_start)
|
|
current_season_ci$days_until_harvest <- NA
|
|
|
|
seasons_list[[length(seasons_list) + 1]] <- current_season_ci
|
|
}
|
|
}
|
|
|
|
if (length(seasons_list) > 0) {
|
|
return(bind_rows(seasons_list))
|
|
} else {
|
|
return(NULL)
|
|
}
|
|
}
|
|
|
|
# Create segmented data for all fields
|
|
all_seasons <- lapply(fields_with_ci, function(field_name) {
|
|
seasons <- create_seasons(field_name, time_series_daily, harvest_data_filtered)
|
|
if (!is.null(seasons)) {
|
|
seasons$field_id <- field_name
|
|
}
|
|
return(seasons)
|
|
}) %>%
|
|
bind_rows()
|
|
|
|
cat("Created", nrow(all_seasons), "season-segmented observations\n")
|
|
cat("Total seasons:", length(unique(paste(all_seasons$field_id, all_seasons$season_id))), "\n\n")
|
|
|
|
# Summary by season
|
|
season_summary <- all_seasons %>%
|
|
group_by(field_id, season_id) %>%
|
|
summarise(
|
|
season_start = min(season_start_date),
|
|
season_end = max(season_end_date),
|
|
n_observations = n(),
|
|
days_duration = max(days_in_season, na.rm = TRUE),
|
|
max_ci = max(mean_ci, na.rm = TRUE),
|
|
is_current = all(is.na(season_end_date)),
|
|
.groups = "drop"
|
|
)
|
|
|
|
cat("Season summary:\n")
|
|
print(head(season_summary, 20))
|
|
|
|
# ============================================================================
|
|
# STEP 3: GROWTH CURVE ANALYSIS PER SEASON
|
|
# ============================================================================
|
|
|
|
cat("\n\n=== ANALYZING GROWTH CURVES PER SEASON ===\n\n")
|
|
|
|
# Smoothing function (Savitzky-Golay style moving average)
|
|
smooth_ci <- function(ci_values, window = 15) {
|
|
n <- length(ci_values)
|
|
if (n < window) window <- max(3, n)
|
|
|
|
smoothed <- rep(NA, n)
|
|
half_window <- floor(window / 2)
|
|
|
|
for (i in 1:n) {
|
|
start_idx <- max(1, i - half_window)
|
|
end_idx <- min(n, i + half_window)
|
|
smoothed[i] <- mean(ci_values[start_idx:end_idx], na.rm = TRUE)
|
|
}
|
|
|
|
return(smoothed)
|
|
}
|
|
|
|
# Detect peak and senescence
|
|
analyze_season_curve <- function(season_df) {
|
|
if (nrow(season_df) < 20) {
|
|
return(list(
|
|
peak_date = NA,
|
|
peak_ci = NA,
|
|
peak_days_since_start = NA,
|
|
senescence_start_date = NA,
|
|
senescence_rate = NA,
|
|
current_phase = "insufficient_data"
|
|
))
|
|
}
|
|
|
|
# Smooth the curve
|
|
season_df$ci_smooth <- smooth_ci(season_df$mean_ci)
|
|
|
|
# Find peak
|
|
peak_idx <- which.max(season_df$ci_smooth)
|
|
peak_date <- season_df$date[peak_idx]
|
|
peak_ci <- season_df$ci_smooth[peak_idx]
|
|
peak_days <- season_df$days_since_start[peak_idx]
|
|
|
|
# Check if we're past the peak
|
|
last_date <- max(season_df$date)
|
|
is_post_peak <- last_date > peak_date
|
|
|
|
# Calculate senescence rate (slope after peak)
|
|
if (is_post_peak && peak_idx < nrow(season_df) - 5) {
|
|
post_peak_data <- season_df[(peak_idx):nrow(season_df), ]
|
|
|
|
# Fit linear model to post-peak data
|
|
lm_post <- lm(ci_smooth ~ days_since_start, data = post_peak_data)
|
|
senescence_rate <- coef(lm_post)[2] # Slope
|
|
senescence_start <- peak_date
|
|
} else {
|
|
senescence_rate <- NA
|
|
senescence_start <- NA
|
|
}
|
|
|
|
# Determine current phase
|
|
current_ci <- tail(season_df$ci_smooth, 1)
|
|
|
|
if (is.na(current_ci)) {
|
|
current_phase <- "unknown"
|
|
} else if (!is_post_peak) {
|
|
current_phase <- "growing"
|
|
} else if (current_ci > 2.5) {
|
|
current_phase <- "post_peak_maturing"
|
|
} else {
|
|
current_phase <- "declining_harvest_approaching"
|
|
}
|
|
|
|
return(list(
|
|
peak_date = peak_date,
|
|
peak_ci = peak_ci,
|
|
peak_days_since_start = peak_days,
|
|
senescence_start_date = senescence_start,
|
|
senescence_rate = senescence_rate,
|
|
current_phase = current_phase,
|
|
current_ci = current_ci,
|
|
last_obs_date = last_date
|
|
))
|
|
}
|
|
|
|
# Analyze each season
|
|
season_analysis <- all_seasons %>%
|
|
group_by(field_id, season_id) %>%
|
|
group_modify(~ {
|
|
analysis <- analyze_season_curve(.x)
|
|
as.data.frame(analysis)
|
|
}) %>%
|
|
ungroup()
|
|
|
|
# Merge with season summary
|
|
season_results <- season_summary %>%
|
|
left_join(season_analysis, by = c("field_id", "season_id"))
|
|
|
|
cat("Analyzed", nrow(season_results), "seasons\n\n")
|
|
|
|
# ============================================================================
|
|
# STEP 4: HARVEST TIMING PATTERNS (Historical Analysis)
|
|
# ============================================================================
|
|
|
|
cat("=== ANALYZING HISTORICAL HARVEST TIMING PATTERNS ===\n\n")
|
|
|
|
# Look at completed seasons only
|
|
historical_seasons <- season_results %>%
|
|
filter(!is_current) %>%
|
|
mutate(
|
|
days_peak_to_harvest = as.numeric(season_end - peak_date)
|
|
)
|
|
|
|
cat("Historical season statistics (completed harvests):\n\n")
|
|
|
|
cat("Average days from peak to harvest:\n")
|
|
peak_to_harvest_stats <- historical_seasons %>%
|
|
filter(!is.na(days_peak_to_harvest)) %>%
|
|
summarise(
|
|
mean_days = mean(days_peak_to_harvest, na.rm = TRUE),
|
|
median_days = median(days_peak_to_harvest, na.rm = TRUE),
|
|
sd_days = sd(days_peak_to_harvest, na.rm = TRUE),
|
|
min_days = min(days_peak_to_harvest, na.rm = TRUE),
|
|
max_days = max(days_peak_to_harvest, na.rm = TRUE)
|
|
)
|
|
print(peak_to_harvest_stats)
|
|
|
|
cat("\n\nPeak CI at harvest time:\n")
|
|
peak_ci_stats <- historical_seasons %>%
|
|
filter(!is.na(peak_ci)) %>%
|
|
summarise(
|
|
mean_peak_ci = mean(peak_ci, na.rm = TRUE),
|
|
median_peak_ci = median(peak_ci, na.rm = TRUE),
|
|
sd_peak_ci = sd(peak_ci, na.rm = TRUE)
|
|
)
|
|
print(peak_ci_stats)
|
|
|
|
cat("\n\nSenescence rate (CI decline per day after peak):\n")
|
|
senescence_stats <- historical_seasons %>%
|
|
filter(!is.na(senescence_rate), senescence_rate < 0) %>%
|
|
summarise(
|
|
mean_rate = mean(senescence_rate, na.rm = TRUE),
|
|
median_rate = median(senescence_rate, na.rm = TRUE),
|
|
sd_rate = sd(senescence_rate, na.rm = TRUE)
|
|
)
|
|
print(senescence_stats)
|
|
|
|
# ============================================================================
|
|
# STEP 5: CURRENT SEASON PREDICTIONS
|
|
# ============================================================================
|
|
|
|
cat("\n\n=== PREDICTING HARVEST FOR CURRENT ONGOING SEASONS ===\n\n")
|
|
|
|
# Get current seasons
|
|
current_seasons <- season_results %>%
|
|
filter(is_current) %>%
|
|
mutate(
|
|
# Use historical average to predict harvest
|
|
predicted_harvest_date = peak_date + peak_to_harvest_stats$mean_days,
|
|
days_until_predicted_harvest = as.numeric(predicted_harvest_date - last_obs_date),
|
|
weeks_until_predicted_harvest = days_until_predicted_harvest / 7
|
|
)
|
|
|
|
cat("Current ongoing seasons (ready for harvest prediction):\n\n")
|
|
|
|
current_predictions <- current_seasons %>%
|
|
mutate(
|
|
days_since_peak = as.numeric(last_obs_date - peak_date)
|
|
) %>%
|
|
select(
|
|
field_id,
|
|
season_id,
|
|
last_harvest = season_start,
|
|
last_observation = last_obs_date,
|
|
current_ci,
|
|
current_phase,
|
|
peak_date,
|
|
peak_ci,
|
|
days_since_peak,
|
|
predicted_harvest = predicted_harvest_date,
|
|
weeks_until_harvest = weeks_until_predicted_harvest
|
|
) %>%
|
|
arrange(weeks_until_harvest)
|
|
|
|
print(current_predictions)
|
|
|
|
cat("\n\nHarvest readiness assessment:\n\n")
|
|
|
|
harvest_alerts <- current_predictions %>%
|
|
mutate(
|
|
alert = case_when(
|
|
current_ci < 2.5 & current_phase == "declining_harvest_approaching" ~ "🚨 HARVEST IMMINENT (CI < 2.5)",
|
|
current_ci < 3.0 & weeks_until_harvest < 2 ~ "⚠️ HARVEST WITHIN 2 WEEKS",
|
|
weeks_until_harvest < 4 ~ "💡 HARVEST WITHIN 1 MONTH",
|
|
current_phase == "growing" ~ "✅ STILL GROWING",
|
|
TRUE ~ "📊 MONITORING"
|
|
)
|
|
) %>%
|
|
select(field_id, current_ci, current_phase, predicted_harvest, alert)
|
|
|
|
print(harvest_alerts)
|
|
|
|
# ============================================================================
|
|
# STEP 6: VALIDATION OF PREDICTION METHOD
|
|
# ============================================================================
|
|
|
|
cat("\n\n=== VALIDATING PREDICTION METHOD ON HISTORICAL DATA ===\n\n")
|
|
|
|
# For each historical season, predict when harvest would occur using only data up to peak
|
|
validation_results <- historical_seasons %>%
|
|
filter(!is.na(peak_date), !is.na(season_end)) %>%
|
|
mutate(
|
|
predicted_harvest = peak_date + peak_to_harvest_stats$mean_days,
|
|
actual_harvest = season_end,
|
|
prediction_error_days = as.numeric(predicted_harvest - actual_harvest),
|
|
prediction_error_weeks = prediction_error_days / 7
|
|
)
|
|
|
|
cat("Prediction accuracy metrics:\n\n")
|
|
|
|
accuracy_metrics <- validation_results %>%
|
|
summarise(
|
|
n_predictions = n(),
|
|
mean_error_days = mean(abs(prediction_error_days), na.rm = TRUE),
|
|
median_error_days = median(abs(prediction_error_days), na.rm = TRUE),
|
|
rmse_days = sqrt(mean(prediction_error_days^2, na.rm = TRUE)),
|
|
within_2_weeks = sum(abs(prediction_error_weeks) <= 2, na.rm = TRUE),
|
|
pct_within_2_weeks = 100 * sum(abs(prediction_error_weeks) <= 2, na.rm = TRUE) / n()
|
|
)
|
|
|
|
print(accuracy_metrics)
|
|
|
|
cat("\n\nSample predictions vs actual:\n")
|
|
print(validation_results %>%
|
|
select(field_id, season_id, peak_date, predicted_harvest, actual_harvest,
|
|
prediction_error_weeks) %>%
|
|
head(15))
|
|
|
|
# ============================================================================
|
|
# SUMMARY
|
|
# ============================================================================
|
|
|
|
cat("\n\n=== OPERATIONAL HARVEST PREDICTION SUMMARY ===\n\n")
|
|
|
|
cat("METHODOLOGY:\n")
|
|
cat("1. Segment CI time series by harvest dates (each season = planting to harvest)\n")
|
|
cat("2. Smooth CI data to identify peak (maturity point)\n")
|
|
cat("3. Historical pattern: Average", round(peak_to_harvest_stats$mean_days), "days from peak to harvest\n")
|
|
cat("4. Current season prediction: Peak date +", round(peak_to_harvest_stats$mean_days), "days\n\n")
|
|
|
|
cat("PREDICTION ACCURACY (Historical Validation):\n")
|
|
cat(" - Mean absolute error:", round(accuracy_metrics$mean_error_days), "days\n")
|
|
cat(" - RMSE:", round(accuracy_metrics$rmse_days), "days\n")
|
|
cat(" - Accuracy within 2 weeks:", round(accuracy_metrics$pct_within_2_weeks), "%\n\n")
|
|
|
|
cat("HARVEST TRIGGER (Operational Rule):\n")
|
|
cat(" - Primary: CI drops below 2.5 while in declining phase\n")
|
|
cat(" - Secondary: Predicted harvest date approaches (±2 weeks)\n")
|
|
cat(" - Confirmation: Visual inspection when both conditions met\n\n")
|
|
|
|
cat("FIELDS READY FOR HARVEST NOW:\n")
|
|
ready_now <- harvest_alerts %>%
|
|
filter(grepl("IMMINENT|WITHIN 2 WEEKS", alert))
|
|
|
|
if (nrow(ready_now) > 0) {
|
|
print(ready_now)
|
|
} else {
|
|
cat(" No fields at immediate harvest stage\n")
|
|
}
|
|
|
|
cat("\n=== ANALYSIS COMPLETE ===\n")
|