SmartCane/predict_harvest_operational.R
2026-01-06 14:17:37 +01:00

448 lines
14 KiB
R

# ============================================================================
# OPERATIONAL HARVEST PREDICTION
# Analyze current season growth curves to predict harvest timing
# ============================================================================
suppressPackageStartupMessages({
library(readxl)
library(dplyr)
library(tidyr)
library(lubridate)
library(terra)
library(sf)
library(here)
library(ggplot2)
})
# Set project directory
project_dir <- "esa"
assign("project_dir", project_dir, envir = .GlobalEnv)
source(here("r_app", "parameters_project.R"))
# ============================================================================
# STEP 1: LOAD DATA
# ============================================================================
cat("=== LOADING DATA ===\n\n")
# Load CI time series
ci_rds_file <- here("laravel_app/storage/app", project_dir, "Data/extracted_ci/cumulative_vals/All_pivots_Cumulative_CI_quadrant_year_v2.rds")
ci_data_raw <- readRDS(ci_rds_file) %>% ungroup()
time_series_daily <- ci_data_raw %>%
mutate(
date = as.Date(Date),
week = isoweek(date),
year = isoyear(date)
) %>%
select(
field_id = field,
date,
week,
year,
mean_ci = FitData
) %>%
filter(!is.na(mean_ci), !is.na(date), !is.na(field_id)) %>%
arrange(field_id, date)
# Load harvest data
harvest_data <- read_excel('laravel_app/storage/app/esa/Data/harvest.xlsx') %>%
mutate(
season_start = as.Date(season_start),
season_end = as.Date(season_end)
) %>%
filter(!is.na(season_end))
fields_with_ci <- unique(time_series_daily$field_id)
harvest_data_filtered <- harvest_data %>%
filter(field %in% fields_with_ci) %>%
arrange(field, season_end)
cat("Loaded CI data for", length(fields_with_ci), "fields\n")
cat("Loaded harvest data for", length(unique(harvest_data_filtered$field)), "fields\n\n")
# ============================================================================
# STEP 2: SEGMENT TIME SERIES BY SEASON
# ============================================================================
cat("=== SEGMENTING TIME SERIES INTO INDIVIDUAL SEASONS ===\n\n")
# For each field, create seasons based on harvest dates
# Season starts day after previous harvest, ends at next harvest
create_seasons <- function(field_name, ci_ts, harvest_df) {
# Get CI data for this field
field_ci <- ci_ts %>%
filter(field_id == field_name) %>%
arrange(date)
# Get harvest dates for this field
field_harvests <- harvest_df %>%
filter(field == field_name) %>%
arrange(season_end) %>%
mutate(season_id = row_number())
if (nrow(field_harvests) == 0) {
return(NULL)
}
# Create season segments
seasons_list <- list()
for (i in 1:nrow(field_harvests)) {
# Season start: day after previous harvest (or start of data if first season)
if (i == 1) {
season_start <- min(field_ci$date)
} else {
season_start <- field_harvests$season_end[i-1] + 1
}
# Season end: current harvest date
season_end <- field_harvests$season_end[i]
# Extract CI data for this season
season_ci <- field_ci %>%
filter(date >= season_start, date <= season_end)
if (nrow(season_ci) > 0) {
season_ci$season_id <- i
season_ci$season_start_date <- season_start
season_ci$season_end_date <- season_end
season_ci$days_in_season <- as.numeric(season_end - season_start)
season_ci$days_since_start <- as.numeric(season_ci$date - season_start)
season_ci$days_until_harvest <- as.numeric(season_end - season_ci$date)
seasons_list[[i]] <- season_ci
}
}
# Add current ongoing season (after last harvest)
if (nrow(field_harvests) > 0) {
last_harvest <- field_harvests$season_end[nrow(field_harvests)]
current_season_start <- last_harvest + 1
current_season_ci <- field_ci %>%
filter(date >= current_season_start)
if (nrow(current_season_ci) > 0) {
current_season_ci$season_id <- nrow(field_harvests) + 1
current_season_ci$season_start_date <- current_season_start
current_season_ci$season_end_date <- NA # Unknown - this is what we're predicting
current_season_ci$days_in_season <- NA
current_season_ci$days_since_start <- as.numeric(current_season_ci$date - current_season_start)
current_season_ci$days_until_harvest <- NA
seasons_list[[length(seasons_list) + 1]] <- current_season_ci
}
}
if (length(seasons_list) > 0) {
return(bind_rows(seasons_list))
} else {
return(NULL)
}
}
# Create segmented data for all fields
all_seasons <- lapply(fields_with_ci, function(field_name) {
seasons <- create_seasons(field_name, time_series_daily, harvest_data_filtered)
if (!is.null(seasons)) {
seasons$field_id <- field_name
}
return(seasons)
}) %>%
bind_rows()
cat("Created", nrow(all_seasons), "season-segmented observations\n")
cat("Total seasons:", length(unique(paste(all_seasons$field_id, all_seasons$season_id))), "\n\n")
# Summary by season
season_summary <- all_seasons %>%
group_by(field_id, season_id) %>%
summarise(
season_start = min(season_start_date),
season_end = max(season_end_date),
n_observations = n(),
days_duration = max(days_in_season, na.rm = TRUE),
max_ci = max(mean_ci, na.rm = TRUE),
is_current = all(is.na(season_end_date)),
.groups = "drop"
)
cat("Season summary:\n")
print(head(season_summary, 20))
# ============================================================================
# STEP 3: GROWTH CURVE ANALYSIS PER SEASON
# ============================================================================
cat("\n\n=== ANALYZING GROWTH CURVES PER SEASON ===\n\n")
# Smoothing function (Savitzky-Golay style moving average)
smooth_ci <- function(ci_values, window = 15) {
n <- length(ci_values)
if (n < window) window <- max(3, n)
smoothed <- rep(NA, n)
half_window <- floor(window / 2)
for (i in 1:n) {
start_idx <- max(1, i - half_window)
end_idx <- min(n, i + half_window)
smoothed[i] <- mean(ci_values[start_idx:end_idx], na.rm = TRUE)
}
return(smoothed)
}
# Detect peak and senescence
analyze_season_curve <- function(season_df) {
if (nrow(season_df) < 20) {
return(list(
peak_date = NA,
peak_ci = NA,
peak_days_since_start = NA,
senescence_start_date = NA,
senescence_rate = NA,
current_phase = "insufficient_data"
))
}
# Smooth the curve
season_df$ci_smooth <- smooth_ci(season_df$mean_ci)
# Find peak
peak_idx <- which.max(season_df$ci_smooth)
peak_date <- season_df$date[peak_idx]
peak_ci <- season_df$ci_smooth[peak_idx]
peak_days <- season_df$days_since_start[peak_idx]
# Check if we're past the peak
last_date <- max(season_df$date)
is_post_peak <- last_date > peak_date
# Calculate senescence rate (slope after peak)
if (is_post_peak && peak_idx < nrow(season_df) - 5) {
post_peak_data <- season_df[(peak_idx):nrow(season_df), ]
# Fit linear model to post-peak data
lm_post <- lm(ci_smooth ~ days_since_start, data = post_peak_data)
senescence_rate <- coef(lm_post)[2] # Slope
senescence_start <- peak_date
} else {
senescence_rate <- NA
senescence_start <- NA
}
# Determine current phase
current_ci <- tail(season_df$ci_smooth, 1)
if (is.na(current_ci)) {
current_phase <- "unknown"
} else if (!is_post_peak) {
current_phase <- "growing"
} else if (current_ci > 2.5) {
current_phase <- "post_peak_maturing"
} else {
current_phase <- "declining_harvest_approaching"
}
return(list(
peak_date = peak_date,
peak_ci = peak_ci,
peak_days_since_start = peak_days,
senescence_start_date = senescence_start,
senescence_rate = senescence_rate,
current_phase = current_phase,
current_ci = current_ci,
last_obs_date = last_date
))
}
# Analyze each season
season_analysis <- all_seasons %>%
group_by(field_id, season_id) %>%
group_modify(~ {
analysis <- analyze_season_curve(.x)
as.data.frame(analysis)
}) %>%
ungroup()
# Merge with season summary
season_results <- season_summary %>%
left_join(season_analysis, by = c("field_id", "season_id"))
cat("Analyzed", nrow(season_results), "seasons\n\n")
# ============================================================================
# STEP 4: HARVEST TIMING PATTERNS (Historical Analysis)
# ============================================================================
cat("=== ANALYZING HISTORICAL HARVEST TIMING PATTERNS ===\n\n")
# Look at completed seasons only
historical_seasons <- season_results %>%
filter(!is_current) %>%
mutate(
days_peak_to_harvest = as.numeric(season_end - peak_date)
)
cat("Historical season statistics (completed harvests):\n\n")
cat("Average days from peak to harvest:\n")
peak_to_harvest_stats <- historical_seasons %>%
filter(!is.na(days_peak_to_harvest)) %>%
summarise(
mean_days = mean(days_peak_to_harvest, na.rm = TRUE),
median_days = median(days_peak_to_harvest, na.rm = TRUE),
sd_days = sd(days_peak_to_harvest, na.rm = TRUE),
min_days = min(days_peak_to_harvest, na.rm = TRUE),
max_days = max(days_peak_to_harvest, na.rm = TRUE)
)
print(peak_to_harvest_stats)
cat("\n\nPeak CI at harvest time:\n")
peak_ci_stats <- historical_seasons %>%
filter(!is.na(peak_ci)) %>%
summarise(
mean_peak_ci = mean(peak_ci, na.rm = TRUE),
median_peak_ci = median(peak_ci, na.rm = TRUE),
sd_peak_ci = sd(peak_ci, na.rm = TRUE)
)
print(peak_ci_stats)
cat("\n\nSenescence rate (CI decline per day after peak):\n")
senescence_stats <- historical_seasons %>%
filter(!is.na(senescence_rate), senescence_rate < 0) %>%
summarise(
mean_rate = mean(senescence_rate, na.rm = TRUE),
median_rate = median(senescence_rate, na.rm = TRUE),
sd_rate = sd(senescence_rate, na.rm = TRUE)
)
print(senescence_stats)
# ============================================================================
# STEP 5: CURRENT SEASON PREDICTIONS
# ============================================================================
cat("\n\n=== PREDICTING HARVEST FOR CURRENT ONGOING SEASONS ===\n\n")
# Get current seasons
current_seasons <- season_results %>%
filter(is_current) %>%
mutate(
# Use historical average to predict harvest
predicted_harvest_date = peak_date + peak_to_harvest_stats$mean_days,
days_until_predicted_harvest = as.numeric(predicted_harvest_date - last_obs_date),
weeks_until_predicted_harvest = days_until_predicted_harvest / 7
)
cat("Current ongoing seasons (ready for harvest prediction):\n\n")
current_predictions <- current_seasons %>%
mutate(
days_since_peak = as.numeric(last_obs_date - peak_date)
) %>%
select(
field_id,
season_id,
last_harvest = season_start,
last_observation = last_obs_date,
current_ci,
current_phase,
peak_date,
peak_ci,
days_since_peak,
predicted_harvest = predicted_harvest_date,
weeks_until_harvest = weeks_until_predicted_harvest
) %>%
arrange(weeks_until_harvest)
print(current_predictions)
cat("\n\nHarvest readiness assessment:\n\n")
harvest_alerts <- current_predictions %>%
mutate(
alert = case_when(
current_ci < 2.5 & current_phase == "declining_harvest_approaching" ~ "🚨 HARVEST IMMINENT (CI < 2.5)",
current_ci < 3.0 & weeks_until_harvest < 2 ~ "⚠️ HARVEST WITHIN 2 WEEKS",
weeks_until_harvest < 4 ~ "💡 HARVEST WITHIN 1 MONTH",
current_phase == "growing" ~ "✅ STILL GROWING",
TRUE ~ "📊 MONITORING"
)
) %>%
select(field_id, current_ci, current_phase, predicted_harvest, alert)
print(harvest_alerts)
# ============================================================================
# STEP 6: VALIDATION OF PREDICTION METHOD
# ============================================================================
cat("\n\n=== VALIDATING PREDICTION METHOD ON HISTORICAL DATA ===\n\n")
# For each historical season, predict when harvest would occur using only data up to peak
validation_results <- historical_seasons %>%
filter(!is.na(peak_date), !is.na(season_end)) %>%
mutate(
predicted_harvest = peak_date + peak_to_harvest_stats$mean_days,
actual_harvest = season_end,
prediction_error_days = as.numeric(predicted_harvest - actual_harvest),
prediction_error_weeks = prediction_error_days / 7
)
cat("Prediction accuracy metrics:\n\n")
accuracy_metrics <- validation_results %>%
summarise(
n_predictions = n(),
mean_error_days = mean(abs(prediction_error_days), na.rm = TRUE),
median_error_days = median(abs(prediction_error_days), na.rm = TRUE),
rmse_days = sqrt(mean(prediction_error_days^2, na.rm = TRUE)),
within_2_weeks = sum(abs(prediction_error_weeks) <= 2, na.rm = TRUE),
pct_within_2_weeks = 100 * sum(abs(prediction_error_weeks) <= 2, na.rm = TRUE) / n()
)
print(accuracy_metrics)
cat("\n\nSample predictions vs actual:\n")
print(validation_results %>%
select(field_id, season_id, peak_date, predicted_harvest, actual_harvest,
prediction_error_weeks) %>%
head(15))
# ============================================================================
# SUMMARY
# ============================================================================
cat("\n\n=== OPERATIONAL HARVEST PREDICTION SUMMARY ===\n\n")
cat("METHODOLOGY:\n")
cat("1. Segment CI time series by harvest dates (each season = planting to harvest)\n")
cat("2. Smooth CI data to identify peak (maturity point)\n")
cat("3. Historical pattern: Average", round(peak_to_harvest_stats$mean_days), "days from peak to harvest\n")
cat("4. Current season prediction: Peak date +", round(peak_to_harvest_stats$mean_days), "days\n\n")
cat("PREDICTION ACCURACY (Historical Validation):\n")
cat(" - Mean absolute error:", round(accuracy_metrics$mean_error_days), "days\n")
cat(" - RMSE:", round(accuracy_metrics$rmse_days), "days\n")
cat(" - Accuracy within 2 weeks:", round(accuracy_metrics$pct_within_2_weeks), "%\n\n")
cat("HARVEST TRIGGER (Operational Rule):\n")
cat(" - Primary: CI drops below 2.5 while in declining phase\n")
cat(" - Secondary: Predicted harvest date approaches (±2 weeks)\n")
cat(" - Confirmation: Visual inspection when both conditions met\n\n")
cat("FIELDS READY FOR HARVEST NOW:\n")
ready_now <- harvest_alerts %>%
filter(grepl("IMMINENT|WITHIN 2 WEEKS", alert))
if (nrow(ready_now) > 0) {
print(ready_now)
} else {
cat(" No fields at immediate harvest stage\n")
}
cat("\n=== ANALYSIS COMPLETE ===\n")