SmartCane/r_app/experiments/harvest_prediction/old/analyze_harvest_methods.R
2026-01-06 14:17:37 +01:00

442 lines
15 KiB
R

# ============================================================================
# HARVEST PREDICTION METHODS ANALYSIS
# Using existing CI data to explore growth curve modeling approaches
# ============================================================================
suppressPackageStartupMessages({
library(readxl)
library(dplyr)
library(tidyr)
library(lubridate)
library(terra)
library(sf)
library(here)
library(ggplot2)
})
# Set project directory
project_dir <- "esa"
assign("project_dir", project_dir, envir = .GlobalEnv)
source(here("r_app", "parameters_project.R"))
# ============================================================================
# STEP 1: LOAD EXISTING CI TIME SERIES DATA
# ============================================================================
cat("=== LOADING CI TIME SERIES DATA ===\n\n")
ci_rds_file <- here("laravel_app/storage/app", project_dir, "Data/extracted_ci/cumulative_vals/All_pivots_Cumulative_CI_quadrant_year_v2.rds")
ci_data_raw <- readRDS(ci_rds_file) %>% ungroup()
cat("Loaded", nrow(ci_data_raw), "CI observations\n")
cat("Fields:", length(unique(ci_data_raw$field)), "\n")
cat("Date range:", min(ci_data_raw$Date, na.rm = TRUE), "to", max(ci_data_raw$Date, na.rm = TRUE), "\n\n")
# Prepare daily time series
time_series_daily <- ci_data_raw %>%
mutate(
date = as.Date(Date),
week = isoweek(date),
year = isoyear(date)
) %>%
select(
field_id = field,
date,
week,
year,
mean_ci = FitData
) %>%
filter(!is.na(mean_ci), !is.na(date), !is.na(field_id)) %>%
arrange(field_id, date)
cat("Processed", nrow(time_series_daily), "daily CI observations\n")
cat("Sample of data:\n")
print(head(time_series_daily, 20))
# ============================================================================
# STEP 2: LOAD ACTUAL HARVEST DATA
# ============================================================================
cat("\n\n=== LOADING HARVEST DATA ===\n\n")
harvest_data <- read_excel('laravel_app/storage/app/esa/Data/harvest.xlsx') %>%
mutate(
season_start = as.Date(season_start),
season_end = as.Date(season_end)
) %>%
filter(!is.na(season_end))
# Get only fields that we have CI data for
fields_with_ci <- unique(time_series_daily$field_id)
harvest_data_filtered <- harvest_data %>%
filter(field %in% fields_with_ci) %>%
mutate(
harvest_week = isoweek(season_end),
harvest_year = isoyear(season_end)
)
cat("Total harvest records:", nrow(harvest_data_filtered), "\n")
cat("Fields with both CI and harvest data:", length(unique(harvest_data_filtered$field)), "\n")
cat("Date range:", min(harvest_data_filtered$season_end), "to", max(harvest_data_filtered$season_end), "\n\n")
# ============================================================================
# STEP 3: GROWTH CURVE MODELING FUNCTIONS
# ============================================================================
cat("=== IMPLEMENTING GROWTH CURVE MODELS ===\n\n")
# ------------------------------------------------------------------------
# 3.1: Current method - Quadratic polynomial (for comparison)
# ------------------------------------------------------------------------
fit_quadratic <- function(dates, ci_values) {
# Convert dates to numeric (days since first observation)
t <- as.numeric(dates - min(dates))
# Fit quadratic: y = a + b*t + c*t^2
model <- tryCatch({
lm(ci_values ~ t + I(t^2))
}, error = function(e) NULL)
if (is.null(model)) {
return(list(fitted = rep(NA, length(dates)), params = c(NA, NA, NA), rsq = NA))
}
fitted_values <- predict(model)
rsq <- summary(model)$r.squared
return(list(
fitted = fitted_values,
params = coef(model),
rsq = rsq,
model_type = "quadratic"
))
}
# ------------------------------------------------------------------------
# 3.2: Logistic curve - S-shaped growth
# ------------------------------------------------------------------------
fit_logistic <- function(dates, ci_values) {
# Convert dates to numeric
t <- as.numeric(dates - min(dates))
# Initial parameter estimates
K_init <- max(ci_values, na.rm = TRUE) # Carrying capacity
r_init <- 0.05 # Growth rate
t0_init <- median(t) # Inflection point
# Logistic function: y = K / (1 + exp(-r*(t-t0)))
logistic_fn <- function(t, K, r, t0) {
K / (1 + exp(-r * (t - t0)))
}
model <- tryCatch({
nls(ci_values ~ K / (1 + exp(-r * (t - t0))),
start = list(K = K_init, r = r_init, t0 = t0_init),
control = nls.control(maxiter = 100, warnOnly = TRUE))
}, error = function(e) NULL)
if (is.null(model)) {
return(list(fitted = rep(NA, length(dates)), params = c(NA, NA, NA), rsq = NA))
}
fitted_values <- predict(model)
rsq <- 1 - sum((ci_values - fitted_values)^2) / sum((ci_values - mean(ci_values))^2)
return(list(
fitted = fitted_values,
params = coef(model),
rsq = rsq,
model_type = "logistic"
))
}
# ------------------------------------------------------------------------
# 3.3: Double Logistic - For multi-phase growth (tillering + grand growth)
# ------------------------------------------------------------------------
fit_double_logistic <- function(dates, ci_values) {
# Convert dates to numeric
t <- as.numeric(dates - min(dates))
# Initial parameters for two growth phases
K1_init <- max(ci_values) * 0.6 # First phase peak
K2_init <- max(ci_values) # Second phase peak
r1_init <- 0.08 # First phase growth rate
r2_init <- 0.05 # Second phase growth rate
t1_init <- quantile(t, 0.25) # First inflection
t2_init <- quantile(t, 0.75) # Second inflection
# Double logistic: y = K1/(1+exp(-r1*(t-t1))) + K2/(1+exp(-r2*(t-t2)))
model <- tryCatch({
nls(ci_values ~ K1 / (1 + exp(-r1 * (t - t1))) + K2 / (1 + exp(-r2 * (t - t2))),
start = list(K1 = K1_init, r1 = r1_init, t1 = t1_init,
K2 = K2_init, r2 = r2_init, t2 = t2_init),
control = nls.control(maxiter = 100, warnOnly = TRUE))
}, error = function(e) NULL)
if (is.null(model)) {
return(list(fitted = rep(NA, length(dates)), params = rep(NA, 6), rsq = NA))
}
fitted_values <- predict(model)
rsq <- 1 - sum((ci_values - fitted_values)^2) / sum((ci_values - mean(ci_values))^2)
return(list(
fitted = fitted_values,
params = coef(model),
rsq = rsq,
model_type = "double_logistic"
))
}
# ------------------------------------------------------------------------
# 3.4: Savitzky-Golay smoothing (TIMESAT approach)
# ------------------------------------------------------------------------
fit_savgol <- function(dates, ci_values, window_length = 21, poly_order = 3) {
# Simple implementation of Savitzky-Golay filter
# For a proper implementation, use signal::sgolayfilt
n <- length(ci_values)
if (n < window_length) {
window_length <- ifelse(n %% 2 == 1, n, n - 1)
}
if (window_length < poly_order + 1) {
return(list(fitted = ci_values, params = NA, rsq = NA, model_type = "savgol"))
}
# Use moving average as simple smoothing for now
# In production, use signal package
half_window <- floor(window_length / 2)
smoothed <- rep(NA, n)
for (i in 1:n) {
start_idx <- max(1, i - half_window)
end_idx <- min(n, i + half_window)
smoothed[i] <- mean(ci_values[start_idx:end_idx], na.rm = TRUE)
}
rsq <- 1 - sum((ci_values - smoothed)^2, na.rm = TRUE) / sum((ci_values - mean(ci_values, na.rm = TRUE))^2, na.rm = TRUE)
return(list(
fitted = smoothed,
params = c(window_length = window_length, poly_order = poly_order),
rsq = rsq,
model_type = "savgol"
))
}
# ------------------------------------------------------------------------
# 3.5: Extract phenological metrics from fitted curve
# ------------------------------------------------------------------------
extract_phenology <- function(dates, fitted_values) {
if (all(is.na(fitted_values))) {
return(list(
sos = NA, pos = NA, eos = NA, los = NA,
amplitude = NA, greenup_rate = NA, senescence_rate = NA
))
}
# Peak of Season (POS)
pos_idx <- which.max(fitted_values)
pos_date <- dates[pos_idx]
pos_value <- fitted_values[pos_idx]
# Baseline (minimum value)
baseline <- min(fitted_values, na.rm = TRUE)
amplitude <- pos_value - baseline
# Start of Season (SOS) - 20% of amplitude above baseline
sos_threshold <- baseline + 0.2 * amplitude
sos_idx <- which(fitted_values >= sos_threshold)[1]
sos_date <- ifelse(!is.na(sos_idx), dates[sos_idx], NA)
# End of Season (EOS) - return to 20% of amplitude after peak
eos_candidates <- which(fitted_values[pos_idx:length(fitted_values)] <= sos_threshold)
eos_idx <- ifelse(length(eos_candidates) > 0, pos_idx + eos_candidates[1] - 1, NA)
eos_date <- ifelse(!is.na(eos_idx), dates[eos_idx], NA)
# Length of Season
los <- ifelse(!is.na(sos_date) && !is.na(eos_date),
as.numeric(as.Date(eos_date, origin = "1970-01-01") - as.Date(sos_date, origin = "1970-01-01")),
NA)
# Rate of green-up (slope from SOS to POS)
if (!is.na(sos_idx) && pos_idx > sos_idx) {
greenup_days <- pos_idx - sos_idx
greenup_rate <- (pos_value - fitted_values[sos_idx]) / greenup_days
} else {
greenup_rate <- NA
}
# Rate of senescence (slope from POS to EOS) - KEY FOR HARVEST PREDICTION
if (!is.na(eos_idx) && eos_idx > pos_idx) {
senescence_days <- eos_idx - pos_idx
senescence_rate <- (fitted_values[eos_idx] - pos_value) / senescence_days
} else {
senescence_rate <- NA
}
return(list(
sos = as.Date(sos_date, origin = "1970-01-01"),
pos = as.Date(pos_date, origin = "1970-01-01"),
eos = as.Date(eos_date, origin = "1970-01-01"),
los = los,
amplitude = amplitude,
greenup_rate = greenup_rate,
senescence_rate = senescence_rate
))
}
cat("Growth curve modeling functions defined:\n")
cat(" - fit_quadratic() - Current polynomial approach\n")
cat(" - fit_logistic() - S-curve for single growth phase\n")
cat(" - fit_double_logistic() - Two-phase growth model\n")
cat(" - fit_savgol() - TIMESAT-style smoothing\n")
cat(" - extract_phenology() - Derive season metrics\n\n")
# ============================================================================
# STEP 4: COMPARE MODELS ON SAMPLE FIELDS
# ============================================================================
cat("=== TESTING MODELS ON SAMPLE FIELDS ===\n\n")
# Select a few fields with good data coverage
sample_fields <- harvest_data_filtered %>%
group_by(field) %>%
summarise(n_harvests = n(), .groups = "drop") %>%
filter(n_harvests >= 3) %>%
slice_head(n = 3) %>%
pull(field)
cat("Sample fields for analysis:", paste(sample_fields, collapse = ", "), "\n\n")
# Analyze each field
comparison_results <- list()
for (field_name in sample_fields) {
cat("\n--- Analyzing field:", field_name, "---\n")
# Get time series for this field
field_ts <- time_series_daily %>%
filter(field_id == field_name) %>%
arrange(date)
if (nrow(field_ts) < 100) {
cat("Insufficient data (", nrow(field_ts), " observations)\n")
next
}
cat("Time series length:", nrow(field_ts), "days\n")
cat("Date range:", min(field_ts$date), "to", max(field_ts$date), "\n")
# Fit all models
dates <- field_ts$date
ci_values <- field_ts$mean_ci
cat("\nFitting models...\n")
quad_fit <- fit_quadratic(dates, ci_values)
cat(" Quadratic R²:", round(quad_fit$rsq, 3), "\n")
logistic_fit <- fit_logistic(dates, ci_values)
cat(" Logistic R²:", round(logistic_fit$rsq, 3), "\n")
double_log_fit <- fit_double_logistic(dates, ci_values)
cat(" Double Logistic R²:", round(double_log_fit$rsq, 3), "\n")
savgol_fit <- fit_savgol(dates, ci_values)
cat(" Savitzky-Golay R²:", round(savgol_fit$rsq, 3), "\n")
# Extract phenology from best-fitting model
best_model <- list(quad_fit, logistic_fit, double_log_fit, savgol_fit)
best_idx <- which.max(sapply(best_model, function(x) ifelse(is.na(x$rsq), -Inf, x$rsq)))
best_fit <- best_model[[best_idx]]
cat("\nBest model:", best_fit$model_type, "\n")
phenology <- extract_phenology(dates, best_fit$fitted)
cat("Phenological metrics:\n")
cat(" Start of Season:", as.character(phenology$sos), "\n")
cat(" Peak of Season:", as.character(phenology$pos), "\n")
cat(" End of Season:", as.character(phenology$eos), "\n")
cat(" Season Length:", phenology$los, "days\n")
cat(" Senescence Rate:", round(phenology$senescence_rate, 4), "CI/day\n")
# Get actual harvests for this field
field_harvests <- harvest_data_filtered %>%
filter(field == field_name) %>%
select(season_end, harvest_year) %>%
arrange(season_end)
cat("\nActual harvest dates:\n")
print(field_harvests)
# Store results
comparison_results[[field_name]] <- list(
field = field_name,
n_obs = nrow(field_ts),
models = list(
quadratic = quad_fit,
logistic = logistic_fit,
double_logistic = double_log_fit,
savgol = savgol_fit
),
best_model = best_fit$model_type,
phenology = phenology,
actual_harvests = field_harvests
)
}
# ============================================================================
# STEP 5: SUMMARY AND RECOMMENDATIONS
# ============================================================================
cat("\n\n=== ANALYSIS SUMMARY ===\n\n")
cat("Growth Curve Model Performance:\n\n")
# Calculate average R² for each model type
all_rsq <- data.frame(
field = character(),
quadratic = numeric(),
logistic = numeric(),
double_logistic = numeric(),
savgol = numeric()
)
for (field_name in names(comparison_results)) {
result <- comparison_results[[field_name]]
all_rsq <- rbind(all_rsq, data.frame(
field = field_name,
quadratic = result$models$quadratic$rsq,
logistic = result$models$logistic$rsq,
double_logistic = result$models$double_logistic$rsq,
savgol = result$models$savgol$rsq
))
}
cat("Average R² by model:\n")
print(colMeans(all_rsq[, -1], na.rm = TRUE))
cat("\n\nKEY FINDINGS:\n\n")
cat("1. MODEL SELECTION:\n")
cat(" - Compare R² values above to determine best-fitting model type\n")
cat(" - Logistic/Double Logistic better capture biological growth patterns\n")
cat(" - Savitzky-Golay provides smooth curves without parametric assumptions\n\n")
cat("2. HARVEST PREDICTION STRATEGY:\n")
cat(" - Use phenological metrics (especially senescence rate)\n")
cat(" - Peak of Season (POS) timing correlates with harvest window\n")
cat(" - Rapid senescence after POS may indicate approaching harvest\n")
cat(" - For sugarcane: harvest often occurs DURING maturation, not after senescence\n\n")
cat("3. NEXT STEPS:\n")
cat(" a) Implement the best-performing model across all fields\n")
cat(" b) Correlate POS dates with actual harvest dates\n")
cat(" c) Build prediction model: harvest_date = f(POS, senescence_rate, field_age)\n")
cat(" d) Test predictive accuracy (weeks ahead of harvest)\n")
cat(" e) Consider multiple indices (NDVI, NDRE, EVI) if available\n\n")
cat("=== ANALYSIS COMPLETE ===\n")