442 lines
15 KiB
R
442 lines
15 KiB
R
# ============================================================================
|
|
# HARVEST PREDICTION METHODS ANALYSIS
|
|
# Using existing CI data to explore growth curve modeling approaches
|
|
# ============================================================================
|
|
|
|
suppressPackageStartupMessages({
|
|
library(readxl)
|
|
library(dplyr)
|
|
library(tidyr)
|
|
library(lubridate)
|
|
library(terra)
|
|
library(sf)
|
|
library(here)
|
|
library(ggplot2)
|
|
})
|
|
|
|
# Set project directory
project_dir <- "esa"

# NOTE(review): this `assign()` is redundant when the script runs at top level
# (the `<-` above already binds in the global environment). It only matters if
# the script is sourced with local = TRUE — confirm before removing.
assign("project_dir", project_dir, envir = .GlobalEnv)

# Load project-wide parameters; here() resolves relative to the project root,
# and parameters_project.R presumably reads `project_dir` — verify.
source(here("r_app", "parameters_project.R"))
|
|
|
|
# ============================================================================
|
|
# STEP 1: LOAD EXISTING CI TIME SERIES DATA
|
|
# ============================================================================
|
|
|
|
cat("=== LOADING CI TIME SERIES DATA ===\n\n")

# Pre-extracted cumulative CI values (pivot-level, quadrant/year, v2)
ci_rds_file <- here(
  "laravel_app/storage/app", project_dir,
  "Data/extracted_ci/cumulative_vals/All_pivots_Cumulative_CI_quadrant_year_v2.rds"
)

ci_data_raw <- readRDS(ci_rds_file) %>% ungroup()

cat("Loaded", nrow(ci_data_raw), "CI observations\n")
cat("Fields:", length(unique(ci_data_raw$field)), "\n")
# format() so Date values print as "YYYY-MM-DD"; cat() would otherwise print
# them as raw day counts (harmless if Date is stored as character)
cat("Date range:",
    format(min(ci_data_raw$Date, na.rm = TRUE)), "to",
    format(max(ci_data_raw$Date, na.rm = TRUE)), "\n\n")

# Prepare daily time series: one row per field per day with the fitted CI value
time_series_daily <- ci_data_raw %>%
  mutate(
    date = as.Date(Date),
    week = isoweek(date),
    year = isoyear(date)  # ISO year pairs correctly with isoweek() at year breaks
  ) %>%
  select(
    field_id = field,
    date,
    week,
    year,
    mean_ci = FitData
  ) %>%
  filter(!is.na(mean_ci), !is.na(date), !is.na(field_id)) %>%
  arrange(field_id, date)

cat("Processed", nrow(time_series_daily), "daily CI observations\n")
cat("Sample of data:\n")
print(head(time_series_daily, 20))
|
|
|
|
# ============================================================================
|
|
# STEP 2: LOAD ACTUAL HARVEST DATA
|
|
# ============================================================================
|
|
|
|
cat("\n\n=== LOADING HARVEST DATA ===\n\n")

# Use here() + project_dir, consistent with the CI data path above (the
# original hard-coded a working-directory-relative path here)
harvest_file <- here("laravel_app/storage/app", project_dir, "Data/harvest.xlsx")

harvest_data <- read_excel(harvest_file) %>%
  mutate(
    season_start = as.Date(season_start),
    season_end = as.Date(season_end)
  ) %>%
  filter(!is.na(season_end))  # season_end is the harvest date; drop open seasons

# Keep only fields for which we also have CI observations
fields_with_ci <- unique(time_series_daily$field_id)
harvest_data_filtered <- harvest_data %>%
  filter(field %in% fields_with_ci) %>%
  mutate(
    harvest_week = isoweek(season_end),
    harvest_year = isoyear(season_end)
  )

cat("Total harvest records:", nrow(harvest_data_filtered), "\n")
cat("Fields with both CI and harvest data:", length(unique(harvest_data_filtered$field)), "\n")
# format() so the Date range prints as "YYYY-MM-DD" rather than day counts
cat("Date range:",
    format(min(harvest_data_filtered$season_end)), "to",
    format(max(harvest_data_filtered$season_end)), "\n\n")
|
|
|
|
# ============================================================================
|
|
# STEP 3: GROWTH CURVE MODELING FUNCTIONS
|
|
# ============================================================================
|
|
|
|
cat("=== IMPLEMENTING GROWTH CURVE MODELS ===\n\n")
|
|
|
|
# ------------------------------------------------------------------------
# 3.1: Current method - Quadratic polynomial (for comparison)
# ------------------------------------------------------------------------

# Fit a quadratic polynomial y = a + b*t + c*t^2 to a CI time series.
#
# dates:     Date vector of observation dates.
# ci_values: numeric vector of CI values, same length as `dates`.
#
# Returns a list with `fitted` (predicted values), `params` (coefficients
# a, b, c), `rsq` (R-squared), and `model_type` ("quadratic"). On failure
# the same structure is returned with NA values — including `model_type`,
# which the original omitted on the failure path.
fit_quadratic <- function(dates, ci_values) {
  # Time axis: days since the first observation
  t <- as.numeric(dates - min(dates))

  model <- tryCatch(
    lm(ci_values ~ t + I(t^2)),
    error = function(e) NULL
  )

  if (is.null(model)) {
    return(list(
      fitted = rep(NA_real_, length(dates)),
      params = c(NA_real_, NA_real_, NA_real_),
      rsq = NA_real_,
      model_type = "quadratic"
    ))
  }

  list(
    fitted = predict(model),
    params = coef(model),
    rsq = summary(model)$r.squared,
    model_type = "quadratic"
  )
}
|
|
|
|
# ------------------------------------------------------------------------
# 3.2: Logistic curve - S-shaped growth
# ------------------------------------------------------------------------

# Fit a single logistic (S-shaped) growth curve to a CI time series:
#   y = K / (1 + exp(-r * (t - t0)))
#
# dates:     Date vector of observation dates.
# ci_values: numeric vector of CI values, same length as `dates`.
#
# Returns a list with `fitted`, `params` (K, r, t0), `rsq` (pseudo-R^2),
# and `model_type` ("logistic"); all-NA values with the same structure if
# nls() fails to converge. (The unused inner `logistic_fn` helper from the
# original was removed, and `model_type` is now set on the failure path too.)
fit_logistic <- function(dates, ci_values) {
  t <- as.numeric(dates - min(dates))

  # Starting values: K near the observed maximum, a moderate growth rate,
  # inflection at the midpoint of the observation window
  K_init <- max(ci_values, na.rm = TRUE)
  r_init <- 0.05
  t0_init <- median(t)

  model <- tryCatch(
    nls(ci_values ~ K / (1 + exp(-r * (t - t0))),
        start = list(K = K_init, r = r_init, t0 = t0_init),
        control = nls.control(maxiter = 100, warnOnly = TRUE)),
    error = function(e) NULL
  )

  if (is.null(model)) {
    return(list(
      fitted = rep(NA_real_, length(dates)),
      params = c(NA_real_, NA_real_, NA_real_),
      rsq = NA_real_,
      model_type = "logistic"
    ))
  }

  fitted_values <- predict(model)
  # Pseudo-R^2 for the nonlinear fit (1 - RSS/TSS)
  rsq <- 1 - sum((ci_values - fitted_values)^2) /
    sum((ci_values - mean(ci_values))^2)

  list(
    fitted = fitted_values,
    params = coef(model),
    rsq = rsq,
    model_type = "logistic"
  )
}
|
|
|
|
# ------------------------------------------------------------------------
# 3.3: Double Logistic - For multi-phase growth (tillering + grand growth)
# ------------------------------------------------------------------------

# Fit a double logistic curve — the sum of two S-curves — for crops with
# two growth phases:
#   y = K1/(1+exp(-r1*(t-t1))) + K2/(1+exp(-r2*(t-t2)))
#
# dates:     Date vector of observation dates.
# ci_values: numeric vector of CI values, same length as `dates`.
#
# Returns a list with `fitted`, `params` (K1, r1, t1, K2, r2, t2), `rsq`,
# and `model_type` ("double_logistic"); all-NA values with the same
# structure if nls() fails. Fixes vs original: `model_type` is set on the
# failure path, `na.rm = TRUE` on max() (consistent with fit_logistic),
# and quantile() names are stripped from the start values.
fit_double_logistic <- function(dates, ci_values) {
  t <- as.numeric(dates - min(dates))

  # Starting values for the two phases; unname() drops the "25%"/"75%"
  # names that quantile() attaches
  K1_init <- max(ci_values, na.rm = TRUE) * 0.6  # first phase peak
  K2_init <- max(ci_values, na.rm = TRUE)        # second phase peak
  r1_init <- 0.08                                # first phase growth rate
  r2_init <- 0.05                                # second phase growth rate
  t1_init <- unname(quantile(t, 0.25))           # first inflection
  t2_init <- unname(quantile(t, 0.75))           # second inflection

  model <- tryCatch(
    nls(ci_values ~ K1 / (1 + exp(-r1 * (t - t1))) + K2 / (1 + exp(-r2 * (t - t2))),
        start = list(K1 = K1_init, r1 = r1_init, t1 = t1_init,
                     K2 = K2_init, r2 = r2_init, t2 = t2_init),
        control = nls.control(maxiter = 100, warnOnly = TRUE)),
    error = function(e) NULL
  )

  if (is.null(model)) {
    return(list(
      fitted = rep(NA_real_, length(dates)),
      params = rep(NA_real_, 6),
      rsq = NA_real_,
      model_type = "double_logistic"
    ))
  }

  fitted_values <- predict(model)
  # Pseudo-R^2 for the nonlinear fit (1 - RSS/TSS)
  rsq <- 1 - sum((ci_values - fitted_values)^2) /
    sum((ci_values - mean(ci_values))^2)

  list(
    fitted = fitted_values,
    params = coef(model),
    rsq = rsq,
    model_type = "double_logistic"
  )
}
|
|
|
|
# ------------------------------------------------------------------------
# 3.4: Savitzky-Golay smoothing (TIMESAT approach)
# ------------------------------------------------------------------------

# Smooth a CI time series with a centred moving average. This is a simple
# stand-in for a true Savitzky-Golay filter — in production use
# signal::sgolayfilt. `dates` is accepted only for interface parity with
# the other fit_* functions and is not used by the smoother.
#
# dates:         Date vector (unused; see above).
# ci_values:     numeric vector of CI values.
# window_length: odd window size; shrunk to fit short series.
# poly_order:    nominal polynomial order (only used for the feasibility
#                check window_length >= poly_order + 1).
#
# Returns a list with `fitted` (smoothed values), `params` (the window and
# order used), `rsq`, and `model_type` ("savgol"). If the window cannot
# satisfy the order, the input is returned unsmoothed with NA params/rsq.
fit_savgol <- function(dates, ci_values, window_length = 21, poly_order = 3) {
  n <- length(ci_values)
  if (n == 0) {
    # Nothing to smooth; keep the same result shape
    return(list(fitted = ci_values, params = NA, rsq = NA, model_type = "savgol"))
  }
  if (n < window_length) {
    # Shrink to the largest odd window that fits the series
    window_length <- if (n %% 2 == 1) n else n - 1
  }
  if (window_length < poly_order + 1) {
    return(list(fitted = ci_values, params = NA, rsq = NA, model_type = "savgol"))
  }

  # Centred moving average; the window is truncated at the series edges
  half_window <- floor(window_length / 2)
  smoothed <- vapply(seq_len(n), function(i) {
    lo <- max(1, i - half_window)
    hi <- min(n, i + half_window)
    mean(ci_values[lo:hi], na.rm = TRUE)
  }, numeric(1))

  rsq <- 1 - sum((ci_values - smoothed)^2, na.rm = TRUE) /
    sum((ci_values - mean(ci_values, na.rm = TRUE))^2, na.rm = TRUE)

  list(
    fitted = smoothed,
    params = c(window_length = window_length, poly_order = poly_order),
    rsq = rsq,
    model_type = "savgol"
  )
}
|
|
|
|
# ------------------------------------------------------------------------
# 3.5: Extract phenological metrics from fitted curve
# ------------------------------------------------------------------------

# Derive season metrics from a fitted CI curve.
#
# dates:         Date vector of observation dates.
# fitted_values: numeric vector of fitted CI values, same length as `dates`.
#
# Returns a list:
#   sos, pos, eos     - start / peak / end of season as Date (NA Date if
#                       not determinable)
#   los               - length of season in days
#   amplitude         - peak value minus baseline (minimum) value
#   greenup_rate      - mean slope from SOS to POS (CI per observation step)
#   senescence_rate   - mean slope from POS to EOS (negative); key input
#                       for harvest prediction
#
# Fix vs original: the original used scalar ifelse() on Date values, which
# strips the Date class and forced as.Date(..., origin=) round-trips; plain
# if/else preserves Dates throughout. The rates divide by index differences,
# which equals CI/day only if observations are daily — TODO confirm.
extract_phenology <- function(dates, fitted_values) {
  if (all(is.na(fitted_values))) {
    return(list(
      sos = as.Date(NA), pos = as.Date(NA), eos = as.Date(NA), los = NA,
      amplitude = NA, greenup_rate = NA, senescence_rate = NA
    ))
  }

  # Peak of Season (POS)
  pos_idx <- which.max(fitted_values)
  pos_date <- dates[pos_idx]
  pos_value <- fitted_values[pos_idx]

  # Baseline (minimum value) and seasonal amplitude
  baseline <- min(fitted_values, na.rm = TRUE)
  amplitude <- pos_value - baseline

  # Start of Season (SOS): first crossing of 20% of amplitude above baseline
  sos_threshold <- baseline + 0.2 * amplitude
  sos_idx <- which(fitted_values >= sos_threshold)[1]
  sos_date <- if (!is.na(sos_idx)) dates[sos_idx] else as.Date(NA)

  # End of Season (EOS): first return to the SOS threshold after the peak
  eos_candidates <- which(fitted_values[pos_idx:length(fitted_values)] <= sos_threshold)
  eos_idx <- if (length(eos_candidates) > 0) pos_idx + eos_candidates[1] - 1 else NA_integer_
  eos_date <- if (!is.na(eos_idx)) dates[eos_idx] else as.Date(NA)

  # Length of Season in days
  los <- if (!is.na(sos_date) && !is.na(eos_date)) {
    as.numeric(eos_date - sos_date)
  } else {
    NA
  }

  # Rate of green-up (mean slope from SOS to POS)
  greenup_rate <- if (!is.na(sos_idx) && pos_idx > sos_idx) {
    (pos_value - fitted_values[sos_idx]) / (pos_idx - sos_idx)
  } else {
    NA
  }

  # Rate of senescence (mean slope from POS to EOS) - KEY FOR HARVEST PREDICTION
  senescence_rate <- if (!is.na(eos_idx) && eos_idx > pos_idx) {
    (fitted_values[eos_idx] - pos_value) / (eos_idx - pos_idx)
  } else {
    NA
  }

  list(
    sos = sos_date,
    pos = pos_date,
    eos = eos_date,
    los = los,
    amplitude = amplitude,
    greenup_rate = greenup_rate,
    senescence_rate = senescence_rate
  )
}
|
|
|
|
# Console summary of the modeling toolkit defined in Step 3 (output only;
# no computation happens here)
cat("Growth curve modeling functions defined:\n")
cat(" - fit_quadratic() - Current polynomial approach\n")
cat(" - fit_logistic() - S-curve for single growth phase\n")
cat(" - fit_double_logistic() - Two-phase growth model\n")
cat(" - fit_savgol() - TIMESAT-style smoothing\n")
cat(" - extract_phenology() - Derive season metrics\n\n")
|
|
|
|
# ============================================================================
# STEP 4: COMPARE MODELS ON SAMPLE FIELDS
# ============================================================================

cat("=== TESTING MODELS ON SAMPLE FIELDS ===\n\n")

# Pick up to three fields that have at least three recorded harvests,
# so each test field offers several seasons of ground truth.
# count() is the documented shorthand for group_by() + summarise(n()).
sample_fields <- harvest_data_filtered %>%
  count(field, name = "n_harvests") %>%
  filter(n_harvests >= 3) %>%
  slice_head(n = 3) %>%
  pull(field)

cat("Sample fields for analysis:", paste(sample_fields, collapse = ", "), "\n\n")
|
|
|
|
# Analyze each sample field: fit all candidate models, select the best by
# R^2, extract phenology from the winning curve, and store the results
# alongside the field's actual harvest dates.
comparison_results <- list()

for (field_name in sample_fields) {
  cat("\n--- Analyzing field:", field_name, "---\n")

  # Time series for this field, in chronological order
  field_ts <- time_series_daily %>%
    filter(field_id == field_name) %>%
    arrange(date)

  # Require a reasonable series length before fitting growth curves
  if (nrow(field_ts) < 100) {
    cat("Insufficient data (", nrow(field_ts), " observations)\n")
    next
  }

  cat("Time series length:", nrow(field_ts), "days\n")
  # format() so Dates print as "YYYY-MM-DD"; cat() would print day counts
  cat("Date range:", format(min(field_ts$date)), "to", format(max(field_ts$date)), "\n")

  dates <- field_ts$date
  ci_values <- field_ts$mean_ci

  cat("\nFitting models...\n")

  quad_fit <- fit_quadratic(dates, ci_values)
  cat(" Quadratic R²:", round(quad_fit$rsq, 3), "\n")

  logistic_fit <- fit_logistic(dates, ci_values)
  cat(" Logistic R²:", round(logistic_fit$rsq, 3), "\n")

  double_log_fit <- fit_double_logistic(dates, ci_values)
  cat(" Double Logistic R²:", round(double_log_fit$rsq, 3), "\n")

  savgol_fit <- fit_savgol(dates, ci_values)
  cat(" Savitzky-Golay R²:", round(savgol_fit$rsq, 3), "\n")

  # Select the best fit by R^2; NA is treated as -Inf so a failed fit never
  # wins (vapply + if/else replace the original sapply + scalar ifelse)
  candidate_fits <- list(quad_fit, logistic_fit, double_log_fit, savgol_fit)
  rsq_values <- vapply(
    candidate_fits,
    function(x) if (is.na(x$rsq)) -Inf else x$rsq,
    numeric(1)
  )
  best_fit <- candidate_fits[[which.max(rsq_values)]]

  cat("\nBest model:", best_fit$model_type, "\n")

  # Phenology from the best-fitting curve
  phenology <- extract_phenology(dates, best_fit$fitted)
  cat("Phenological metrics:\n")
  cat(" Start of Season:", as.character(phenology$sos), "\n")
  cat(" Peak of Season:", as.character(phenology$pos), "\n")
  cat(" End of Season:", as.character(phenology$eos), "\n")
  cat(" Season Length:", phenology$los, "days\n")
  cat(" Senescence Rate:", round(phenology$senescence_rate, 4), "CI/day\n")

  # Actual harvests recorded for this field (ground truth for later steps)
  field_harvests <- harvest_data_filtered %>%
    filter(field == field_name) %>%
    select(season_end, harvest_year) %>%
    arrange(season_end)

  cat("\nActual harvest dates:\n")
  print(field_harvests)

  # Store everything needed for the Step 5 summary
  comparison_results[[field_name]] <- list(
    field = field_name,
    n_obs = nrow(field_ts),
    models = list(
      quadratic = quad_fit,
      logistic = logistic_fit,
      double_logistic = double_log_fit,
      savgol = savgol_fit
    ),
    best_model = best_fit$model_type,
    phenology = phenology,
    actual_harvests = field_harvests
  )
}
|
|
|
|
# ============================================================================
# STEP 5: SUMMARY AND RECOMMENDATIONS
# ============================================================================

cat("\n\n=== ANALYSIS SUMMARY ===\n\n")

cat("Growth Curve Model Performance:\n\n")

# One row of R² values per analyzed field. Build all rows first and bind
# once — the original grew a data.frame with rbind() inside a loop, which
# copies the whole frame on every iteration.
rsq_rows <- lapply(names(comparison_results), function(field_name) {
  result <- comparison_results[[field_name]]
  data.frame(
    field = field_name,
    quadratic = result$models$quadratic$rsq,
    logistic = result$models$logistic$rsq,
    double_logistic = result$models$double_logistic$rsq,
    savgol = result$models$savgol$rsq
  )
})
all_rsq <- do.call(rbind, rsq_rows)

# Preserve the original's empty-table shape when no fields were analyzed
if (is.null(all_rsq)) {
  all_rsq <- data.frame(
    field = character(),
    quadratic = numeric(),
    logistic = numeric(),
    double_logistic = numeric(),
    savgol = numeric()
  )
}

cat("Average R² by model:\n")
print(colMeans(all_rsq[, -1], na.rm = TRUE))
|
|
|
|
# Printed guidance only — interpretation notes and follow-up plan; no
# computation happens below this point.
cat("\n\nKEY FINDINGS:\n\n")

cat("1. MODEL SELECTION:\n")
cat(" - Compare R² values above to determine best-fitting model type\n")
cat(" - Logistic/Double Logistic better capture biological growth patterns\n")
cat(" - Savitzky-Golay provides smooth curves without parametric assumptions\n\n")

cat("2. HARVEST PREDICTION STRATEGY:\n")
cat(" - Use phenological metrics (especially senescence rate)\n")
cat(" - Peak of Season (POS) timing correlates with harvest window\n")
cat(" - Rapid senescence after POS may indicate approaching harvest\n")
cat(" - For sugarcane: harvest often occurs DURING maturation, not after senescence\n\n")

cat("3. NEXT STEPS:\n")
cat(" a) Implement the best-performing model across all fields\n")
cat(" b) Correlate POS dates with actual harvest dates\n")
cat(" c) Build prediction model: harvest_date = f(POS, senescence_rate, field_age)\n")
cat(" d) Test predictive accuracy (weeks ahead of harvest)\n")
cat(" e) Consider multiple indices (NDVI, NDRE, EVI) if available\n\n")

cat("=== ANALYSIS COMPLETE ===\n")
|