# ============================================================================
# HARVEST PREDICTION METHODS ANALYSIS
# Using existing CI data to explore growth curve modeling approaches
# ============================================================================

suppressPackageStartupMessages({
  library(readxl)
  library(dplyr)
  library(tidyr)
  library(lubridate)
  library(terra)
  library(sf)
  library(here)
  library(ggplot2)
})

# Set project directory. It is also pushed into the global environment
# because parameters_project.R (sourced below) presumably reads it from
# there — NOTE(review): confirm this is still required; when the script
# runs at top level the plain assignment alone already does this.
project_dir <- "esa"
assign("project_dir", project_dir, envir = .GlobalEnv)

source(here("r_app", "parameters_project.R"))

# ============================================================================
# STEP 1: LOAD EXISTING CI TIME SERIES DATA
# ============================================================================

cat("=== LOADING CI TIME SERIES DATA ===\n\n")

ci_rds_file <- here(
  "laravel_app/storage/app", project_dir,
  "Data/extracted_ci/cumulative_vals/All_pivots_Cumulative_CI_quadrant_year_v2.rds"
)

ci_data_raw <- readRDS(ci_rds_file) %>%
  ungroup()

cat("Loaded", nrow(ci_data_raw), "CI observations\n")
cat("Fields:", length(unique(ci_data_raw$field)), "\n")
# format() so Date values print as dates inside cat() (cat() drops the
# Date class and would otherwise print the day count since 1970-01-01)
cat("Date range:", format(min(ci_data_raw$Date, na.rm = TRUE)), "to",
    format(max(ci_data_raw$Date, na.rm = TRUE)), "\n\n")

# Prepare the daily time series: one fitted CI value per field per day
time_series_daily <- ci_data_raw %>%
  mutate(
    date = as.Date(Date),
    week = isoweek(date),
    year = isoyear(date)
  ) %>%
  select(
    field_id = field,
    date,
    week,
    year,
    mean_ci = FitData
  ) %>%
  filter(!is.na(mean_ci), !is.na(date), !is.na(field_id)) %>%
  arrange(field_id, date)

cat("Processed", nrow(time_series_daily), "daily CI observations\n")
cat("Sample of data:\n")
print(head(time_series_daily, 20))

# ============================================================================
# STEP 2: LOAD ACTUAL HARVEST DATA
# ============================================================================

cat("\n\n=== LOADING HARVEST DATA ===\n\n")

# Build the path with here()/project_dir for consistency with ci_rds_file
# (the previous hard-coded relative path ignored project_dir and broke
# whenever the working directory was not the repository root)
harvest_data <- read_excel(
  here("laravel_app/storage/app", project_dir, "Data/harvest.xlsx")
) %>%
  mutate(
    season_start = as.Date(season_start),
    season_end = as.Date(season_end)
  ) %>%
  filter(!is.na(season_end))

# Keep only fields for which we also have CI observations
fields_with_ci <- unique(time_series_daily$field_id)

harvest_data_filtered <- harvest_data %>%
  filter(field %in% fields_with_ci) %>%
  mutate(
    harvest_week = isoweek(season_end),
    harvest_year = isoyear(season_end)
  )

cat("Total harvest records:", nrow(harvest_data_filtered), "\n")
cat("Fields with both CI and harvest data:",
    length(unique(harvest_data_filtered$field)), "\n")
cat("Date range:", format(min(harvest_data_filtered$season_end)), "to",
    format(max(harvest_data_filtered$season_end)), "\n\n")

# ============================================================================
# STEP 3: GROWTH CURVE MODELING FUNCTIONS
# ============================================================================

cat("=== IMPLEMENTING GROWTH CURVE MODELS ===\n\n")

# ------------------------------------------------------------------------
# 3.1: Current method - Quadratic polynomial (for comparison)
# ------------------------------------------------------------------------

#' Fit a quadratic polynomial y = a + b*t + c*t^2 to a CI time series.
#'
#' @param dates Date vector of observation dates.
#' @param ci_values Numeric CI values, same length as `dates`.
#' @return A list with `fitted` values, `params` (coefficients), `rsq`
#'   (R-squared) and `model_type`; `fitted`/`params`/`rsq` are NA when the
#'   fit fails.
fit_quadratic <- function(dates, ci_values) {
  # Time axis: days since the first observation
  t <- as.numeric(dates - min(dates))

  model <- tryCatch(
    lm(ci_values ~ t + I(t^2)),
    error = function(e) NULL
  )

  if (is.null(model)) {
    return(list(fitted = rep(NA, length(dates)),
                params = c(NA, NA, NA),
                rsq = NA))
  }

  list(
    fitted = predict(model),
    params = coef(model),
    rsq = summary(model)$r.squared,
    model_type = "quadratic"
  )
}

# ------------------------------------------------------------------------
# 3.2: Logistic curve - S-shaped growth
# ------------------------------------------------------------------------

#' Fit a single logistic (S-shaped) growth curve:
#'   y = K / (1 + exp(-r * (t - t0)))
#'
#' @param dates Date vector of observation dates.
#' @param ci_values Numeric CI values, same length as `dates`.
#' @return A list with `fitted`, `params` (K, r, t0), pseudo-`rsq` and
#'   `model_type`; NA components when the nls fit fails to converge.
fit_logistic <- function(dates, ci_values) {
  t <- as.numeric(dates - min(dates))

  # Starting values: plateau at the observed maximum, inflection near the
  # middle of the season, moderate growth rate
  K_init <- max(ci_values, na.rm = TRUE)   # carrying capacity
  r_init <- 0.05                           # growth rate
  t0_init <- median(t)                     # inflection point

  model <- tryCatch(
    nls(ci_values ~ K / (1 + exp(-r * (t - t0))),
        start = list(K = K_init, r = r_init, t0 = t0_init),
        control = nls.control(maxiter = 100, warnOnly = TRUE)),
    error = function(e) NULL
  )

  if (is.null(model)) {
    return(list(fitted = rep(NA, length(dates)),
                params = c(NA, NA, NA),
                rsq = NA))
  }

  fitted_values <- predict(model)
  # Pseudo-R² for nls: 1 - SSE/SST
  rsq <- 1 - sum((ci_values - fitted_values)^2) /
    sum((ci_values - mean(ci_values))^2)

  list(
    fitted = fitted_values,
    params = coef(model),
    rsq = rsq,
    model_type = "logistic"
  )
}

# ------------------------------------------------------------------------
# 3.3: Double Logistic - For multi-phase growth (tillering + grand growth)
# ------------------------------------------------------------------------

#' Fit a double logistic curve for two-phase growth:
#'   y = K1/(1+exp(-r1*(t-t1))) + K2/(1+exp(-r2*(t-t2)))
#'
#' @param dates Date vector of observation dates.
#' @param ci_values Numeric CI values, same length as `dates`.
#' @return A list with `fitted`, `params` (K1, r1, t1, K2, r2, t2),
#'   pseudo-`rsq` and `model_type`; NA components when the fit fails.
fit_double_logistic <- function(dates, ci_values) {
  t <- as.numeric(dates - min(dates))

  # Starting values for the two growth phases (na.rm added for
  # consistency with fit_logistic, so stray NAs do not poison the start)
  ci_max <- max(ci_values, na.rm = TRUE)
  K1_init <- ci_max * 0.6        # first-phase peak
  K2_init <- ci_max              # second-phase peak
  r1_init <- 0.08                # first-phase growth rate
  r2_init <- 0.05                # second-phase growth rate
  t1_init <- quantile(t, 0.25)   # first inflection
  t2_init <- quantile(t, 0.75)   # second inflection

  model <- tryCatch(
    nls(ci_values ~ K1 / (1 + exp(-r1 * (t - t1))) +
          K2 / (1 + exp(-r2 * (t - t2))),
        start = list(K1 = K1_init, r1 = r1_init, t1 = t1_init,
                     K2 = K2_init, r2 = r2_init, t2 = t2_init),
        control = nls.control(maxiter = 100, warnOnly = TRUE)),
    error = function(e) NULL
  )

  if (is.null(model)) {
    return(list(fitted = rep(NA, length(dates)),
                params = rep(NA, 6),
                rsq = NA))
  }

  fitted_values <- predict(model)
  # Pseudo-R² for nls: 1 - SSE/SST
  rsq <- 1 - sum((ci_values - fitted_values)^2) /
    sum((ci_values - mean(ci_values))^2)

  list(
    fitted = fitted_values,
    params = coef(model),
    rsq = rsq,
    model_type = "double_logistic"
  )
}

# ------------------------------------------------------------------------
# 3.4: Savitzky-Golay smoothing (TIMESAT approach)
# ------------------------------------------------------------------------

#' Smooth a CI time series, TIMESAT-style.
#'
#' NOTE: this is a centred moving average stand-in for a true
#' Savitzky-Golay filter; swap in signal::sgolayfilt() for production.
#'
#' @param dates Date vector (unused by the moving average; kept for a
#'   signature consistent with the other fit_* functions).
#' @param ci_values Numeric CI values to smooth.
#' @param window_length Odd window width; shrunk to fit short series.
#' @param poly_order Polynomial order a real SG filter would use; here it
#'   only gates feasibility (no polynomial is actually fitted).
#' @return A list with `fitted` (smoothed series), `params`, `rsq` and
#'   `model_type`.
fit_savgol <- function(dates, ci_values, window_length = 21, poly_order = 3) {
  n <- length(ci_values)

  # Shrink the window to the largest odd size that fits the series
  if (n < window_length) {
    window_length <- if (n %% 2 == 1) n else n - 1
  }
  if (window_length < poly_order + 1) {
    return(list(fitted = ci_values, params = NA, rsq = NA,
                model_type = "savgol"))
  }

  half_window <- floor(window_length / 2)
  smoothed <- rep(NA_real_, n)
  for (i in seq_len(n)) {
    start_idx <- max(1, i - half_window)
    end_idx <- min(n, i + half_window)
    smoothed[i] <- mean(ci_values[start_idx:end_idx], na.rm = TRUE)
  }

  rsq <- 1 - sum((ci_values - smoothed)^2, na.rm = TRUE) /
    sum((ci_values - mean(ci_values, na.rm = TRUE))^2, na.rm = TRUE)

  list(
    fitted = smoothed,
    params = c(window_length = window_length, poly_order = poly_order),
    rsq = rsq,
    model_type = "savgol"
  )
}

# ------------------------------------------------------------------------
# 3.5: Extract phenological metrics from fitted curve
# ------------------------------------------------------------------------

#' Derive season metrics (SOS/POS/EOS, season length, growth rates) from a
#' fitted CI curve.
#'
#' SOS/EOS use a 20%-of-amplitude threshold above the curve baseline.
#' Rates divide CI change by the number of observations between the two
#' indices — assumes daily spacing, so "per index" equals "per day";
#' TODO confirm for series with gaps.
#'
#' @param dates Date vector of observation dates.
#' @param fitted_values Fitted CI values, same length as `dates`.
#' @return A list: sos, pos, eos (Date), los (days), amplitude,
#'   greenup_rate, senescence_rate (CI/day). All NA when `fitted_values`
#'   is entirely NA.
extract_phenology <- function(dates, fitted_values) {
  if (all(is.na(fitted_values))) {
    return(list(
      sos = as.Date(NA),
      pos = as.Date(NA),
      eos = as.Date(NA),
      los = NA_real_,
      amplitude = NA_real_,
      greenup_rate = NA_real_,
      senescence_rate = NA_real_
    ))
  }

  # Peak of Season (POS)
  pos_idx <- which.max(fitted_values)
  pos_date <- dates[pos_idx]
  pos_value <- fitted_values[pos_idx]

  # Baseline (minimum value) and seasonal amplitude
  baseline <- min(fitted_values, na.rm = TRUE)
  amplitude <- pos_value - baseline

  # Start of Season (SOS): first crossing of 20% of amplitude above
  # baseline. Scalar if/else instead of ifelse(), which strips the Date
  # class from its result (the old code had to round-trip through
  # as.Date(..., origin=) to undo that).
  sos_threshold <- baseline + 0.2 * amplitude
  sos_idx <- which(fitted_values >= sos_threshold)[1]
  sos_date <- if (!is.na(sos_idx)) dates[sos_idx] else as.Date(NA)

  # End of Season (EOS): first return to the SOS threshold after the peak
  eos_candidates <- which(fitted_values[pos_idx:length(fitted_values)] <=
                            sos_threshold)
  eos_idx <- if (length(eos_candidates) > 0) {
    pos_idx + eos_candidates[1] - 1
  } else {
    NA_integer_
  }
  eos_date <- if (!is.na(eos_idx)) dates[eos_idx] else as.Date(NA)

  # Length of Season in days
  los <- if (!is.na(sos_date) && !is.na(eos_date)) {
    as.numeric(eos_date - sos_date)
  } else {
    NA_real_
  }

  # Rate of green-up (slope from SOS to POS)
  greenup_rate <- if (!is.na(sos_idx) && pos_idx > sos_idx) {
    (pos_value - fitted_values[sos_idx]) / (pos_idx - sos_idx)
  } else {
    NA_real_
  }

  # Rate of senescence (slope from POS to EOS) - KEY FOR HARVEST PREDICTION
  senescence_rate <- if (!is.na(eos_idx) && eos_idx > pos_idx) {
    (fitted_values[eos_idx] - pos_value) / (eos_idx - pos_idx)
  } else {
    NA_real_
  }

  list(
    sos = sos_date,
    pos = pos_date,
    eos = eos_date,
    los = los,
    amplitude = amplitude,
    greenup_rate = greenup_rate,
    senescence_rate = senescence_rate
  )
}

cat("Growth curve modeling functions defined:\n")
cat(" - fit_quadratic() - Current polynomial approach\n")
cat(" - fit_logistic() - S-curve for single growth phase\n")
cat(" - fit_double_logistic() - Two-phase growth model\n")
cat(" - fit_savgol() - TIMESAT-style smoothing\n")
cat(" - extract_phenology() - Derive season metrics\n\n")

# ============================================================================
# STEP 4: COMPARE MODELS ON SAMPLE FIELDS
# ============================================================================

cat("=== TESTING MODELS ON SAMPLE FIELDS ===\n\n")

# Pick up to three fields that have at least three recorded harvests
sample_fields <- harvest_data_filtered %>%
  group_by(field) %>%
  summarise(n_harvests = n(), .groups = "drop") %>%
  filter(n_harvests >= 3) %>%
  slice_head(n = 3) %>%
  pull(field)

cat("Sample fields for analysis:",
    paste(sample_fields, collapse = ", "), "\n\n")

comparison_results <- list()

for (field_name in sample_fields) {
  cat("\n--- Analyzing field:", field_name, "---\n")

  field_ts <- time_series_daily %>%
    filter(field_id == field_name) %>%
    arrange(date)

  # Require a reasonably long series before attempting any fit
  if (nrow(field_ts) < 100) {
    cat("Insufficient data (", nrow(field_ts), " observations)\n")
    next
  }

  cat("Time series length:", nrow(field_ts), "days\n")
  cat("Date range:", format(min(field_ts$date)), "to",
      format(max(field_ts$date)), "\n")

  dates <- field_ts$date
  ci_values <- field_ts$mean_ci

  cat("\nFitting models...\n")

  quad_fit <- fit_quadratic(dates, ci_values)
  cat(" Quadratic R²:", round(quad_fit$rsq, 3), "\n")

  logistic_fit <- fit_logistic(dates, ci_values)
  cat(" Logistic R²:", round(logistic_fit$rsq, 3), "\n")

  double_log_fit <- fit_double_logistic(dates, ci_values)
  cat(" Double Logistic R²:", round(double_log_fit$rsq, 3), "\n")

  savgol_fit <- fit_savgol(dates, ci_values)
  cat(" Savitzky-Golay R²:", round(savgol_fit$rsq, 3), "\n")

  # Select the model with the highest (pseudo-)R²; failed fits (NA rsq)
  # score -Inf so they can never win
  candidate_fits <- list(quad_fit, logistic_fit, double_log_fit, savgol_fit)
  rsq_scores <- vapply(
    candidate_fits,
    function(x) if (is.na(x$rsq)) -Inf else x$rsq,
    numeric(1)
  )
  best_fit <- candidate_fits[[which.max(rsq_scores)]]
  cat("\nBest model:", best_fit$model_type, "\n")

  phenology <- extract_phenology(dates, best_fit$fitted)
  cat("Phenological metrics:\n")
  cat(" Start of Season:", as.character(phenology$sos), "\n")
  cat(" Peak of Season:", as.character(phenology$pos), "\n")
  cat(" End of Season:", as.character(phenology$eos), "\n")
  cat(" Season Length:", phenology$los, "days\n")
  cat(" Senescence Rate:", round(phenology$senescence_rate, 4), "CI/day\n")

  # Actual harvests recorded for this field, for visual comparison
  field_harvests <- harvest_data_filtered %>%
    filter(field == field_name) %>%
    select(season_end, harvest_year) %>%
    arrange(season_end)

  cat("\nActual harvest dates:\n")
  print(field_harvests)

  comparison_results[[field_name]] <- list(
    field = field_name,
    n_obs = nrow(field_ts),
    models = list(
      quadratic = quad_fit,
      logistic = logistic_fit,
      double_logistic = double_log_fit,
      savgol = savgol_fit
    ),
    best_model = best_fit$model_type,
    phenology = phenology,
    actual_harvests = field_harvests
  )
}

# ============================================================================
# STEP 5: SUMMARY AND RECOMMENDATIONS
# ============================================================================

cat("\n\n=== ANALYSIS SUMMARY ===\n\n")
cat("Growth Curve Model Performance:\n\n")

# One row of R² values per analysed field (bind_rows over a list instead
# of growing a data frame with rbind() inside a loop)
all_rsq <- bind_rows(lapply(names(comparison_results), function(field_name) {
  models <- comparison_results[[field_name]]$models
  data.frame(
    field = field_name,
    quadratic = models$quadratic$rsq,
    logistic = models$logistic$rsq,
    double_logistic = models$double_logistic$rsq,
    savgol = models$savgol$rsq
  )
}))

if (nrow(all_rsq) > 0) {
  cat("Average R² by model:\n")
  print(colMeans(all_rsq[, -1], na.rm = TRUE))
} else {
  # Guard: all_rsq[, -1] on a zero-column frame would error
  cat("No fields were analysed; no R² summary available.\n")
}

cat("\n\nKEY FINDINGS:\n\n")
cat("1. MODEL SELECTION:\n")
cat(" - Compare R² values above to determine best-fitting model type\n")
cat(" - Logistic/Double Logistic better capture biological growth patterns\n")
cat(" - Savitzky-Golay provides smooth curves without parametric assumptions\n\n")
cat("2. HARVEST PREDICTION STRATEGY:\n")
cat(" - Use phenological metrics (especially senescence rate)\n")
cat(" - Peak of Season (POS) timing correlates with harvest window\n")
cat(" - Rapid senescence after POS may indicate approaching harvest\n")
cat(" - For sugarcane: harvest often occurs DURING maturation, not after senescence\n\n")
cat("3. NEXT STEPS:\n")
cat(" a) Implement the best-performing model across all fields\n")
cat(" b) Correlate POS dates with actual harvest dates\n")
cat(" c) Build prediction model: harvest_date = f(POS, senescence_rate, field_age)\n")
cat(" d) Test predictive accuracy (weeks ahead of harvest)\n")
cat(" e) Consider multiple indices (NDVI, NDRE, EVI) if available\n\n")

cat("=== ANALYSIS COMPLETE ===\n")