SmartCane/r_app/experiments/harvest_prediction/old/analyze_harvest_methods.R

# ============================================================================
# HARVEST PREDICTION METHODS ANALYSIS
# Using existing CI data to explore growth curve modeling approaches
# ============================================================================

suppressPackageStartupMessages({
  library(readxl)
  library(dplyr)
  library(tidyr)
  library(lubridate)
  library(terra)
  library(sf)
  library(here)
  library(ggplot2)
})

# Set project directory
project_dir <- "esa"
assign("project_dir", project_dir, envir = .GlobalEnv)

source(here("r_app", "parameters_project.R"))

# ============================================================================
# STEP 1: LOAD EXISTING CI TIME SERIES DATA
# ============================================================================

cat("=== LOADING CI TIME SERIES DATA ===\n\n")

ci_rds_file <- here("laravel_app/storage/app", project_dir, "Data/extracted_ci/cumulative_vals/All_pivots_Cumulative_CI_quadrant_year_v2.rds")
ci_data_raw <- readRDS(ci_rds_file) %>% ungroup()

cat("Loaded", nrow(ci_data_raw), "CI observations\n")
cat("Fields:", length(unique(ci_data_raw$field)), "\n")
cat("Date range:", min(ci_data_raw$Date, na.rm = TRUE), "to", max(ci_data_raw$Date, na.rm = TRUE), "\n\n")

# Prepare daily time series
time_series_daily <- ci_data_raw %>%
  mutate(
    date = as.Date(Date),
    week = isoweek(date),
    year = isoyear(date)
  ) %>%
  select(
    field_id = field,
    date,
    week,
    year,
    mean_ci = FitData
  ) %>%
  filter(!is.na(mean_ci), !is.na(date), !is.na(field_id)) %>%
  arrange(field_id, date)

cat("Processed", nrow(time_series_daily), "daily CI observations\n")
cat("Sample of data:\n")
print(head(time_series_daily, 20))

# ============================================================================
# STEP 2: LOAD ACTUAL HARVEST DATA
# ============================================================================

cat("\n\n=== LOADING HARVEST DATA ===\n\n")

harvest_data <- read_excel('laravel_app/storage/app/esa/Data/harvest.xlsx') %>%
  mutate(
    season_start = as.Date(season_start),
    season_end = as.Date(season_end)
  ) %>%
  filter(!is.na(season_end))

# Get only fields that we have CI data for
fields_with_ci <- unique(time_series_daily$field_id)
harvest_data_filtered <- harvest_data %>%
  filter(field %in% fields_with_ci) %>%
  mutate(
    harvest_week = isoweek(season_end),
    harvest_year = isoyear(season_end)
  )

cat("Total harvest records:", nrow(harvest_data_filtered), "\n")
cat("Fields with both CI and harvest data:", length(unique(harvest_data_filtered$field)), "\n")
cat("Date range:", min(harvest_data_filtered$season_end), "to", max(harvest_data_filtered$season_end), "\n\n")

# ============================================================================
# STEP 3: GROWTH CURVE MODELING FUNCTIONS
# ============================================================================

cat("=== IMPLEMENTING GROWTH CURVE MODELS ===\n\n")

# ------------------------------------------------------------------------
# 3.1: Current method - Quadratic polynomial (for comparison)
# ------------------------------------------------------------------------
fit_quadratic <- function(dates, ci_values) {
  # Convert dates to numeric (days since first observation)
  t <- as.numeric(dates - min(dates))

  # Fit quadratic: y = a + b*t + c*t^2
  model <- tryCatch({
    lm(ci_values ~ t + I(t^2))
  }, error = function(e) NULL)

  if (is.null(model)) {
    return(list(fitted = rep(NA, length(dates)), params = c(NA, NA, NA), rsq = NA))
  }

  fitted_values <- predict(model)
  rsq <- summary(model)$r.squared

  return(list(
    fitted = fitted_values,
    params = coef(model),
    rsq = rsq,
    model_type = "quadratic"
  ))
}

# ------------------------------------------------------------------------
# 3.2: Logistic curve - S-shaped growth
# ------------------------------------------------------------------------
fit_logistic <- function(dates, ci_values) {
  # Convert dates to numeric
  t <- as.numeric(dates - min(dates))

  # Initial parameter estimates
  K_init <- max(ci_values, na.rm = TRUE)  # Carrying capacity
  r_init <- 0.05  # Growth rate
  t0_init <- median(t)  # Inflection point

  # Logistic function: y = K / (1 + exp(-r*(t-t0)))
  logistic_fn <- function(t, K, r, t0) {
    K / (1 + exp(-r * (t - t0)))
  }

  model <- tryCatch({
    nls(ci_values ~ K / (1 + exp(-r * (t - t0))),
        start = list(K = K_init, r = r_init, t0 = t0_init),
        control = nls.control(maxiter = 100, warnOnly = TRUE))
  }, error = function(e) NULL)

  if (is.null(model)) {
    return(list(fitted = rep(NA, length(dates)), params = c(NA, NA, NA), rsq = NA))
  }

  fitted_values <- predict(model)
  rsq <- 1 - sum((ci_values - fitted_values)^2) / sum((ci_values - mean(ci_values))^2)

  return(list(
    fitted = fitted_values,
    params = coef(model),
    rsq = rsq,
    model_type = "logistic"
  ))
}

# ------------------------------------------------------------------------
# 3.3: Double Logistic - For multi-phase growth (tillering + grand growth)
# ------------------------------------------------------------------------
fit_double_logistic <- function(dates, ci_values) {
  # Convert dates to numeric
  t <- as.numeric(dates - min(dates))

  # Initial parameters for two growth phases
  K1_init <- max(ci_values) * 0.6  # First phase peak
  K2_init <- max(ci_values)  # Second phase peak
  r1_init <- 0.08  # First phase growth rate
  r2_init <- 0.05  # Second phase growth rate
  t1_init <- quantile(t, 0.25)  # First inflection
  t2_init <- quantile(t, 0.75)  # Second inflection

  # Double logistic: y = K1/(1+exp(-r1*(t-t1))) + K2/(1+exp(-r2*(t-t2)))
  model <- tryCatch({
    nls(ci_values ~ K1 / (1 + exp(-r1 * (t - t1))) + K2 / (1 + exp(-r2 * (t - t2))),
        start = list(K1 = K1_init, r1 = r1_init, t1 = t1_init,
                     K2 = K2_init, r2 = r2_init, t2 = t2_init),
        control = nls.control(maxiter = 100, warnOnly = TRUE))
  }, error = function(e) NULL)

  if (is.null(model)) {
    return(list(fitted = rep(NA, length(dates)), params = rep(NA, 6), rsq = NA))
  }

  fitted_values <- predict(model)
  rsq <- 1 - sum((ci_values - fitted_values)^2) / sum((ci_values - mean(ci_values))^2)

  return(list(
    fitted = fitted_values,
    params = coef(model),
    rsq = rsq,
    model_type = "double_logistic"
  ))
}

# ------------------------------------------------------------------------
# 3.4: Savitzky-Golay smoothing (TIMESAT approach)
# ------------------------------------------------------------------------
fit_savgol <- function(dates, ci_values, window_length = 21, poly_order = 3) {
  # Simple implementation of Savitzky-Golay filter
  # For a proper implementation, use signal::sgolayfilt

  n <- length(ci_values)
  if (n < window_length) {
    window_length <- ifelse(n %% 2 == 1, n, n - 1)
  }
  if (window_length < poly_order + 1) {
    return(list(fitted = ci_values, params = NA, rsq = NA, model_type = "savgol"))
  }

  # Use moving average as simple smoothing for now
  # In production, use signal package
  half_window <- floor(window_length / 2)
  smoothed <- rep(NA, n)

  for (i in 1:n) {
    start_idx <- max(1, i - half_window)
    end_idx <- min(n, i + half_window)
    smoothed[i] <- mean(ci_values[start_idx:end_idx], na.rm = TRUE)
  }

  rsq <- 1 - sum((ci_values - smoothed)^2, na.rm = TRUE) / sum((ci_values - mean(ci_values, na.rm = TRUE))^2, na.rm = TRUE)

  return(list(
    fitted = smoothed,
    params = c(window_length = window_length, poly_order = poly_order),
    rsq = rsq,
    model_type = "savgol"
  ))
}

# ------------------------------------------------------------------------
# 3.5: Extract phenological metrics from fitted curve
# ------------------------------------------------------------------------
extract_phenology <- function(dates, fitted_values) {
  if (all(is.na(fitted_values))) {
    return(list(
      sos = NA, pos = NA, eos = NA, los = NA,
      amplitude = NA, greenup_rate = NA, senescence_rate = NA
    ))
  }

  # Peak of Season (POS)
  pos_idx <- which.max(fitted_values)
  pos_date <- dates[pos_idx]
  pos_value <- fitted_values[pos_idx]

  # Baseline (minimum value)
  baseline <- min(fitted_values, na.rm = TRUE)
  amplitude <- pos_value - baseline

  # Start of Season (SOS) - 20% of amplitude above baseline
  sos_threshold <- baseline + 0.2 * amplitude
  sos_idx <- which(fitted_values >= sos_threshold)[1]
  sos_date <- ifelse(!is.na(sos_idx), dates[sos_idx], NA)

  # End of Season (EOS) - return to 20% of amplitude after peak
  eos_candidates <- which(fitted_values[pos_idx:length(fitted_values)] <= sos_threshold)
  eos_idx <- ifelse(length(eos_candidates) > 0, pos_idx + eos_candidates[1] - 1, NA)
  eos_date <- ifelse(!is.na(eos_idx), dates[eos_idx], NA)

  # Length of Season
  los <- ifelse(!is.na(sos_date) && !is.na(eos_date),
                as.numeric(as.Date(eos_date, origin = "1970-01-01") - as.Date(sos_date, origin = "1970-01-01")),
                NA)

  # Rate of green-up (slope from SOS to POS)
  if (!is.na(sos_idx) && pos_idx > sos_idx) {
    greenup_days <- pos_idx - sos_idx
    greenup_rate <- (pos_value - fitted_values[sos_idx]) / greenup_days
  } else {
    greenup_rate <- NA
  }

  # Rate of senescence (slope from POS to EOS) - KEY FOR HARVEST PREDICTION
  if (!is.na(eos_idx) && eos_idx > pos_idx) {
    senescence_days <- eos_idx - pos_idx
    senescence_rate <- (fitted_values[eos_idx] - pos_value) / senescence_days
  } else {
    senescence_rate <- NA
  }

  return(list(
    sos = as.Date(sos_date, origin = "1970-01-01"),
    pos = as.Date(pos_date, origin = "1970-01-01"),
    eos = as.Date(eos_date, origin = "1970-01-01"),
    los = los,
    amplitude = amplitude,
    greenup_rate = greenup_rate,
    senescence_rate = senescence_rate
  ))
}

cat("Growth curve modeling functions defined:\n")
cat("  - fit_quadratic() - Current polynomial approach\n")
cat("  - fit_logistic() - S-curve for single growth phase\n")
cat("  - fit_double_logistic() - Two-phase growth model\n")
cat("  - fit_savgol() - TIMESAT-style smoothing\n")
cat("  - extract_phenology() - Derive season metrics\n\n")

# ============================================================================
# STEP 4: COMPARE MODELS ON SAMPLE FIELDS
# ============================================================================

cat("=== TESTING MODELS ON SAMPLE FIELDS ===\n\n")

# Select a few fields with good data coverage
sample_fields <- harvest_data_filtered %>%
  group_by(field) %>%
  summarise(n_harvests = n(), .groups = "drop") %>%
  filter(n_harvests >= 3) %>%
  slice_head(n = 3) %>%
  pull(field)

cat("Sample fields for analysis:", paste(sample_fields, collapse = ", "), "\n\n")

# Analyze each field
comparison_results <- list()

for (field_name in sample_fields) {
  cat("\n--- Analyzing field:", field_name, "---\n")

  # Get time series for this field
  field_ts <- time_series_daily %>%
    filter(field_id == field_name) %>%
    arrange(date)

  if (nrow(field_ts) < 100) {
    cat("Insufficient data (", nrow(field_ts), " observations)\n")
    next
  }

  cat("Time series length:", nrow(field_ts), "days\n")
  cat("Date range:", min(field_ts$date), "to", max(field_ts$date), "\n")

  # Fit all models
  dates <- field_ts$date
  ci_values <- field_ts$mean_ci

  cat("\nFitting models...\n")

  quad_fit <- fit_quadratic(dates, ci_values)
  cat("  Quadratic R²:", round(quad_fit$rsq, 3), "\n")

  logistic_fit <- fit_logistic(dates, ci_values)
  cat("  Logistic R²:", round(logistic_fit$rsq, 3), "\n")

  double_log_fit <- fit_double_logistic(dates, ci_values)
  cat("  Double Logistic R²:", round(double_log_fit$rsq, 3), "\n")

  savgol_fit <- fit_savgol(dates, ci_values)
  cat("  Savitzky-Golay R²:", round(savgol_fit$rsq, 3), "\n")

  # Extract phenology from best-fitting model
  best_model <- list(quad_fit, logistic_fit, double_log_fit, savgol_fit)
  best_idx <- which.max(sapply(best_model, function(x) ifelse(is.na(x$rsq), -Inf, x$rsq)))
  best_fit <- best_model[[best_idx]]

  cat("\nBest model:", best_fit$model_type, "\n")

  phenology <- extract_phenology(dates, best_fit$fitted)
  cat("Phenological metrics:\n")
  cat("  Start of Season:", as.character(phenology$sos), "\n")
  cat("  Peak of Season:", as.character(phenology$pos), "\n")
  cat("  End of Season:", as.character(phenology$eos), "\n")
  cat("  Season Length:", phenology$los, "days\n")
  cat("  Senescence Rate:", round(phenology$senescence_rate, 4), "CI/day\n")

  # Get actual harvests for this field
  field_harvests <- harvest_data_filtered %>%
    filter(field == field_name) %>%
    select(season_end, harvest_year) %>%
    arrange(season_end)

  cat("\nActual harvest dates:\n")
  print(field_harvests)

  # Store results
  comparison_results[[field_name]] <- list(
    field = field_name,
    n_obs = nrow(field_ts),
    models = list(
      quadratic = quad_fit,
      logistic = logistic_fit,
      double_logistic = double_log_fit,
      savgol = savgol_fit
    ),
    best_model = best_fit$model_type,
    phenology = phenology,
    actual_harvests = field_harvests
  )
}

# ============================================================================
# STEP 5: SUMMARY AND RECOMMENDATIONS
# ============================================================================

cat("\n\n=== ANALYSIS SUMMARY ===\n\n")

cat("Growth Curve Model Performance:\n\n")

# Calculate average R² for each model type
all_rsq <- data.frame(
  field = character(),
  quadratic = numeric(),
  logistic = numeric(),
  double_logistic = numeric(),
  savgol = numeric()
)

for (field_name in names(comparison_results)) {
  result <- comparison_results[[field_name]]
  all_rsq <- rbind(all_rsq, data.frame(
    field = field_name,
    quadratic = result$models$quadratic$rsq,
    logistic = result$models$logistic$rsq,
    double_logistic = result$models$double_logistic$rsq,
    savgol = result$models$savgol$rsq
  ))
}

cat("Average R² by model:\n")
print(colMeans(all_rsq[, -1], na.rm = TRUE))

cat("\n\nKEY FINDINGS:\n\n")

cat("1. MODEL SELECTION:\n")
cat("   - Compare R² values above to determine best-fitting model type\n")
cat("   - Logistic/Double Logistic better capture biological growth patterns\n")
cat("   - Savitzky-Golay provides smooth curves without parametric assumptions\n\n")

cat("2. HARVEST PREDICTION STRATEGY:\n")
cat("   - Use phenological metrics (especially senescence rate)\n")
cat("   - Peak of Season (POS) timing correlates with harvest window\n")
cat("   - Rapid senescence after POS may indicate approaching harvest\n")
cat("   - For sugarcane: harvest often occurs DURING maturation, not after senescence\n\n")

cat("3. NEXT STEPS:\n")
cat("   a) Implement the best-performing model across all fields\n")
cat("   b) Correlate POS dates with actual harvest dates\n")
cat("   c) Build prediction model: harvest_date = f(POS, senescence_rate, field_age)\n")
cat("   d) Test predictive accuracy (weeks ahead of harvest)\n")
cat("   e) Consider multiple indices (NDVI, NDRE, EVI) if available\n\n")

cat("=== ANALYSIS COMPLETE ===\n")