SmartCane/r_app/experiments/optimal_ci_analysis.R

# Optimal CI Analysis - Day 30 CI values with quadratic fitting
# Author: SmartCane Analysis Team
# Date: 2025-06-12

# Load required libraries
library(ggplot2)
library(dplyr)
library(readr)

# Set file path
rds_file_path <- "C:/Users/timon/Resilience BV/4020 SCane ESA DEMO - Documenten/General/4020 SCDEMO Team/4020 TechnicalData/WP3/smartcane/laravel_app/storage/app/chemba/Data/extracted_ci/cumulative_vals/All_pivots_Cumulative_CI_quadrant_year_v2.rds"

# Check if file exists
if (!file.exists(rds_file_path)) {
  stop("RDS file not found at specified path: ", rds_file_path)
}

# Load the data
cat("Loading RDS file...\n")
ci_data <- readRDS(rds_file_path)

# Display structure of the data to understand it better
cat("Data structure:\n")
str(ci_data)
cat("\nFirst few rows:\n")
head(ci_data)
cat("\nColumn names:\n")
print(colnames(ci_data))

# Filter data based on requirements
cat("\nApplying data filters...\n")

# 1. Filter out models that don't reach at least DOY 300
model_doy_max <- ci_data %>%
  group_by(model) %>%
  summarise(max_doy = max(DOY, na.rm = TRUE), .groups = 'drop')

valid_models <- model_doy_max %>%
  filter(max_doy >= 300) %>%
  pull(model)

cat(paste("Models before filtering:", n_distinct(ci_data$model), "\n"))
cat(paste("Models reaching at least DOY 300:", length(valid_models), "\n"))

ci_data <- ci_data %>%
  filter(model %in% valid_models)

# 2. Apply IQR filtering per DOY (remove outliers outside IQR)
cat("Applying IQR filtering per DOY...\n")
original_rows <- nrow(ci_data)

ci_data <- ci_data %>%
  group_by(DOY) %>%
  mutate(
    Q1 = quantile(FitData, 0.25, na.rm = TRUE),
    Q3 = quantile(FitData, 0.75, na.rm = TRUE),
    IQR = Q3 - Q1,
    lower_bound = Q1 - 1.5 * IQR,
    upper_bound = Q3 + 1.5 * IQR
  ) %>%
  filter(FitData >= lower_bound & FitData <= upper_bound) %>%
  select(-Q1, -Q3, -IQR, -lower_bound, -upper_bound) %>%
  ungroup()

filtered_rows <- nrow(ci_data)
cat(paste("Rows before IQR filtering:", original_rows, "\n"))
cat(paste("Rows after IQR filtering:", filtered_rows, "\n"))
cat(paste("Removed", original_rows - filtered_rows, "outliers (",
          round(100 * (original_rows - filtered_rows) / original_rows, 1), "%)\n"))

# Check what day values are available after filtering
if ("DOY" %in% colnames(ci_data)) {
  cat("\nUnique DOY values after filtering:\n")
  print(sort(unique(ci_data$DOY)))
} else {
  cat("\nNo 'DOY' column found. Available columns:\n")
  print(colnames(ci_data))
}

# Extract CI values at day 30 for each field
cat("\nExtracting day 30 CI values...\n")

# Set column names based on known structure
day_col <- "DOY"
ci_col <- "FitData"
field_col <- "model"

# Try different possible field column name combinations
if ("field" %in% colnames(ci_data)) {
  field_col <- "field"
} else if ("Field" %in% colnames(ci_data)) {
  field_col <- "Field"
} else if ("pivot" %in% colnames(ci_data)) {
  field_col <- "pivot"
} else if ("Pivot" %in% colnames(ci_data)) {
  field_col <- "Pivot"
} else if ("field_id" %in% colnames(ci_data)) {
  field_col <- "field_id"
} else if ("Field_ID" %in% colnames(ci_data)) {
  field_col <- "Field_ID"
}

# Check if we found the required columns
if (!("DOY" %in% colnames(ci_data))) {
  stop("DOY column not found in data")
}
if (!("FitData" %in% colnames(ci_data))) {
  stop("FitData column not found in data")
}
if (is.null(field_col)) {
  cat("Could not automatically identify field column. Please check column names.\n")
  cat("Available columns: ", paste(colnames(ci_data), collapse = ", "), "\n")
  stop("Manual field column identification required")
}

cat(paste("Using columns - DOY:", day_col, "Field:", field_col, "FitData:", ci_col, "\n"))

# Extract day 30 data
day_30_data <- ci_data %>%
  filter(DOY %% 30 == 0) %>%
  select(field = !!sym(field_col), ci = !!sym(ci_col), DOY) %>%
  na.omit() %>%
  arrange(field)

cat(paste("Found", nrow(day_30_data), "fields with day 30 CI values\n"))

if (nrow(day_30_data) == 0) {
  stop("No data found for day 30. Check if day 30 exists in the dataset.")
}

# Display summary of day 30 data
cat("\nSummary of day 30 CI values:\n")
print(summary(day_30_data))

# Add field index for plotting (assuming fields represent some spatial or sequential order)
#day_30_data$field_index <- 1:nrow(day_30_data)

# Create scatter plot
p1 <- ggplot(day_30_data, aes(x = DOY, y = ci)) +
  geom_point(color = "blue", size = 3, alpha = 0.7) +
  labs(
    title = "CI Values at Day 30 for All Fields",
    x = "Day of Year (DOY)",
    y = "CI Value",
    subtitle = paste("Total fields:", nrow(day_30_data))
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(size = 14, face = "bold"),
    axis.title = element_text(size = 12),
    axis.text = element_text(size = 10)
  )

print(p1)

# Try multiple curve fitting approaches
cat("\nTrying different curve fitting approaches...\n")

# Aggregate data by DOY (take mean CI for each DOY to reduce noise)
aggregated_data <- day_30_data %>%
  group_by(DOY) %>%
  summarise(
    mean_ci = mean(ci, na.rm = TRUE),
    median_ci = median(ci, na.rm = TRUE),
    count = n(),
    .groups = 'drop'
  ) %>%
  filter(count >= 5)  # Only keep DOY values with sufficient data points

cat(paste("Aggregated to", nrow(aggregated_data), "DOY points with sufficient data\n"))

# Create prediction data for smooth curves
x_smooth <- seq(min(aggregated_data$DOY), max(aggregated_data$DOY), length.out = 100)

# 1. Quadratic model (as requested)
cat("\n1. Fitting quadratic model...\n")
quad_model <- lm(mean_ci ~ poly(DOY, 2, raw = TRUE), data = aggregated_data)
quad_pred <- predict(quad_model, newdata = data.frame(DOY = x_smooth))
quad_r2 <- summary(quad_model)$r.squared
cat(paste("Quadratic R² =", round(quad_r2, 3), "\n"))

# 2. Logistic growth model: y = K / (1 + exp(-r*(x-x0))) - biologically realistic
cat("\n2. Fitting logistic growth model...\n")
tryCatch({
  # Estimate starting values
  K_start <- max(aggregated_data$mean_ci) * 1.1  # Carrying capacity
  r_start <- 0.05  # Growth rate
  x0_start <- mean(aggregated_data$DOY)  # Inflection point

  logistic_model <- nls(mean_ci ~ K / (1 + exp(-r * (DOY - x0))),
                       data = aggregated_data,
                       start = list(K = K_start, r = r_start, x0 = x0_start),
                       control = nls.control(maxiter = 1000))

  logistic_pred <- predict(logistic_model, newdata = data.frame(DOY = x_smooth))
  logistic_r2 <- 1 - sum(residuals(logistic_model)^2) / sum((aggregated_data$mean_ci - mean(aggregated_data$mean_ci))^2)
  cat(paste("Logistic growth R² =", round(logistic_r2, 3), "\n"))
}, error = function(e) {
  cat("Logistic growth model failed to converge\n")
  logistic_model <- NULL
  logistic_pred <- NULL
  logistic_r2 <- NA
})

# 3. Beta function model: good for crop growth curves with clear peak
cat("\n3. Fitting Beta function model...\n")
tryCatch({
  # Normalize DOY to 0-1 range for Beta function
  doy_min <- min(aggregated_data$DOY)
  doy_max <- max(aggregated_data$DOY)
  aggregated_data$doy_norm <- (aggregated_data$DOY - doy_min) / (doy_max - doy_min)

  # Beta function: y = a * (x^(p-1)) * ((1-x)^(q-1)) + c
  beta_model <- nls(mean_ci ~ a * (doy_norm^(p-1)) * ((1-doy_norm)^(q-1)) + c,
                   data = aggregated_data,
                   start = list(a = max(aggregated_data$mean_ci) * 20, p = 2, q = 3, c = min(aggregated_data$mean_ci)),
                   control = nls.control(maxiter = 1000))

  # Predict on normalized scale then convert back
  x_smooth_norm <- (x_smooth - doy_min) / (doy_max - doy_min)
  beta_pred <- predict(beta_model, newdata = data.frame(doy_norm = x_smooth_norm))
  beta_r2 <- 1 - sum(residuals(beta_model)^2) / sum((aggregated_data$mean_ci - mean(aggregated_data$mean_ci))^2)
  cat(paste("Beta function R² =", round(beta_r2, 3), "\n"))
}, error = function(e) {
  cat("Beta function model failed to converge\n")
  beta_model <- NULL
  beta_pred <- NULL
  beta_r2 <- NA
})

# 4. Gaussian (normal) curve: good for symmetric growth patterns
cat("\n4. Fitting Gaussian curve...\n")
tryCatch({
  # Gaussian: y = a * exp(-((x-mu)^2)/(2*sigma^2)) + c
  gaussian_model <- nls(mean_ci ~ a * exp(-((DOY - mu)^2)/(2 * sigma^2)) + c,
                       data = aggregated_data,
                       start = list(a = max(aggregated_data$mean_ci),
                                   mu = aggregated_data$DOY[which.max(aggregated_data$mean_ci)],
                                   sigma = 50,
                                   c = min(aggregated_data$mean_ci)),
                       control = nls.control(maxiter = 1000))

  gaussian_pred <- predict(gaussian_model, newdata = data.frame(DOY = x_smooth))
  gaussian_r2 <- 1 - sum(residuals(gaussian_model)^2) / sum((aggregated_data$mean_ci - mean(aggregated_data$mean_ci))^2)
  cat(paste("Gaussian R² =", round(gaussian_r2, 3), "\n"))
}, error = function(e) {
  cat("Gaussian model failed to converge\n")
  gaussian_model <- NULL
  gaussian_pred <- NULL
  gaussian_r2 <- NA
})

# 5. LOESS (local regression) - for comparison
cat("\n5. Fitting LOESS smoothing...\n")
loess_model <- loess(mean_ci ~ DOY, data = aggregated_data, span = 0.5)
loess_pred <- predict(loess_model, newdata = data.frame(DOY = x_smooth))
loess_r2 <- 1 - sum(residuals(loess_model)^2) / sum((aggregated_data$mean_ci - mean(aggregated_data$mean_ci))^2)
cat(paste("LOESS R² =", round(loess_r2, 3), "\n"))

# Calculate confidence intervals for both models
cat("\nCalculating confidence intervals...\n")

# Function to calculate confidence intervals using residual-based method
calculate_ci <- function(model, data, newdata, alpha = 0.5) {
  # Get model predictions
  pred_vals <- predict(model, newdata = newdata)

  # For parametric models (lm), use built-in prediction intervals
  if(class(model)[1] == "lm") {
    pred_intervals <- predict(model, newdata = newdata, interval = "confidence", level = 1 - alpha)
    return(list(
      lower = pred_intervals[, "lwr"],
      upper = pred_intervals[, "upr"]
    ))
  }

  # For LOESS, calculate confidence intervals using residual bootstrap
  if(class(model)[1] == "loess") {
    # Calculate residuals from the original model
    fitted_vals <- fitted(model)
    residuals_vals <- residuals(model)
    residual_sd <- sd(residuals_vals, na.rm = TRUE)

    # Use normal approximation for confidence intervals
    # For 50% CI, use 67% quantile (approximately 0.67 standard deviations)
    margin <- qnorm(1 - alpha/2) * residual_sd

    return(list(
      lower = pred_vals - margin,
      upper = pred_vals + margin
    ))
  }

  # Fallback method
  residual_sd <- sd(residuals(model), na.rm = TRUE)
  margin <- qnorm(1 - alpha/2) * residual_sd
  return(list(
    lower = pred_vals - margin,
    upper = pred_vals + margin
  ))
}# Calculate CIs for quadratic model using aggregated data (same as model fitting)
quad_ci <- calculate_ci(quad_model, aggregated_data, data.frame(DOY = x_smooth))

# Calculate CIs for LOESS model using aggregated data (same as model fitting)
loess_ci <- calculate_ci(loess_model, aggregated_data, data.frame(DOY = x_smooth))

# Create separate plots for LOESS and Quadratic models
cat("\nCreating LOESS plot with confidence intervals...\n")

# LOESS plot
p_loess <- ggplot(day_30_data, aes(x = DOY, y = ci)) +
  geom_point(color = "lightblue", size = 1.5, alpha = 0.4) +
  geom_point(data = aggregated_data, aes(x = DOY, y = mean_ci),
             color = "darkblue", size = 3, alpha = 0.8) +
  geom_ribbon(data = data.frame(DOY = x_smooth,
                               lower = loess_ci$lower,
                               upper = loess_ci$upper),
              aes(x = DOY, ymin = lower, ymax = upper),
              alpha = 0.3, fill = "purple", inherit.aes = FALSE) +
  geom_line(data = data.frame(DOY = x_smooth, loess = loess_pred),
            aes(x = DOY, y = loess),
            color = "purple", size = 1.5) +
  labs(
    title = "LOESS Model - CI Values Over Growing Season",
    x = "Day of Year (DOY)",
    y = "CI Value",
    subtitle = paste("LOESS R² =", round(loess_r2, 3), "| 50% Confidence Interval")
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(size = 14, face = "bold"),
    axis.title = element_text(size = 12),
    axis.text = element_text(size = 10)
  )

# Find optimal point for LOESS
loess_max_idx <- which.max(loess_pred)
loess_optimal_doy <- x_smooth[loess_max_idx]
loess_optimal_ci <- loess_pred[loess_max_idx]

p_loess <- p_loess +
  geom_point(aes(x = loess_optimal_doy, y = loess_optimal_ci),
             color = "red", size = 5, shape = 8) +
  annotate("text",
           x = loess_optimal_doy + 30,
           y = loess_optimal_ci,
           label = paste("Optimal: DOY", round(loess_optimal_doy, 1), "\nCI =", round(loess_optimal_ci, 3)),
           color = "red", size = 4, fontface = "bold")

print(p_loess)

cat("\nCreating Quadratic plot with confidence intervals...\n")

# Quadratic plot
p_quadratic <- ggplot(day_30_data, aes(x = DOY, y = ci)) +
  geom_point(color = "lightcoral", size = 1.5, alpha = 0.4) +
  geom_point(data = aggregated_data, aes(x = DOY, y = mean_ci),
             color = "darkred", size = 3, alpha = 0.8) +
  geom_ribbon(data = data.frame(DOY = x_smooth,
                               lower = quad_ci$lower,
                               upper = quad_ci$upper),
              aes(x = DOY, ymin = lower, ymax = upper),
              alpha = 0.3, fill = "red", inherit.aes = FALSE) +
  geom_line(data = data.frame(DOY = x_smooth, quadratic = quad_pred),
            aes(x = DOY, y = quadratic),
            color = "red", size = 1.5) +
  labs(
    title = "Quadratic Model - CI Values Over Growing Season",
    x = "Day of Year (DOY)",
    y = "CI Value",
    subtitle = paste("Quadratic R² =", round(quad_r2, 3), "| 50% Confidence Interval")
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(size = 14, face = "bold"),
    axis.title = element_text(size = 12),
    axis.text = element_text(size = 10)
  )

# Find optimal point for Quadratic
quad_coeffs <- coef(quad_model)
a <- quad_coeffs[3]
b <- quad_coeffs[2]

if(a != 0) {
  quad_optimal_doy <- -b / (2*a)
  doy_range <- range(aggregated_data$DOY)
  if(quad_optimal_doy >= doy_range[1] && quad_optimal_doy <= doy_range[2]) {
    quad_optimal_ci <- predict(quad_model, newdata = data.frame(DOY = quad_optimal_doy))
  } else {
    quad_optimal_doy <- x_smooth[which.max(quad_pred)]
    quad_optimal_ci <- max(quad_pred)
  }
} else {
  quad_optimal_doy <- x_smooth[which.max(quad_pred)]
  quad_optimal_ci <- max(quad_pred)
}

p_quadratic <- p_quadratic +
  geom_point(aes(x = quad_optimal_doy, y = quad_optimal_ci),
             color = "darkred", size = 5, shape = 8) +
  annotate("text",
           x = quad_optimal_doy + 30,
           y = quad_optimal_ci,
           label = paste("Optimal: DOY", round(quad_optimal_doy, 1), "\nCI =", round(quad_optimal_ci, 3)),
           color = "darkred", size = 4, fontface = "bold")

print(p_quadratic)
print(p_loess)

# Print results summary
cat("\n=== RESULTS SUMMARY ===\n")
cat(paste("LOESS Model - R² =", round(loess_r2, 3), "\n"))
cat(paste("  Optimal DOY:", round(loess_optimal_doy, 1), "\n"))
cat(paste("  Optimal CI:", round(loess_optimal_ci, 4), "\n\n"))

cat(paste("Quadratic Model - R² =", round(quad_r2, 3), "\n"))
cat(paste("  Optimal DOY:", round(quad_optimal_doy, 1), "\n"))
cat(paste("  Optimal CI:", round(quad_optimal_ci, 4), "\n"))