# Optimal CI Analysis - CI values at 30-day intervals with quadratic fitting
# Author: SmartCane Analysis Team
# Date: 2025-06-12

# Load required libraries ----
library(ggplot2)
library(dplyr)
library(readr)

# Input: cumulative CI values per pivot/model/DOY, saved as an RDS file.
rds_file_path <- "C:/Users/timon/Resilience BV/4020 SCane ESA DEMO - Documenten/General/4020 SCDEMO Team/4020 TechnicalData/WP3/smartcane/laravel_app/storage/app/chemba/Data/extracted_ci/cumulative_vals/All_pivots_Cumulative_CI_quadrant_year_v2.rds"

if (!file.exists(rds_file_path)) {
  stop("RDS file not found at specified path: ", rds_file_path)
}

# Load the data ----
cat("Loading RDS file...\n")
ci_data <- readRDS(rds_file_path)

# Inspect structure so the column assumptions below can be sanity-checked.
cat("Data structure:\n")
str(ci_data)
cat("\nFirst few rows:\n")
head(ci_data)
cat("\nColumn names:\n")
print(colnames(ci_data))

# Filter data ----
cat("\nApplying data filters...\n")

# 1. Keep only models whose series reaches at least DOY 300; models that stop
#    earlier cannot support a full-season curve fit.
model_doy_max <- ci_data %>%
  group_by(model) %>%
  summarise(max_doy = max(DOY, na.rm = TRUE), .groups = "drop")

valid_models <- model_doy_max %>%
  filter(max_doy >= 300) %>%
  pull(model)

cat(paste("Models before filtering:", n_distinct(ci_data$model), "\n"))
cat(paste("Models reaching at least DOY 300:", length(valid_models), "\n"))

ci_data <- ci_data %>% filter(model %in% valid_models)

# 2. Apply IQR filtering per DOY (remove outliers outside the 1.5 * IQR fences).
cat("Applying IQR filtering per DOY...\n")
original_rows <- nrow(ci_data)

ci_data <- ci_data %>%
  group_by(DOY) %>%
  mutate(
    Q1 = quantile(FitData, 0.25, na.rm = TRUE),
    Q3 = quantile(FitData, 0.75, na.rm = TRUE),
    IQR = Q3 - Q1,
    lower_bound = Q1 - 1.5 * IQR,
    upper_bound = Q3 + 1.5 * IQR
  ) %>%
  filter(FitData >= lower_bound & FitData <= upper_bound) %>%
  select(-Q1, -Q3, -IQR, -lower_bound, -upper_bound) %>%
  ungroup()

filtered_rows <- nrow(ci_data)
cat(paste("Rows before IQR filtering:", original_rows, "\n"))
cat(paste("Rows after IQR filtering:", filtered_rows, "\n"))
cat(paste("Removed", original_rows - filtered_rows, "outliers (",
          round(100 * (original_rows - filtered_rows) / original_rows, 1), "%)\n"))

# Check what day values are available after filtering
if ("DOY" %in% colnames(ci_data)) {
  cat("\nUnique DOY values after filtering:\n")
  print(sort(unique(ci_data$DOY)))
} else {
  cat("\nNo 'DOY' column found. Available columns:\n")
  print(colnames(ci_data))
}

# Extract CI values at 30-day intervals ----
# NOTE: despite the script title, the filter below keeps EVERY DOY that is a
# multiple of 30 (30, 60, 90, ...), not only day 30 — the season-long curve
# fits further down need the full range.
cat("\nExtracting day 30 CI values...\n")

day_col <- "DOY"
ci_col <- "FitData"

# Field identifier: prefer an explicit field/pivot column (first match wins,
# same precedence as before), otherwise fall back to "model".
field_candidates <- c("field", "Field", "pivot", "Pivot", "field_id", "Field_ID")
found_fields <- field_candidates[field_candidates %in% colnames(ci_data)]
field_col <- if (length(found_fields) > 0) found_fields[1] else "model"

if (!("DOY" %in% colnames(ci_data))) {
  stop("DOY column not found in data")
}
if (!("FitData" %in% colnames(ci_data))) {
  stop("FitData column not found in data")
}
# BUG FIX: the original tested is.null(field_col), which was unreachable
# because field_col was always initialized to "model". Test existence instead.
if (!(field_col %in% colnames(ci_data))) {
  cat("Could not automatically identify field column. Please check column names.\n")
  cat("Available columns: ", paste(colnames(ci_data), collapse = ", "), "\n")
  stop("Manual field column identification required")
}

cat(paste("Using columns - DOY:", day_col, "Field:", field_col, "FitData:", ci_col, "\n"))

# Extract data at 30-day intervals
day_30_data <- ci_data %>%
  filter(DOY %% 30 == 0) %>%
  select(field = !!sym(field_col), ci = !!sym(ci_col), DOY) %>%
  na.omit() %>%
  arrange(field)

cat(paste("Found", nrow(day_30_data), "fields with day 30 CI values\n"))

if (nrow(day_30_data) == 0) {
  stop("No data found for day 30. Check if day 30 exists in the dataset.")
}

cat("\nSummary of day 30 CI values:\n")
print(summary(day_30_data))

# Scatter plot of all retained points ----
p1 <- ggplot(day_30_data, aes(x = DOY, y = ci)) +
  geom_point(color = "blue", size = 3, alpha = 0.7) +
  labs(
    title = "CI Values at Day 30 for All Fields",
    x = "Day of Year (DOY)",
    y = "CI Value",
    subtitle = paste("Total fields:", nrow(day_30_data))
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(size = 14, face = "bold"),
    axis.title = element_text(size = 12),
    axis.text = element_text(size = 10)
  )
print(p1)

# Curve fitting ----
cat("\nTrying different curve fitting approaches...\n")

# Aggregate per DOY (mean CI) to reduce noise before fitting.
aggregated_data <- day_30_data %>%
  group_by(DOY) %>%
  summarise(
    mean_ci = mean(ci, na.rm = TRUE),
    median_ci = median(ci, na.rm = TRUE),
    count = n(),
    .groups = "drop"
  ) %>%
  filter(count >= 5)  # only keep DOY values with sufficient data points

cat(paste("Aggregated to", nrow(aggregated_data), "DOY points with sufficient data\n"))

# Prediction grid for smooth curves
x_smooth <- seq(min(aggregated_data$DOY), max(aggregated_data$DOY), length.out = 100)

# Pseudo-R² for nls/loess fits: 1 - SSE/SST on the aggregated means.
pseudo_r2 <- function(model, y) {
  1 - sum(residuals(model)^2) / sum((y - mean(y))^2)
}

# 1. Quadratic model (as requested)
cat("\n1. Fitting quadratic model...\n")
quad_model <- lm(mean_ci ~ poly(DOY, 2, raw = TRUE), data = aggregated_data)
quad_pred <- predict(quad_model, newdata = data.frame(DOY = x_smooth))
quad_r2 <- summary(quad_model)$r.squared
cat(paste("Quadratic R² =", round(quad_r2, 3), "\n"))

# BUG FIX (sections 2-4): the original assigned NULL/NA inside the tryCatch
# *error handler*, which only modifies the handler's local environment — on a
# failed fit, the *_model / *_pred / *_r2 variables were never defined at all.
# tryCatch now RETURNS the fitted model (or NULL on error), and the derived
# values are computed conditionally.

# 2. Logistic growth model: y = K / (1 + exp(-r*(x-x0))) - biologically realistic
cat("\n2. Fitting logistic growth model...\n")
logistic_model <- tryCatch({
  K_start <- max(aggregated_data$mean_ci) * 1.1   # carrying capacity
  r_start <- 0.05                                 # growth rate
  x0_start <- mean(aggregated_data$DOY)           # inflection point
  nls(mean_ci ~ K / (1 + exp(-r * (DOY - x0))),
      data = aggregated_data,
      start = list(K = K_start, r = r_start, x0 = x0_start),
      control = nls.control(maxiter = 1000))
}, error = function(e) NULL)

if (!is.null(logistic_model)) {
  logistic_pred <- predict(logistic_model, newdata = data.frame(DOY = x_smooth))
  logistic_r2 <- pseudo_r2(logistic_model, aggregated_data$mean_ci)
  cat(paste("Logistic growth R² =", round(logistic_r2, 3), "\n"))
} else {
  cat("Logistic growth model failed to converge\n")
  logistic_pred <- NULL
  logistic_r2 <- NA
}

# 3. Beta function model: good for crop growth curves with clear peak
cat("\n3. Fitting Beta function model...\n")
# Normalize DOY to the 0-1 range required by the Beta form.
doy_min <- min(aggregated_data$DOY)
doy_max <- max(aggregated_data$DOY)
aggregated_data$doy_norm <- (aggregated_data$DOY - doy_min) / (doy_max - doy_min)

# Beta function: y = a * (x^(p-1)) * ((1-x)^(q-1)) + c
beta_model <- tryCatch({
  nls(mean_ci ~ a * (doy_norm^(p - 1)) * ((1 - doy_norm)^(q - 1)) + c,
      data = aggregated_data,
      start = list(a = max(aggregated_data$mean_ci) * 20,
                   p = 2, q = 3,
                   c = min(aggregated_data$mean_ci)),
      control = nls.control(maxiter = 1000))
}, error = function(e) NULL)

if (!is.null(beta_model)) {
  # Predict on the normalized scale, mapped from the DOY grid.
  x_smooth_norm <- (x_smooth - doy_min) / (doy_max - doy_min)
  beta_pred <- predict(beta_model, newdata = data.frame(doy_norm = x_smooth_norm))
  beta_r2 <- pseudo_r2(beta_model, aggregated_data$mean_ci)
  cat(paste("Beta function R² =", round(beta_r2, 3), "\n"))
} else {
  cat("Beta function model failed to converge\n")
  beta_pred <- NULL
  beta_r2 <- NA
}

# 4. Gaussian (normal) curve: good for symmetric growth patterns
cat("\n4. Fitting Gaussian curve...\n")
# Gaussian: y = a * exp(-((x-mu)^2)/(2*sigma^2)) + c
gaussian_model <- tryCatch({
  nls(mean_ci ~ a * exp(-((DOY - mu)^2) / (2 * sigma^2)) + c,
      data = aggregated_data,
      start = list(a = max(aggregated_data$mean_ci),
                   mu = aggregated_data$DOY[which.max(aggregated_data$mean_ci)],
                   sigma = 50,
                   c = min(aggregated_data$mean_ci)),
      control = nls.control(maxiter = 1000))
}, error = function(e) NULL)

if (!is.null(gaussian_model)) {
  gaussian_pred <- predict(gaussian_model, newdata = data.frame(DOY = x_smooth))
  gaussian_r2 <- pseudo_r2(gaussian_model, aggregated_data$mean_ci)
  cat(paste("Gaussian R² =", round(gaussian_r2, 3), "\n"))
} else {
  cat("Gaussian model failed to converge\n")
  gaussian_pred <- NULL
  gaussian_r2 <- NA
}

# 5. LOESS (local regression) - for comparison
cat("\n5. Fitting LOESS smoothing...\n")
loess_model <- loess(mean_ci ~ DOY, data = aggregated_data, span = 0.5)
loess_pred <- predict(loess_model, newdata = data.frame(DOY = x_smooth))
loess_r2 <- pseudo_r2(loess_model, aggregated_data$mean_ci)
cat(paste("LOESS R² =", round(loess_r2, 3), "\n"))

# Confidence intervals ----
cat("\nCalculating confidence intervals...\n")

# Pointwise confidence band for a fitted model over `newdata`.
#   model   - fitted model object (lm, loess, or anything with residuals()).
#   data    - kept for interface compatibility; not used (the band for lm comes
#             from predict(), and for other models from the residual SD).
#   newdata - data.frame of predictor values to evaluate the band at.
#   alpha   - two-sided tail mass; the default 0.5 yields a 50% interval.
# Returns list(lower, upper).
# lm gets the exact predict() confidence interval; loess and any other model
# get a symmetric normal-approximation band from the residual SD (the original
# had separate loess and fallback branches with byte-identical code — merged).
calculate_ci <- function(model, data, newdata, alpha = 0.5) {
  pred_vals <- predict(model, newdata = newdata)

  if (inherits(model, "lm")) {
    pred_intervals <- predict(model, newdata = newdata,
                              interval = "confidence", level = 1 - alpha)
    return(list(
      lower = pred_intervals[, "lwr"],
      upper = pred_intervals[, "upr"]
    ))
  }

  residual_sd <- sd(residuals(model), na.rm = TRUE)
  margin <- qnorm(1 - alpha / 2) * residual_sd
  list(
    lower = pred_vals - margin,
    upper = pred_vals + margin
  )
}

# Calculate CIs using the same aggregated data the models were fitted on.
quad_ci <- calculate_ci(quad_model, aggregated_data, data.frame(DOY = x_smooth))
loess_ci <- calculate_ci(loess_model, aggregated_data, data.frame(DOY = x_smooth))

# LOESS plot ----
cat("\nCreating LOESS plot with confidence intervals...\n")
p_loess <- ggplot(day_30_data, aes(x = DOY, y = ci)) +
  geom_point(color = "lightblue", size = 1.5, alpha = 0.4) +
  geom_point(data = aggregated_data, aes(x = DOY, y = mean_ci),
             color = "darkblue", size = 3, alpha = 0.8) +
  geom_ribbon(data = data.frame(DOY = x_smooth,
                                lower = loess_ci$lower,
                                upper = loess_ci$upper),
              aes(x = DOY, ymin = lower, ymax = upper),
              alpha = 0.3, fill = "purple", inherit.aes = FALSE) +
  geom_line(data = data.frame(DOY = x_smooth, loess = loess_pred),
            aes(x = DOY, y = loess), color = "purple", size = 1.5) +
  labs(
    title = "LOESS Model - CI Values Over Growing Season",
    x = "Day of Year (DOY)",
    y = "CI Value",
    subtitle = paste("LOESS R² =", round(loess_r2, 3), "| 50% Confidence Interval")
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(size = 14, face = "bold"),
    axis.title = element_text(size = 12),
    axis.text = element_text(size = 10)
  )

# Peak of the LOESS curve on the prediction grid.
loess_max_idx <- which.max(loess_pred)
loess_optimal_doy <- x_smooth[loess_max_idx]
loess_optimal_ci <- loess_pred[loess_max_idx]

# BUG FIX: annotate("point") draws the marker exactly once; the original
# geom_point(aes(x = const, y = const)) inherited day_30_data and overplotted
# one identical point per data row.
p_loess <- p_loess +
  annotate("point", x = loess_optimal_doy, y = loess_optimal_ci,
           color = "red", size = 5, shape = 8) +
  annotate("text", x = loess_optimal_doy + 30, y = loess_optimal_ci,
           label = paste("Optimal: DOY", round(loess_optimal_doy, 1),
                         "\nCI =", round(loess_optimal_ci, 3)),
           color = "red", size = 4, fontface = "bold")
print(p_loess)

# Quadratic plot ----
cat("\nCreating Quadratic plot with confidence intervals...\n")
p_quadratic <- ggplot(day_30_data, aes(x = DOY, y = ci)) +
  geom_point(color = "lightcoral", size = 1.5, alpha = 0.4) +
  geom_point(data = aggregated_data, aes(x = DOY, y = mean_ci),
             color = "darkred", size = 3, alpha = 0.8) +
  geom_ribbon(data = data.frame(DOY = x_smooth,
                                lower = quad_ci$lower,
                                upper = quad_ci$upper),
              aes(x = DOY, ymin = lower, ymax = upper),
              alpha = 0.3, fill = "red", inherit.aes = FALSE) +
  geom_line(data = data.frame(DOY = x_smooth, quadratic = quad_pred),
            aes(x = DOY, y = quadratic), color = "red", size = 1.5) +
  labs(
    title = "Quadratic Model - CI Values Over Growing Season",
    x = "Day of Year (DOY)",
    y = "CI Value",
    subtitle = paste("Quadratic R² =", round(quad_r2, 3), "| 50% Confidence Interval")
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(size = 14, face = "bold"),
    axis.title = element_text(size = 12),
    axis.text = element_text(size = 10)
  )

# Optimal point for the quadratic: analytic vertex -b/(2a) (valid because the
# fit uses raw polynomial coefficients), falling back to the grid maximum when
# the vertex lies outside the observed DOY range or the curve is degenerate.
quad_coeffs <- coef(quad_model)
a <- quad_coeffs[3]  # x^2 coefficient
b <- quad_coeffs[2]  # x coefficient
if (a != 0) {
  quad_optimal_doy <- -b / (2 * a)
  doy_range <- range(aggregated_data$DOY)
  if (quad_optimal_doy >= doy_range[1] && quad_optimal_doy <= doy_range[2]) {
    quad_optimal_ci <- predict(quad_model, newdata = data.frame(DOY = quad_optimal_doy))
  } else {
    quad_optimal_doy <- x_smooth[which.max(quad_pred)]
    quad_optimal_ci <- max(quad_pred)
  }
} else {
  quad_optimal_doy <- x_smooth[which.max(quad_pred)]
  quad_optimal_ci <- max(quad_pred)
}

p_quadratic <- p_quadratic +
  annotate("point", x = quad_optimal_doy, y = quad_optimal_ci,
           color = "darkred", size = 5, shape = 8) +
  annotate("text", x = quad_optimal_doy + 30, y = quad_optimal_ci,
           label = paste("Optimal: DOY", round(quad_optimal_doy, 1),
                         "\nCI =", round(quad_optimal_ci, 3)),
           color = "darkred", size = 4, fontface = "bold")
print(p_quadratic)
# Re-display the LOESS plot last so the two fits can be compared side by side.
print(p_loess)

# Results summary ----
cat("\n=== RESULTS SUMMARY ===\n")
cat(paste("LOESS Model - R² =", round(loess_r2, 3), "\n"))
cat(paste(" Optimal DOY:", round(loess_optimal_doy, 1), "\n"))
cat(paste(" Optimal CI:", round(loess_optimal_ci, 4), "\n\n"))
cat(paste("Quadratic Model - R² =", round(quad_r2, 3), "\n"))
cat(paste(" Optimal DOY:", round(quad_optimal_doy, 1), "\n"))
cat(paste(" Optimal CI:", round(quad_optimal_ci, 4), "\n"))