SmartCane/r_app/experiments/optimal_ci_analysis.R
Timon 6efcc8cfec Fix CI report pipeline: update tmap v4 syntax, add continuous color scales, fix formatting
- Updated all CI maps to use tm_scale_continuous() for proper tmap v4 compatibility
- Added fixed color scale limits (1-8 for CI, -3 to +3 for differences) for consistent field comparison
- Fixed YAML header formatting issues in CI_report_dashboard_planet.Rmd
- Positioned RGB map before CI overview map as requested
- Removed all obsolete use_breaks parameter references
- Enhanced error handling and logging throughout the pipeline
- Added new experimental analysis scripts and improvements to mosaic creation
2025-06-19 20:37:20 +02:00

422 lines
15 KiB
R

# Optimal CI Analysis - Day 30 CI values with quadratic fitting
# Author: SmartCane Analysis Team
# Date: 2025-06-12
# Load required libraries
library(ggplot2)
library(dplyr)
library(readr)
# Set file path
rds_file_path <- "C:/Users/timon/Resilience BV/4020 SCane ESA DEMO - Documenten/General/4020 SCDEMO Team/4020 TechnicalData/WP3/smartcane/laravel_app/storage/app/chemba/Data/extracted_ci/cumulative_vals/All_pivots_Cumulative_CI_quadrant_year_v2.rds"
# Check if file exists
if (!file.exists(rds_file_path)) {
stop("RDS file not found at specified path: ", rds_file_path)
}
# Load the data
cat("Loading RDS file...\n")
ci_data <- readRDS(rds_file_path)
# Display structure of the data to understand it better
cat("Data structure:\n")
str(ci_data)
cat("\nFirst few rows:\n")
head(ci_data)
cat("\nColumn names:\n")
print(colnames(ci_data))
# Filter data based on requirements
cat("\nApplying data filters...\n")
# 1. Filter out models that don't reach at least DOY 300
model_doy_max <- ci_data %>%
group_by(model) %>%
summarise(max_doy = max(DOY, na.rm = TRUE), .groups = 'drop')
valid_models <- model_doy_max %>%
filter(max_doy >= 300) %>%
pull(model)
cat(paste("Models before filtering:", n_distinct(ci_data$model), "\n"))
cat(paste("Models reaching at least DOY 300:", length(valid_models), "\n"))
ci_data <- ci_data %>%
filter(model %in% valid_models)
# 2. Apply IQR filtering per DOY (remove outliers outside IQR)
cat("Applying IQR filtering per DOY...\n")
original_rows <- nrow(ci_data)
ci_data <- ci_data %>%
group_by(DOY) %>%
mutate(
Q1 = quantile(FitData, 0.25, na.rm = TRUE),
Q3 = quantile(FitData, 0.75, na.rm = TRUE),
IQR = Q3 - Q1,
lower_bound = Q1 - 1.5 * IQR,
upper_bound = Q3 + 1.5 * IQR
) %>%
filter(FitData >= lower_bound & FitData <= upper_bound) %>%
select(-Q1, -Q3, -IQR, -lower_bound, -upper_bound) %>%
ungroup()
filtered_rows <- nrow(ci_data)
cat(paste("Rows before IQR filtering:", original_rows, "\n"))
cat(paste("Rows after IQR filtering:", filtered_rows, "\n"))
cat(paste("Removed", original_rows - filtered_rows, "outliers (",
round(100 * (original_rows - filtered_rows) / original_rows, 1), "%)\n"))
# Check what day values are available after filtering
if ("DOY" %in% colnames(ci_data)) {
cat("\nUnique DOY values after filtering:\n")
print(sort(unique(ci_data$DOY)))
} else {
cat("\nNo 'DOY' column found. Available columns:\n")
print(colnames(ci_data))
}
# Extract CI values at day 30 for each field
cat("\nExtracting day 30 CI values...\n")
# Set column names based on known structure
day_col <- "DOY"
ci_col <- "FitData"
field_col <- "model"
# Try different possible field column name combinations
if ("field" %in% colnames(ci_data)) {
field_col <- "field"
} else if ("Field" %in% colnames(ci_data)) {
field_col <- "Field"
} else if ("pivot" %in% colnames(ci_data)) {
field_col <- "pivot"
} else if ("Pivot" %in% colnames(ci_data)) {
field_col <- "Pivot"
} else if ("field_id" %in% colnames(ci_data)) {
field_col <- "field_id"
} else if ("Field_ID" %in% colnames(ci_data)) {
field_col <- "Field_ID"
}
# Check if we found the required columns
if (!("DOY" %in% colnames(ci_data))) {
stop("DOY column not found in data")
}
if (!("FitData" %in% colnames(ci_data))) {
stop("FitData column not found in data")
}
if (is.null(field_col)) {
cat("Could not automatically identify field column. Please check column names.\n")
cat("Available columns: ", paste(colnames(ci_data), collapse = ", "), "\n")
stop("Manual field column identification required")
}
cat(paste("Using columns - DOY:", day_col, "Field:", field_col, "FitData:", ci_col, "\n"))
# Extract day 30 data
day_30_data <- ci_data %>%
filter(DOY %% 30 == 0) %>%
select(field = !!sym(field_col), ci = !!sym(ci_col), DOY) %>%
na.omit() %>%
arrange(field)
cat(paste("Found", nrow(day_30_data), "fields with day 30 CI values\n"))
if (nrow(day_30_data) == 0) {
stop("No data found for day 30. Check if day 30 exists in the dataset.")
}
# Display summary of day 30 data
cat("\nSummary of day 30 CI values:\n")
print(summary(day_30_data))
# Add field index for plotting (assuming fields represent some spatial or sequential order)
#day_30_data$field_index <- 1:nrow(day_30_data)
# Create scatter plot
p1 <- ggplot(day_30_data, aes(x = DOY, y = ci)) +
geom_point(color = "blue", size = 3, alpha = 0.7) +
labs(
title = "CI Values at Day 30 for All Fields",
x = "Day of Year (DOY)",
y = "CI Value",
subtitle = paste("Total fields:", nrow(day_30_data))
) +
theme_minimal() +
theme(
plot.title = element_text(size = 14, face = "bold"),
axis.title = element_text(size = 12),
axis.text = element_text(size = 10)
)
print(p1)
# Try multiple curve fitting approaches
cat("\nTrying different curve fitting approaches...\n")
# Aggregate data by DOY (take mean CI for each DOY to reduce noise)
aggregated_data <- day_30_data %>%
group_by(DOY) %>%
summarise(
mean_ci = mean(ci, na.rm = TRUE),
median_ci = median(ci, na.rm = TRUE),
count = n(),
.groups = 'drop'
) %>%
filter(count >= 5) # Only keep DOY values with sufficient data points
cat(paste("Aggregated to", nrow(aggregated_data), "DOY points with sufficient data\n"))
# Create prediction data for smooth curves
x_smooth <- seq(min(aggregated_data$DOY), max(aggregated_data$DOY), length.out = 100)
# 1. Quadratic model (as requested)
cat("\n1. Fitting quadratic model...\n")
quad_model <- lm(mean_ci ~ poly(DOY, 2, raw = TRUE), data = aggregated_data)
quad_pred <- predict(quad_model, newdata = data.frame(DOY = x_smooth))
quad_r2 <- summary(quad_model)$r.squared
cat(paste("Quadratic R² =", round(quad_r2, 3), "\n"))
# 2. Logistic growth model: y = K / (1 + exp(-r*(x-x0))) - biologically realistic
cat("\n2. Fitting logistic growth model...\n")
tryCatch({
# Estimate starting values
K_start <- max(aggregated_data$mean_ci) * 1.1 # Carrying capacity
r_start <- 0.05 # Growth rate
x0_start <- mean(aggregated_data$DOY) # Inflection point
logistic_model <- nls(mean_ci ~ K / (1 + exp(-r * (DOY - x0))),
data = aggregated_data,
start = list(K = K_start, r = r_start, x0 = x0_start),
control = nls.control(maxiter = 1000))
logistic_pred <- predict(logistic_model, newdata = data.frame(DOY = x_smooth))
logistic_r2 <- 1 - sum(residuals(logistic_model)^2) / sum((aggregated_data$mean_ci - mean(aggregated_data$mean_ci))^2)
cat(paste("Logistic growth R² =", round(logistic_r2, 3), "\n"))
}, error = function(e) {
cat("Logistic growth model failed to converge\n")
logistic_model <- NULL
logistic_pred <- NULL
logistic_r2 <- NA
})
# 3. Beta function model: good for crop growth curves with clear peak
cat("\n3. Fitting Beta function model...\n")
tryCatch({
# Normalize DOY to 0-1 range for Beta function
doy_min <- min(aggregated_data$DOY)
doy_max <- max(aggregated_data$DOY)
aggregated_data$doy_norm <- (aggregated_data$DOY - doy_min) / (doy_max - doy_min)
# Beta function: y = a * (x^(p-1)) * ((1-x)^(q-1)) + c
beta_model <- nls(mean_ci ~ a * (doy_norm^(p-1)) * ((1-doy_norm)^(q-1)) + c,
data = aggregated_data,
start = list(a = max(aggregated_data$mean_ci) * 20, p = 2, q = 3, c = min(aggregated_data$mean_ci)),
control = nls.control(maxiter = 1000))
# Predict on normalized scale then convert back
x_smooth_norm <- (x_smooth - doy_min) / (doy_max - doy_min)
beta_pred <- predict(beta_model, newdata = data.frame(doy_norm = x_smooth_norm))
beta_r2 <- 1 - sum(residuals(beta_model)^2) / sum((aggregated_data$mean_ci - mean(aggregated_data$mean_ci))^2)
cat(paste("Beta function R² =", round(beta_r2, 3), "\n"))
}, error = function(e) {
cat("Beta function model failed to converge\n")
beta_model <- NULL
beta_pred <- NULL
beta_r2 <- NA
})
# 4. Gaussian (normal) curve: good for symmetric growth patterns
cat("\n4. Fitting Gaussian curve...\n")
tryCatch({
# Gaussian: y = a * exp(-((x-mu)^2)/(2*sigma^2)) + c
gaussian_model <- nls(mean_ci ~ a * exp(-((DOY - mu)^2)/(2 * sigma^2)) + c,
data = aggregated_data,
start = list(a = max(aggregated_data$mean_ci),
mu = aggregated_data$DOY[which.max(aggregated_data$mean_ci)],
sigma = 50,
c = min(aggregated_data$mean_ci)),
control = nls.control(maxiter = 1000))
gaussian_pred <- predict(gaussian_model, newdata = data.frame(DOY = x_smooth))
gaussian_r2 <- 1 - sum(residuals(gaussian_model)^2) / sum((aggregated_data$mean_ci - mean(aggregated_data$mean_ci))^2)
cat(paste("Gaussian R² =", round(gaussian_r2, 3), "\n"))
}, error = function(e) {
cat("Gaussian model failed to converge\n")
gaussian_model <- NULL
gaussian_pred <- NULL
gaussian_r2 <- NA
})
# 5. LOESS (local regression) - for comparison
cat("\n5. Fitting LOESS smoothing...\n")
loess_model <- loess(mean_ci ~ DOY, data = aggregated_data, span = 0.5)
loess_pred <- predict(loess_model, newdata = data.frame(DOY = x_smooth))
loess_r2 <- 1 - sum(residuals(loess_model)^2) / sum((aggregated_data$mean_ci - mean(aggregated_data$mean_ci))^2)
cat(paste("LOESS R² =", round(loess_r2, 3), "\n"))
# Calculate confidence intervals for both models
cat("\nCalculating confidence intervals...\n")
# Function to calculate confidence intervals using residual-based method
calculate_ci <- function(model, data, newdata, alpha = 0.5) {
# Get model predictions
pred_vals <- predict(model, newdata = newdata)
# For parametric models (lm), use built-in prediction intervals
if(class(model)[1] == "lm") {
pred_intervals <- predict(model, newdata = newdata, interval = "confidence", level = 1 - alpha)
return(list(
lower = pred_intervals[, "lwr"],
upper = pred_intervals[, "upr"]
))
}
# For LOESS, calculate confidence intervals using residual bootstrap
if(class(model)[1] == "loess") {
# Calculate residuals from the original model
fitted_vals <- fitted(model)
residuals_vals <- residuals(model)
residual_sd <- sd(residuals_vals, na.rm = TRUE)
# Use normal approximation for confidence intervals
# For 50% CI, use 67% quantile (approximately 0.67 standard deviations)
margin <- qnorm(1 - alpha/2) * residual_sd
return(list(
lower = pred_vals - margin,
upper = pred_vals + margin
))
}
# Fallback method
residual_sd <- sd(residuals(model), na.rm = TRUE)
margin <- qnorm(1 - alpha/2) * residual_sd
return(list(
lower = pred_vals - margin,
upper = pred_vals + margin
))
}# Calculate CIs for quadratic model using aggregated data (same as model fitting)
quad_ci <- calculate_ci(quad_model, aggregated_data, data.frame(DOY = x_smooth))
# Calculate CIs for LOESS model using aggregated data (same as model fitting)
loess_ci <- calculate_ci(loess_model, aggregated_data, data.frame(DOY = x_smooth))
# Create separate plots for LOESS and Quadratic models
cat("\nCreating LOESS plot with confidence intervals...\n")
# LOESS plot
p_loess <- ggplot(day_30_data, aes(x = DOY, y = ci)) +
geom_point(color = "lightblue", size = 1.5, alpha = 0.4) +
geom_point(data = aggregated_data, aes(x = DOY, y = mean_ci),
color = "darkblue", size = 3, alpha = 0.8) +
geom_ribbon(data = data.frame(DOY = x_smooth,
lower = loess_ci$lower,
upper = loess_ci$upper),
aes(x = DOY, ymin = lower, ymax = upper),
alpha = 0.3, fill = "purple", inherit.aes = FALSE) +
geom_line(data = data.frame(DOY = x_smooth, loess = loess_pred),
aes(x = DOY, y = loess),
color = "purple", size = 1.5) +
labs(
title = "LOESS Model - CI Values Over Growing Season",
x = "Day of Year (DOY)",
y = "CI Value",
subtitle = paste("LOESS R² =", round(loess_r2, 3), "| 50% Confidence Interval")
) +
theme_minimal() +
theme(
plot.title = element_text(size = 14, face = "bold"),
axis.title = element_text(size = 12),
axis.text = element_text(size = 10)
)
# Find optimal point for LOESS
loess_max_idx <- which.max(loess_pred)
loess_optimal_doy <- x_smooth[loess_max_idx]
loess_optimal_ci <- loess_pred[loess_max_idx]
p_loess <- p_loess +
geom_point(aes(x = loess_optimal_doy, y = loess_optimal_ci),
color = "red", size = 5, shape = 8) +
annotate("text",
x = loess_optimal_doy + 30,
y = loess_optimal_ci,
label = paste("Optimal: DOY", round(loess_optimal_doy, 1), "\nCI =", round(loess_optimal_ci, 3)),
color = "red", size = 4, fontface = "bold")
print(p_loess)
cat("\nCreating Quadratic plot with confidence intervals...\n")
# Quadratic plot
p_quadratic <- ggplot(day_30_data, aes(x = DOY, y = ci)) +
geom_point(color = "lightcoral", size = 1.5, alpha = 0.4) +
geom_point(data = aggregated_data, aes(x = DOY, y = mean_ci),
color = "darkred", size = 3, alpha = 0.8) +
geom_ribbon(data = data.frame(DOY = x_smooth,
lower = quad_ci$lower,
upper = quad_ci$upper),
aes(x = DOY, ymin = lower, ymax = upper),
alpha = 0.3, fill = "red", inherit.aes = FALSE) +
geom_line(data = data.frame(DOY = x_smooth, quadratic = quad_pred),
aes(x = DOY, y = quadratic),
color = "red", size = 1.5) +
labs(
title = "Quadratic Model - CI Values Over Growing Season",
x = "Day of Year (DOY)",
y = "CI Value",
subtitle = paste("Quadratic R² =", round(quad_r2, 3), "| 50% Confidence Interval")
) +
theme_minimal() +
theme(
plot.title = element_text(size = 14, face = "bold"),
axis.title = element_text(size = 12),
axis.text = element_text(size = 10)
)
# Find optimal point for Quadratic
quad_coeffs <- coef(quad_model)
a <- quad_coeffs[3]
b <- quad_coeffs[2]
if(a != 0) {
quad_optimal_doy <- -b / (2*a)
doy_range <- range(aggregated_data$DOY)
if(quad_optimal_doy >= doy_range[1] && quad_optimal_doy <= doy_range[2]) {
quad_optimal_ci <- predict(quad_model, newdata = data.frame(DOY = quad_optimal_doy))
} else {
quad_optimal_doy <- x_smooth[which.max(quad_pred)]
quad_optimal_ci <- max(quad_pred)
}
} else {
quad_optimal_doy <- x_smooth[which.max(quad_pred)]
quad_optimal_ci <- max(quad_pred)
}
p_quadratic <- p_quadratic +
geom_point(aes(x = quad_optimal_doy, y = quad_optimal_ci),
color = "darkred", size = 5, shape = 8) +
annotate("text",
x = quad_optimal_doy + 30,
y = quad_optimal_ci,
label = paste("Optimal: DOY", round(quad_optimal_doy, 1), "\nCI =", round(quad_optimal_ci, 3)),
color = "darkred", size = 4, fontface = "bold")
print(p_quadratic)
print(p_loess)
# Print results summary
cat("\n=== RESULTS SUMMARY ===\n")
cat(paste("LOESS Model - R² =", round(loess_r2, 3), "\n"))
cat(paste(" Optimal DOY:", round(loess_optimal_doy, 1), "\n"))
cat(paste(" Optimal CI:", round(loess_optimal_ci, 4), "\n\n"))
cat(paste("Quadratic Model - R² =", round(quad_r2, 3), "\n"))
cat(paste(" Optimal DOY:", round(quad_optimal_doy, 1), "\n"))
cat(paste(" Optimal CI:", round(quad_optimal_ci, 4), "\n"))