688 lines
25 KiB
R
688 lines
25 KiB
R
# 12_TEMPORAL_YIELD_FORECASTING.R
# ==================================
# Progressive yield forecasting using full CI time series.
#
# Predicts yield at multiple time points (DOY 300, 330, 360, 390) to show
# how forecast accuracy improves as the season progresses.
#
# Key differences from 11_yield_prediction_comparison.R:
# - Uses FULL CI time series, not just final cumulative value
# - Creates sequential features (lagged values, rolling statistics)
# - Trains separate models for each forecast horizon
# - Visualizes prediction improvement over time
#
# Usage: Rscript 12_temporal_yield_forecasting.R [project_dir]

# 1. Load required libraries
# -------------------------
# Startup messages are suppressed to keep Rscript logs clean.
suppressPackageStartupMessages({
  library(here)         # project-root-relative paths
  library(sf)           # NOTE(review): no sf call is visible in this script -- confirm before removing
  library(dplyr)        # data manipulation verbs
  library(tidyr)        # fill(), drop_na(), pivot_longer()
  library(lubridate)    # NOTE(review): no lubridate call is visible in this script -- confirm before removing
  library(readr)
  library(readxl)       # reads the Excel yield workbook
  library(caret)        # model training / cross-validation
  library(CAST) # For ffs (Forward Feature Selection)
  library(randomForest) # backend for caret method = "rf"
  library(ggplot2)
  library(gridExtra)    # grid.arrange / tableGrob for combined figures
  library(purrr)
})
||
# 2. Helper Functions
# -----------------

#' Write a timestamped log line to stdout.
#'
#' @param message Text to log.
#' @param level Severity label prefixed to the message (default "INFO").
safe_log <- function(message, level = "INFO") {
  stamp <- format(Sys.time(), "%Y-%m-%d %H:%M:%S")
  cat("[", stamp, "] ", level, ": ", message, "\n", sep = "")
}
||
#' Calculate temporal features from a cumulative CI time series
#'
#' Summarises one field-season's cumulative canopy-index (CI) series using
#' only observations available up to `target_doy`, so the resulting row can
#' feed an "as of this date" forecast model.
#'
#' @param ci_data Data frame (or tibble) for a single field-season with
#'   numeric columns `DOY` and `cumulative_CI`.
#' @param target_doy Day-of-year cutoff; only rows with DOY <= target_doy
#'   contribute to the features.
#' @return A one-row data.frame of features, or NULL when fewer than 5
#'   observations exist before the cutoff.
calculate_temporal_features <- function(ci_data, target_doy) {
  # Keep observations at or before the cutoff, ordered by DOY.
  # which() drops NA DOYs, matching dplyr::filter() semantics; base
  # subsetting keeps this helper free of package dependencies.
  keep <- which(ci_data$DOY <= target_doy)
  data_up_to_target <- ci_data[keep, , drop = FALSE]
  data_up_to_target <- data_up_to_target[order(data_up_to_target$DOY), , drop = FALSE]

  n_obs <- nrow(data_up_to_target)
  if (n_obs < 5) {
    return(NULL) # Not enough data to characterize the trajectory
  }

  latest_CI <- tail(data_up_to_target$cumulative_CI, 1)

  # Change in cumulative CI over the last 30 *observations*.
  # NOTE(review): despite the "_30d" name this window is row-based, not a
  # true 30-day DOY window -- confirm observations are (roughly) daily.
  # Plain if/else replaces the previous scalar ifelse() (anti-pattern).
  if (n_obs >= 30) {
    recent_CI_30d <- latest_CI -
      data_up_to_target$cumulative_CI[max(1, n_obs - 30)]
  } else {
    recent_CI_30d <- NA
  }

  # Linear trend of cumulative CI vs DOY (units: CI per day). unname()
  # reproduces the old ifelse() behavior of dropping the "DOY" coefficient
  # name (a named value would change the data.frame row name).
  if (n_obs > 2) {
    CI_growth_rate <- unname(coef(lm(cumulative_CI ~ DOY, data = data_up_to_target))[2])
  } else {
    CI_growth_rate <- NA
  }

  data.frame(
    # Current state
    current_CI = latest_CI,
    current_DOY = target_doy,
    days_observed = n_obs,

    # Growth metrics
    total_CI_accumulated = latest_CI,
    avg_CI_per_day = latest_CI / target_doy,

    # Recent growth (last 30 observations; NA when fewer than 30 rows)
    recent_CI_30d = recent_CI_30d,

    # Growth trajectory (OLS slope)
    CI_growth_rate = CI_growth_rate,

    # Early season performance (up to DOY 150).
    # NOTE(review): this sums the *cumulative* series, which weights later
    # observations heavily -- confirm this is intended (vs. e.g. max()).
    early_season_CI = sum(data_up_to_target$cumulative_CI[data_up_to_target$DOY <= 150]),

    # Growth stability: spread of the day-to-day increments
    CI_variability = sd(diff(data_up_to_target$cumulative_CI), na.rm = TRUE)
  )
}
||
|
||
#' Build a predicted-vs-actual scatter plot for one forecast horizon.
#'
#' @param predicted Numeric vector of model predictions (t/ha).
#' @param actual Numeric vector of observed TCH values (t/ha).
#' @param forecast_doy Day-of-year of the forecast, shown in the title.
#' @param metrics List with RMSE, MAE, R2 and n (as returned by
#'   calculate_metrics), rendered in the subtitle.
#' @param title_suffix Optional text appended to the plot title.
#' @return A ggplot object, or NULL when no complete prediction/actual
#'   pairs remain after dropping NAs.
create_forecast_plot <- function(predicted, actual, forecast_doy, metrics, title_suffix = "") {
  pts <- data.frame(Predicted = predicted, Actual = actual)
  pts <- pts[!is.na(pts$Predicted) & !is.na(pts$Actual), , drop = FALSE]
  rownames(pts) <- NULL

  if (nrow(pts) == 0) {
    return(NULL)
  }

  # Shared axis limits so the 1:1 reference line is a true diagonal.
  lims <- range(c(pts$Predicted, pts$Actual))

  ggplot(pts, aes(x = Actual, y = Predicted)) +
    geom_point(alpha = 0.6, size = 2.5, color = "#2E86AB") +
    geom_abline(intercept = 0, slope = 1, linetype = "dashed", color = "red", linewidth = 1) +
    geom_smooth(method = "lm", se = TRUE, color = "#A23B72", fill = "#A23B72", alpha = 0.2) +
    coord_fixed(xlim = lims, ylim = lims) +
    labs(
      title = sprintf("Forecast at DOY %d%s", forecast_doy, title_suffix),
      subtitle = sprintf("RMSE: %.2f t/ha | MAE: %.2f t/ha | R²: %.3f | n: %d",
                         metrics$RMSE, metrics$MAE, metrics$R2, metrics$n),
      x = "Actual TCH (t/ha)",
      y = "Predicted TCH (t/ha)"
    ) +
    theme_minimal() +
    theme(
      plot.title = element_text(face = "bold", size = 10),
      plot.subtitle = element_text(size = 9, color = "gray40"),
      axis.title = element_text(size = 10),
      axis.text = element_text(size = 9),
      panel.grid.minor = element_blank(),
      panel.border = element_rect(color = "gray80", fill = NA, linewidth = 1)
    )
}
||
|
||
#' Compute RMSE, MAE, R-squared and sample size for a set of predictions.
#'
#' NA pairs are dropped before any metric is computed.
#'
#' @param predicted Numeric vector of predictions.
#' @param actual Numeric vector of observed values (same length).
#' @return list(RMSE, MAE, R2, n); all metrics are NA (and n is 0) when no
#'   complete pairs remain. RMSE/MAE rounded to 2 dp, R2 to 3 dp.
calculate_metrics <- function(predicted, actual) {
  complete <- !is.na(predicted) & !is.na(actual)
  p <- predicted[complete]
  a <- actual[complete]

  if (length(p) == 0) {
    return(list(RMSE = NA, MAE = NA, R2 = NA, n = 0))
  }

  err <- p - a
  list(
    RMSE = round(sqrt(mean(err^2)), 2),
    MAE  = round(mean(abs(err)), 2),
    R2   = round(cor(p, a)^2, 3),
    n    = length(p)
  )
}
||
|
||
# 3. Main Function
# --------------

#' Run the temporal yield-forecasting pipeline end to end.
#'
#' Pipeline: parse CLI arguments -> source project configuration -> load the
#' yield workbook and cumulative-CI series -> build "as of DOY" feature
#' tables for each forecast horizon -> per horizon, train a random forest
#' with all 8 predictors and a second one with CAST forward feature
#' selection -> write comparison figures, fitted models (.rds) and CSV
#' summaries under <reports_dir>/yield_prediction.
#'
#' Side effects only: writes files, prints logs, creates the global
#' `project_dir`. Depends on globals defined by parameters_project.R
#' (at least `cumulative_CI_vals_dir` and `reports_dir`).
main <- function() {
  # Process command line arguments
  args <- commandArgs(trailingOnly = TRUE)

  # First positional argument selects the project; default to "esa".
  if (length(args) >= 1 && !is.na(args[1])) {
    project_dir <- as.character(args[1])
  } else {
    project_dir <- "esa" # Default project
  }

  # Exported as a global because parameters_project.R (sourced below) is
  # expected to read it. NOTE(review): confirm against that file.
  assign("project_dir", project_dir, envir = .GlobalEnv)

  safe_log("=== TEMPORAL YIELD FORECASTING ===")
  safe_log(paste("Project:", project_dir))

  # Load project configuration (defines the *_dir path globals used below)
  tryCatch({
    source(here("r_app", "parameters_project.R"))
  }, error = function(e) {
    stop("Error loading parameters_project.R: ", e$message)
  })

  # 4. Load yield data
  # ----------------
  yield_excel_path <- file.path(
    "laravel_app", "storage", "app", project_dir, "Data",
    paste0(project_dir, "_yield_data.xlsx")
  )

  safe_log("Loading yield data...")
  sheet_names <- readxl::excel_sheets(here(yield_excel_path))

  # One sheet per season: parse the season year out of the sheet name.
  yield_data_list <- lapply(sheet_names, function(sheet_name) {
    # Grab 4-digit tokens preferentially, else 2-digit (alternation order
    # in the regex matters). NOTE(review): assumes names like "2018-19",
    # where the SECOND number is the harvest season -- confirm.
    year_matches <- regmatches(sheet_name, gregexpr("[0-9]{4}|[0-9]{2}", sheet_name))[[1]]

    if (length(year_matches) >= 2) {
      second_year <- year_matches[2]
      if (nchar(second_year) == 2) {
        # Expand 2-digit years as 20xx
        year_value <- as.numeric(paste0("20", second_year))
      } else {
        year_value <- as.numeric(second_year)
      }
    } else if (length(year_matches) == 1) {
      year_value <- as.numeric(year_matches[1])
    } else {
      year_value <- NA
    }

    data <- readxl::read_excel(here(yield_excel_path), sheet = sheet_name)
    data$season <- year_value
    data
  })

  # Stack all seasons; TCH (tonnes cane per hectare) becomes tonnage_ha.
  yield_data <- dplyr::bind_rows(yield_data_list) %>%
    dplyr::rename(sub_field = Field) %>%
    dplyr::select(sub_field, season, TCH, Ratoon, Cane_Variety, Irrig_Type) %>%
    dplyr::rename(tonnage_ha = TCH) %>%
    dplyr::filter(!is.na(tonnage_ha))

  safe_log(sprintf("Loaded %d yield records", nrow(yield_data)))

  # 5. Load CI time series data
  # -------------------------
  safe_log("Loading CI time series...")
  # Per-model fill repairs missing field/sub_field labels before selecting
  # the long-format (sub_field, season, DOY, cumulative_CI) series.
  CI_data <- readRDS(here(cumulative_CI_vals_dir, "All_pivots_Cumulative_CI_quadrant_year_v2.rds")) %>%
    dplyr::group_by(model) %>%
    tidyr::fill(field, sub_field, .direction = "downup") %>%
    dplyr::ungroup() %>%
    dplyr::select(sub_field, season, DOY, cumulative_CI) %>%
    dplyr::filter(!is.na(cumulative_CI), DOY > 0)

  safe_log(sprintf("Loaded CI data: %d observations", nrow(CI_data)))

  # 6. Define forecast horizons (in DOY)
  # ----------------------------------
  forecast_horizons <- c(300, 330, 360, 390)
  safe_log(paste("Forecast horizons (DOY):", paste(forecast_horizons, collapse = ", ")))

  # 7. Prepare training data for each horizon
  # ---------------------------------------
  safe_log("\nPreparing temporal features for each forecast horizon...")

  forecast_data_list <- list()

  for (horizon_doy in forecast_horizons) {
    safe_log(sprintf("\n=== Processing DOY %d ===", horizon_doy))

    # For each field-season, calculate features up to this DOY.
    # group_modify drops groups that return an empty data.frame (i.e. the
    # helper returned NULL for lack of data).
    horizon_features <- CI_data %>%
      dplyr::group_by(sub_field, season) %>%
      dplyr::group_modify(~ {
        features <- calculate_temporal_features(.x, horizon_doy)
        if (!is.null(features)) {
          return(features)
        } else {
          return(data.frame())
        }
      }) %>%
      dplyr::ungroup()

    # Join with yield data (inner join: keep only field-seasons with yield)
    horizon_data <- horizon_features %>%
      dplyr::inner_join(yield_data, by = c("sub_field", "season")) %>%
      dplyr::filter(!is.na(tonnage_ha))

    safe_log(sprintf(" Features calculated for %d field-seasons", nrow(horizon_data)))

    forecast_data_list[[as.character(horizon_doy)]] <- horizon_data
  }

  # 8. Train models for each forecast horizon
  # ---------------------------------------
  safe_log("\n=== TRAINING FORECAST MODELS ===")

  set.seed(206) # reproducible CV folds / RF fits
  ctrl <- caret::trainControl(
    method = "cv",
    number = 5,
    savePredictions = "final"
  )

  # Per-horizon result accumulators, keyed by as.character(horizon_doy).
  models_all_vars <- list()
  models_ffs <- list()
  predictions_all_vars <- list()
  predictions_ffs <- list()
  metrics_all_vars <- list()
  metrics_ffs <- list()
  importance_all_vars <- list()
  importance_ffs <- list()
  selected_features <- list()

  for (horizon_doy in forecast_horizons) {
    safe_log(sprintf("\n=== TRAINING MODELS FOR DOY %d ===", horizon_doy))

    train_data <- forecast_data_list[[as.character(horizon_doy)]]

    # Select predictors (remove NAs and select numeric features)
    predictor_cols <- c("current_CI", "current_DOY", "avg_CI_per_day",
                        "recent_CI_30d", "CI_growth_rate", "early_season_CI",
                        "CI_variability", "Ratoon")

    # Filter complete cases
    train_data_clean <- train_data %>%
      dplyr::select(all_of(c(predictor_cols, "tonnage_ha"))) %>%
      tidyr::drop_na()

    safe_log(sprintf(" Training records: %d", nrow(train_data_clean)))

    # Skip horizons with too few complete records; their list entries stay
    # absent (NULL), which the reporting code below checks for.
    if (nrow(train_data_clean) < 20) {
      safe_log(" Insufficient data, skipping", "WARNING")
      next
    }

    # ===== MODEL 1: ALL VARIABLES =====
    safe_log(" Training Model 1: All Variables...")
    model_all <- caret::train(
      tonnage_ha ~ .,
      data = train_data_clean,
      method = "rf",
      trControl = ctrl,
      importance = TRUE,
      tuneLength = 3
    )

    # Get predictions.
    # NOTE(review): these are predictions on the TRAINING data, so the
    # reported RMSE/R² are in-sample (optimistic) -- confirm this is the
    # intent vs. using the saved CV predictions.
    preds_all <- predict(model_all, newdata = train_data_clean)
    metrics_all <- calculate_metrics(preds_all, train_data_clean$tonnage_ha)

    # Extract variable importance, ranked descending
    var_imp_all <- caret::varImp(model_all)$importance
    var_imp_all_df <- data.frame(
      Variable = rownames(var_imp_all),
      Importance = var_imp_all$Overall,
      DOY = horizon_doy,
      Model = "All Variables"
    ) %>%
      dplyr::arrange(desc(Importance)) %>%
      dplyr::mutate(Rank = row_number())

    safe_log(sprintf(" RMSE: %.2f | MAE: %.2f | R²: %.3f",
                     metrics_all$RMSE, metrics_all$MAE, metrics_all$R2))
    safe_log(" Top 3 variables:")
    for (i in 1:min(3, nrow(var_imp_all_df))) {
      safe_log(sprintf(" %d. %s (Importance: %.1f)",
                       i, var_imp_all_df$Variable[i], var_imp_all_df$Importance[i]))
    }

    # ===== MODEL 2: FORWARD FEATURE SELECTION =====
    safe_log(" Training Model 2: Forward Feature Selection (ffs)...")

    # ffs is expensive and can fail; everything FFS-related happens inside
    # this tryCatch. The expression is evaluated in this frame, so the
    # list assignments below persist when it succeeds.
    ffs_success <- FALSE
    tryCatch({
      # Use faster feature selection with smaller tuneLength
      model_ffs <- CAST::ffs(
        predictors = train_data_clean[, predictor_cols],
        response = train_data_clean$tonnage_ha,
        method = "rf",
        trControl = trainControl(method = "cv", number = 3), # Faster: 3-fold instead of 5
        tuneLength = 1, # Faster: only 1 tuning parameter
        verbose = FALSE
      )

      # Get selected features
      selected_vars <- model_ffs$selectedvars
      safe_log(sprintf(" Selected %d features: %s",
                       length(selected_vars), paste(selected_vars, collapse = ", ")))

      # Get predictions (in-sample, as for Model 1)
      preds_ffs <- predict(model_ffs, newdata = train_data_clean)

      # Calculate metrics
      metrics_ffs_result <- calculate_metrics(preds_ffs, train_data_clean$tonnage_ha)

      # Extract variable importance (only for selected variables)
      var_imp_ffs <- caret::varImp(model_ffs)$importance
      var_imp_ffs_df <- data.frame(
        Variable = rownames(var_imp_ffs),
        Importance = var_imp_ffs$Overall,
        DOY = horizon_doy,
        Model = "FFS"
      ) %>%
        dplyr::arrange(desc(Importance)) %>%
        dplyr::mutate(Rank = row_number())

      safe_log(sprintf(" RMSE: %.2f | MAE: %.2f | R²: %.3f",
                       metrics_ffs_result$RMSE, metrics_ffs_result$MAE, metrics_ffs_result$R2))

      # Calculate improvement (positive = FFS better, i.e. lower RMSE)
      improvement <- ((metrics_all$RMSE - metrics_ffs_result$RMSE) / metrics_all$RMSE) * 100
      if (improvement > 0) {
        safe_log(sprintf(" ✓ FFS improved RMSE by %.1f%%", improvement))
      } else {
        safe_log(sprintf(" ✗ FFS increased RMSE by %.1f%%", abs(improvement)))
      }

      # Store results - explicitly assign to list position
      models_ffs[[as.character(horizon_doy)]] <- model_ffs
      predictions_ffs[[as.character(horizon_doy)]] <- preds_ffs
      metrics_ffs[[as.character(horizon_doy)]] <- metrics_ffs_result
      importance_ffs[[as.character(horizon_doy)]] <- var_imp_ffs_df
      selected_features[[as.character(horizon_doy)]] <- selected_vars

      ffs_success <- TRUE
      safe_log(" ✓ FFS model stored successfully")

    }, error = function(e) {
      safe_log(sprintf(" ERROR in ffs: %s", e$message), "ERROR")
      # Don't set to NULL - just skip assignment so they remain undefined
    })

    if (!ffs_success) {
      safe_log(" FFS failed - using All Variables model only for this horizon", "WARNING")
    }

    # Store Model 1 results
    models_all_vars[[as.character(horizon_doy)]] <- model_all
    predictions_all_vars[[as.character(horizon_doy)]] <- preds_all
    metrics_all_vars[[as.character(horizon_doy)]] <- metrics_all
    importance_all_vars[[as.character(horizon_doy)]] <- var_imp_all_df
  }

  # 9. Create visualizations
  # ----------------------
  safe_log("\n=== CREATING VISUALIZATIONS ===")

  output_dir <- file.path(reports_dir, "yield_prediction")
  dir.create(output_dir, recursive = TRUE, showWarnings = FALSE)

  # Create comparison plots for each horizon (All Variables vs FFS)
  plots_comparison <- list()
  plot_idx <- 1

  for (horizon_doy in forecast_horizons) {
    if (!is.null(models_all_vars[[as.character(horizon_doy)]])) {
      # Rebuild the same complete-case frame used for training so the
      # actual-yield vector aligns row-for-row with the stored predictions.
      train_data_clean <- forecast_data_list[[as.character(horizon_doy)]] %>%
        dplyr::select(current_CI, current_DOY, avg_CI_per_day, recent_CI_30d,
                      CI_growth_rate, early_season_CI, CI_variability,
                      Ratoon, tonnage_ha) %>%
        tidyr::drop_na()

      # Plot 1: All Variables
      plot_all <- create_forecast_plot(
        predictions_all_vars[[as.character(horizon_doy)]],
        train_data_clean$tonnage_ha,
        horizon_doy,
        metrics_all_vars[[as.character(horizon_doy)]],
        " - All Variables"
      )

      plots_comparison[[plot_idx]] <- plot_all
      plot_idx <- plot_idx + 1

      # Plot 2: FFS (if available)
      if (!is.null(models_ffs[[as.character(horizon_doy)]])) {
        selected_vars <- selected_features[[as.character(horizon_doy)]]
        plot_ffs <- create_forecast_plot(
          predictions_ffs[[as.character(horizon_doy)]],
          train_data_clean$tonnage_ha,
          horizon_doy,
          metrics_ffs[[as.character(horizon_doy)]],
          sprintf(" - FFS (%d vars)", length(selected_vars))
        )

        plots_comparison[[plot_idx]] <- plot_ffs
        plot_idx <- plot_idx + 1
      }
    }
  }

  # Create RMSE comparison plot.
  # NOTE(review): RMSE_All filters on metrics_all_vars while RMSE_FFS
  # filters on metrics_ffs -- if FFS failed for SOME horizons the two
  # columns have different lengths and data.frame() errors. Confirm.
  rmse_comparison_data <- data.frame(
    DOY = forecast_horizons[sapply(forecast_horizons, function(x)
      !is.null(metrics_all_vars[[as.character(x)]]))],
    RMSE_All = sapply(forecast_horizons[sapply(forecast_horizons, function(x)
      !is.null(metrics_all_vars[[as.character(x)]]))], function(x)
        metrics_all_vars[[as.character(x)]]$RMSE),
    RMSE_FFS = sapply(forecast_horizons[sapply(forecast_horizons, function(x)
      !is.null(metrics_ffs[[as.character(x)]]))], function(x)
        metrics_ffs[[as.character(x)]]$RMSE)
  ) %>%
    tidyr::pivot_longer(cols = starts_with("RMSE"),
                        names_to = "Model", values_to = "RMSE") %>%
    dplyr::mutate(Model = ifelse(Model == "RMSE_All", "All Variables", "FFS"))

  rmse_comparison_plot <- ggplot(rmse_comparison_data, aes(x = DOY, y = RMSE, color = Model, group = Model)) +
    geom_line(linewidth = 1.2) +
    geom_point(size = 3) +
    geom_text(aes(label = sprintf("%.1f", RMSE)), vjust = -0.8, size = 3) +
    scale_color_manual(values = c("All Variables" = "#E63946", "FFS" = "#06A77D")) +
    labs(
      title = "Model Comparison: All Variables vs Feature Selection",
      subtitle = "RMSE across forecast horizons",
      x = "Days from Planting (DOY)",
      y = "RMSE (t/ha)"
    ) +
    theme_minimal() +
    theme(
      plot.title = element_text(face = "bold", size = 10),
      plot.subtitle = element_text(size = 9, color = "gray40"),
      axis.title = element_text(size = 10),
      legend.position = "bottom",
      panel.grid.minor = element_blank()
    )

  # Create feature selection summary table (one row per horizon with FFS)
  ffs_summary <- data.frame(
    DOY = forecast_horizons[sapply(forecast_horizons, function(x)
      !is.null(selected_features[[as.character(x)]]))],
    Num_Features = sapply(forecast_horizons[sapply(forecast_horizons, function(x)
      !is.null(selected_features[[as.character(x)]]))], function(x)
        length(selected_features[[as.character(x)]])),
    Selected_Features = sapply(forecast_horizons[sapply(forecast_horizons, function(x)
      !is.null(selected_features[[as.character(x)]]))], function(x)
        paste(selected_features[[as.character(x)]], collapse = ", "))
  )

  # Create table grob
  ffs_table_grob <- gridExtra::tableGrob(
    ffs_summary,
    rows = NULL,
    theme = gridExtra::ttheme_minimal(
      base_size = 8,
      core = list(fg_params = list(hjust = 0, x = 0.05)),
      colhead = list(fg_params = list(fontface = "bold"))
    )
  )

  ffs_table_plot <- gridExtra::grid.arrange(
    ffs_table_grob,
    top = grid::textGrob("Features Selected by FFS at Each Horizon",
                         gp = grid::gpar(fontface = "bold", fontsize = 10))
  )


  # Create variable importance comparison plot
  # Bind all importance data frames from both models
  all_importance_list <- c(importance_all_vars, importance_ffs)
  all_importance_list <- all_importance_list[!sapply(all_importance_list, is.null)]
  all_importance <- dplyr::bind_rows(all_importance_list)

  # Get top 5 variables overall (by mean importance across horizons/models)
  top_vars <- all_importance %>%
    dplyr::group_by(Variable) %>%
    dplyr::summarise(AvgImportance = mean(Importance)) %>%
    dplyr::arrange(desc(AvgImportance)) %>%
    dplyr::slice(1:5) %>%
    dplyr::pull(Variable)

  importance_plot <- all_importance %>%
    dplyr::filter(Variable %in% top_vars) %>%
    ggplot(aes(x = factor(DOY), y = Importance, fill = Model)) +
    geom_col(position = "dodge", width = 0.8) +
    scale_fill_manual(values = c("All Variables" = "#457B9D", "FFS" = "#06A77D")) +
    facet_wrap(~ Variable, ncol = 5, scales = "free_y") +
    labs(
      title = "Variable Importance: All Variables vs FFS",
      subtitle = "Top 5 most important predictors",
      x = "Days from Planting (DOY)",
      y = "Importance",
      fill = "Model Type"
    ) +
    theme_minimal() +
    theme(
      plot.title = element_text(face = "bold", size = 10),
      plot.subtitle = element_text(size = 9, color = "gray40"),
      axis.title = element_text(size = 9),
      axis.text = element_text(size = 7),
      axis.text.x = element_text(angle = 45, hjust = 1),
      legend.position = "bottom",
      legend.text = element_text(size = 8),
      strip.text = element_text(size = 8, face = "bold"),
      panel.grid.minor = element_blank()
    )

  # Combine all plots in a larger grid (4 horizons × 2 models = 8 plots + 2 summary plots).
  # NOTE(review): combined_plot is only created when ALL 8 per-horizon plots
  # exist, but ggsave() below uses it unconditionally -- if any horizon was
  # skipped or FFS failed, this errors with "object 'combined_plot' not
  # found". Confirm whether a fallback layout is needed.
  if (length(plots_comparison) == 8) {
    combined_plot <- gridExtra::grid.arrange(
      grobs = c(plots_comparison, list(rmse_comparison_plot, ffs_table_plot)),
      ncol = 2,
      nrow = 5,
      heights = c(1.1, 1.1, 1.1, 1.1, 0.9),
      layout_matrix = rbind(
        c(1, 2), # DOY 300: All vs FFS
        c(3, 4), # DOY 330: All vs FFS
        c(5, 6), # DOY 360: All vs FFS
        c(7, 8), # DOY 390: All vs FFS
        c(9, 10) # RMSE comparison + FFS table
      )
    )
  }

  # Save main comparison plot
  plot_file <- file.path(output_dir, paste0(project_dir, "_temporal_yield_forecasting_comparison.png"))
  ggsave(plot_file, combined_plot, width = 12, height = 18, dpi = 300)
  safe_log(paste("Comparison plot saved to:", plot_file))

  # Save importance comparison plot separately
  importance_file <- file.path(output_dir, paste0(project_dir, "_variable_importance_comparison.png"))
  ggsave(importance_file, importance_plot, width = 14, height = 6, dpi = 300)
  safe_log(paste("Importance plot saved to:", importance_file))

  # 10. Save results
  # --------------
  results_file <- file.path(output_dir, paste0(project_dir, "_temporal_forecast_models.rds"))
  saveRDS(list(
    models_all_vars = models_all_vars,
    models_ffs = models_ffs,
    metrics_all_vars = metrics_all_vars,
    metrics_ffs = metrics_ffs,
    importance_all_vars = importance_all_vars,
    importance_ffs = importance_ffs,
    selected_features = selected_features,
    forecast_horizons = forecast_horizons
  ), results_file)
  safe_log(paste("Models saved to:", results_file))

  # Save variable importance to CSV
  importance_csv <- file.path(output_dir, paste0(project_dir, "_variable_importance.csv"))
  write.csv(all_importance, importance_csv, row.names = FALSE)
  safe_log(paste("Variable importance saved to:", importance_csv))

  # Save selected features summary
  ffs_csv <- file.path(output_dir, paste0(project_dir, "_ffs_selected_features.csv"))
  write.csv(ffs_summary, ffs_csv, row.names = FALSE)
  safe_log(paste("FFS summary saved to:", ffs_csv))

  # Save model comparison (one row per horizon; NA where a model is missing)
  comparison_csv <- file.path(output_dir, paste0(project_dir, "_model_comparison.csv"))
  comparison_data <- data.frame(
    DOY = forecast_horizons,
    RMSE_All_Vars = sapply(forecast_horizons, function(x)
      ifelse(!is.null(metrics_all_vars[[as.character(x)]]),
             metrics_all_vars[[as.character(x)]]$RMSE, NA)),
    RMSE_FFS = sapply(forecast_horizons, function(x)
      ifelse(!is.null(metrics_ffs[[as.character(x)]]),
             metrics_ffs[[as.character(x)]]$RMSE, NA)),
    MAE_All_Vars = sapply(forecast_horizons, function(x)
      ifelse(!is.null(metrics_all_vars[[as.character(x)]]),
             metrics_all_vars[[as.character(x)]]$MAE, NA)),
    MAE_FFS = sapply(forecast_horizons, function(x)
      ifelse(!is.null(metrics_ffs[[as.character(x)]]),
             metrics_ffs[[as.character(x)]]$MAE, NA)),
    R2_All_Vars = sapply(forecast_horizons, function(x)
      ifelse(!is.null(metrics_all_vars[[as.character(x)]]),
             metrics_all_vars[[as.character(x)]]$R2, NA)),
    R2_FFS = sapply(forecast_horizons, function(x)
      ifelse(!is.null(metrics_ffs[[as.character(x)]]),
             metrics_ffs[[as.character(x)]]$R2, NA)),
    # Mirrors length(predictor_cols) above -- keep in sync.
    Num_Features_All = 8,
    Num_Features_FFS = sapply(forecast_horizons, function(x)
      ifelse(!is.null(selected_features[[as.character(x)]]),
             length(selected_features[[as.character(x)]]), NA))
  )
  write.csv(comparison_data, comparison_csv, row.names = FALSE)
  safe_log(paste("Model comparison saved to:", comparison_csv))

  # Print summary
  cat("\n=== MODEL COMPARISON SUMMARY ===\n")
  print(comparison_data)

  cat("\n=== SELECTED FEATURES BY FFS ===\n")
  print(ffs_summary)

  cat("\n=== AVERAGE VARIABLE IMPORTANCE ===\n")
  avg_importance <- all_importance %>%
    dplyr::group_by(Variable, Model) %>%
    dplyr::summarise(AvgImportance = mean(Importance), .groups = "drop") %>%
    dplyr::arrange(Model, desc(AvgImportance))
  print(avg_importance)

  cat("\n=== PERFORMANCE IMPROVEMENT ===\n")
  for (doy in forecast_horizons) {
    if (!is.null(metrics_all_vars[[as.character(doy)]]) &&
        !is.null(metrics_ffs[[as.character(doy)]])) {
      # Positive = FFS lowered RMSE relative to the all-variables model
      improvement <- ((metrics_all_vars[[as.character(doy)]]$RMSE -
                         metrics_ffs[[as.character(doy)]]$RMSE) /
                        metrics_all_vars[[as.character(doy)]]$RMSE) * 100

      if (improvement > 0) {
        cat(sprintf("DOY %d: FFS improved RMSE by %.1f%% (%.2f → %.2f t/ha)\n",
                    doy, improvement,
                    metrics_all_vars[[as.character(doy)]]$RMSE,
                    metrics_ffs[[as.character(doy)]]$RMSE))
      } else {
        cat(sprintf("DOY %d: All Variables better by %.1f%% (%.2f vs %.2f t/ha)\n",
                    doy, abs(improvement),
                    metrics_all_vars[[as.character(doy)]]$RMSE,
                    metrics_ffs[[as.character(doy)]]$RMSE))
      }
    }
  }

  safe_log("\n=== TEMPORAL YIELD FORECASTING COMPLETED ===")
}
|
||
|
||
# 4. Script execution
# -----------------
# Run the pipeline only when this file is executed as a script
# (sys.nframe() == 0); sourcing it from another frame only defines
# the functions above.
if (sys.nframe() == 0) {
  main()
}
|