From 34159b3003ff01327f443855300713b7d346f664 Mon Sep 17 00:00:00 2001 From: Timon Date: Tue, 17 Feb 2026 15:02:30 +0100 Subject: [PATCH] Enhance yield prediction capabilities by integrating Random Forest model with Forward Feature Selection; add robust data loading and validation for harvesting data across multiple utility scripts. --- r_app/80_calculate_kpis.R | 19 +- r_app/80_utils_agronomic_support.R | 166 +++++++------- r_app/80_utils_cane_supply.R | 53 +++++ r_app/80_utils_common.R | 334 +++++++++++++++++++++++++++++ 4 files changed, 476 insertions(+), 96 deletions(-) diff --git a/r_app/80_calculate_kpis.R b/r_app/80_calculate_kpis.R index 208a373..f73a002 100644 --- a/r_app/80_calculate_kpis.R +++ b/r_app/80_calculate_kpis.R @@ -141,6 +141,10 @@ suppressPackageStartupMessages({ library(writexl) # For writing Excel outputs (KPI summary tables) library(progress) # For progress bars during field processing + # ML models (for yield prediction KPI) + library(caret) # For training Random Forest with cross-validation + library(CAST) # For Forward Feature Selection in caret models + # ML/Analysis (optional - only for harvest model inference) tryCatch({ library(torch) # For PyTorch model inference (harvest readiness prediction) @@ -359,11 +363,8 @@ main <- function() { stop("ERROR loading field boundaries: ", e$message) }) - # Load harvesting data - if (!exists("harvesting_data")) { - warning("harvesting_data not loaded. TCH KPI will use placeholder values.") - harvesting_data <- data.frame(field = character(), year = numeric(), tonnage_ha = numeric()) - } + # Load harvesting data for yield prediction (using common helper function) + harvesting_data <- load_harvest_data(setup$data_dir) # Extract current week/year from end_date current_week <- as.numeric(format(end_date, "%V")) @@ -379,6 +380,7 @@ main <- function() { ci_rds_path = NULL, harvesting_data = harvesting_data, output_dir = reports_dir_kpi, + data_dir = setup$data_dir, project_dir = project_dir ) @@ -419,11 +421,8 @@ main <- function() { stop("ERROR loading field boundaries: ", e$message) }) - # Load harvesting data - if (!exists("harvesting_data")) { - warning("harvesting_data not loaded. TCH KPI will use placeholder values.") - harvesting_data <- data.frame(field = character(), year = numeric(), tonnage_ha = numeric()) - } + # Load harvesting data for yield prediction (using common helper function) + harvesting_data <- load_harvest_data(setup$data_dir) # Extract current week/year from end_date current_week <- as.numeric(format(end_date, "%V")) diff --git a/r_app/80_utils_agronomic_support.R b/r_app/80_utils_agronomic_support.R index 8e55893..a71ea28 100644 --- a/r_app/80_utils_agronomic_support.R +++ b/r_app/80_utils_agronomic_support.R @@ -43,29 +43,6 @@ library(CAST) # These are now sourced from common utils and shared by all client types. # ============================================================================ -#' Prepare harvest predictions and ensure proper alignment with field data -prepare_predictions <- function(harvest_model, field_data, scenario = "optimistic") { - if (is.null(harvest_model) || is.null(field_data)) { - return(NULL) - } - - tryCatch({ - scenario_factor <- switch(scenario, - "pessimistic" = 0.85, - "realistic" = 1.0, - "optimistic" = 1.15, - 1.0) - - predictions <- field_data %>% - mutate(tch_forecasted = field_data$mean_ci * scenario_factor) - - return(predictions) - }, error = function(e) { - message(paste("Error preparing predictions:", e$message)) - return(NULL) - }) -} - # ============================================================================ # AURA KPI CALCULATION FUNCTIONS (6 KPIS) # ============================================================================ @@ -266,74 +243,91 @@ calculate_area_change_kpi <- function(current_stats, previous_stats) { #' KPI 3: Calculate TCH forecasted (tonnes of cane per hectare) #' -#' Projects final harvest tonnage based on CI growth trajectory +#' Projects final harvest tonnage based on historical yield data and CI growth trajectory. +#' Uses a Random Forest model trained on harvest data to predict yields for mature fields. +#' Delegates to calculate_yield_prediction_kpi() in 80_utils_common.R. #' -#' @param field_statistics Current field statistics -#' @param harvesting_data Historical harvest data (with yield observations) -#' @param field_boundaries_sf Field geometries +#' @param field_statistics Current field statistics (dataframe with Mean_CI or mean_ci column) +#' @param harvesting_data Historical harvest data frame (with tonnage_ha column) +#' @param field_boundaries_sf SF object with field geometries +#' @param cumulative_CI_vals_dir Directory with combined CI RDS files (optional) +#' @param data_dir Project data directory (from setup_project_directories or parameters_project.R) +#' Used to build cumulative_CI_vals_dir path if not provided directly (optional) +#' @param project_dir Deprecated: only used if data_dir not provided (optional) #' -#' @return Data frame with field-level TCH forecasts -calculate_tch_forecasted_kpi <- function(field_statistics, harvesting_data = NULL, field_boundaries_sf = NULL) { +#' @return Data frame with field-level yield forecasts ready for orchestrator +#' Columns: field_idx, tch_forecasted (yields in t/ha) +calculate_tch_forecasted_kpi <- function(field_statistics, harvesting_data = NULL, + field_boundaries_sf = NULL, + cumulative_CI_vals_dir = NULL, + data_dir = NULL, + project_dir = NULL) { - # Handle both naming conventions (Field_id/Mean_CI vs field_idx/mean_ci) - if ("Field_id" %in% names(field_statistics)) { - # Add field_idx to match field_boundaries row numbers - field_statistics <- field_statistics %>% - mutate(field_idx = match(Field_id, field_boundaries_sf$field)) - mean_ci_col <- "Mean_CI" - } else { - mean_ci_col <- "mean_ci" - } + # Use common utils yield prediction function (handles all ML logic) + # This replaces the previous linear model (TCH = 50 + CI*10) with proper ML prediction - # Filter out any fields without a match - field_statistics <- field_statistics %>% - filter(!is.na(field_idx)) - - if (nrow(field_statistics) == 0) { - warning("No fields matched between statistics and boundaries") + # Validate required parameters + if (is.null(field_boundaries_sf)) { + safe_log("field_boundaries_sf is NULL in calculate_tch_forecasted_kpi", "WARNING") return(data.frame( field_idx = integer(), - mean_ci = numeric(), tch_forecasted = numeric(), - tch_lower_bound = numeric(), - tch_upper_bound = numeric(), - confidence = character(), stringsAsFactors = FALSE )) } - - result <- data.frame( - field_idx = field_statistics$field_idx, - mean_ci = field_statistics[[mean_ci_col]], - tch_forecasted = NA_real_, - tch_lower_bound = NA_real_, - tch_upper_bound = NA_real_, - confidence = NA_character_, - stringsAsFactors = FALSE - ) - # Base TCH model: TCH = 50 + (CI * 10) - # This is a simplified model; production use should include more variables - - for (i in seq_len(nrow(result))) { - if (is.na(result$mean_ci[i])) { - result$confidence[i] <- "No data" - next + # Determine cumulative CI directory + if (is.null(cumulative_CI_vals_dir)) { + # Priority 1: Use provided data_dir parameter + if (!is.null(data_dir)) { + cumulative_CI_vals_dir <- file.path(data_dir, "extracted_ci", "cumulative_vals") + } else if (exists("data_dir", envir = .GlobalEnv)) { + # Priority 2: Fallback to global data_dir from parameters_project.R + cumulative_CI_vals_dir <- file.path(get("data_dir", envir = .GlobalEnv), "extracted_ci", "cumulative_vals") + } else { + # Priority 3: Last resort - log warning and fail gracefully + safe_log("Missing project data directory configuration: provide data_dir parameter or ensure parameters_project.R has set data_dir globally", "WARNING") + safe_log("No training data available for yield prediction", "WARNING") + return(data.frame( + field_idx = integer(), + tch_forecasted = numeric(), + stringsAsFactors = FALSE + )) } - - ci_val <- result$mean_ci[i] - - # Simple linear model - tch_est <- 50 + (ci_val * 10) - - # Confidence interval based on CI range - tch_lower <- tch_est * 0.85 - tch_upper <- tch_est * 1.15 - - result$tch_forecasted[i] <- round(tch_est, 2) - result$tch_lower_bound[i] <- round(tch_lower, 2) - result$tch_upper_bound[i] <- round(tch_upper, 2) - result$confidence[i] <- "Medium" + } + + # Call the shared yield prediction function from common utils + yield_result <- calculate_yield_prediction_kpi(field_boundaries_sf, harvesting_data, cumulative_CI_vals_dir) + + # Extract field-level results from the list + field_results <- yield_result$field_results + + # Convert to format expected by orchestrator + # If no predictions, return empty data frame + if (is.null(field_results) || nrow(field_results) == 0) { + return(data.frame( + field_idx = integer(), + tch_forecasted = numeric(), + stringsAsFactors = FALSE + )) + } + + # Map field names to field_idx using field_boundaries_sf + result <- field_results %>% + mutate( + field_idx = match(field, field_boundaries_sf$field), + tch_forecasted = yield_forecast_t_ha + ) %>% + filter(!is.na(field_idx)) %>% + select(field_idx, tch_forecasted) + + # Ensure result has proper structure even if empty + if (nrow(result) == 0) { + return(data.frame( + field_idx = integer(), + tch_forecasted = numeric(), + stringsAsFactors = FALSE + )) } return(result) @@ -471,7 +465,7 @@ create_summary_tables <- function(all_kpis) { select(field_idx, mean_ci_pct_change, interpretation), tch_forecast = all_kpis$tch_forecasted %>% - select(field_idx, mean_ci, tch_forecasted, tch_lower_bound, tch_upper_bound, confidence), + select(field_idx, tch_forecasted), growth_decline = all_kpis$growth_decline %>% select(field_idx, four_week_trend, trend_interpretation, decline_severity), @@ -479,9 +473,9 @@ create_summary_tables <- function(all_kpis) { weed_pressure = all_kpis$weed_presence %>% select(field_idx, fragmentation_index, weed_pressure_risk), - gap_filling = if (!is.null(all_kpis$gap_filling)) { + gap_filling = if (!is.null(all_kpis$gap_filling) && nrow(all_kpis$gap_filling) > 0) { all_kpis$gap_filling %>% - select(field_idx, gap_score, gap_level, mean_ci) + select(field_idx, gap_score, gap_level) } else { NULL } @@ -527,9 +521,7 @@ create_field_detail_table <- function(field_boundaries_sf, all_kpis, current_wee ) %>% left_join( all_kpis$tch_forecasted %>% - select(field_idx, Mean_CI = mean_ci, TCH_Forecasted = tch_forecasted, - TCH_Lower = tch_lower_bound, TCH_Upper = tch_upper_bound, - TCH_Confidence = confidence), + select(field_idx, TCH_Forecasted = tch_forecasted), by = "field_idx" ) %>% left_join( @@ -647,6 +639,7 @@ calculate_all_field_analysis_agronomic_support <- function( ci_rds_path = NULL, harvesting_data = NULL, output_dir = NULL, + data_dir = NULL, project_dir = NULL ) { @@ -794,7 +787,8 @@ calculate_all_field_analysis_agronomic_support <- function( } message("Calculating KPI 3: TCH Forecasted...") - tch_kpi <- calculate_tch_forecasted_kpi(current_stats, harvesting_data, field_boundaries_sf) + tch_kpi <- calculate_tch_forecasted_kpi(current_stats, harvesting_data, field_boundaries_sf, + data_dir = data_dir, project_dir = project_dir) message("Calculating KPI 4: Growth Decline...") growth_decline_kpi <- calculate_growth_decline_kpi( diff --git a/r_app/80_utils_cane_supply.R b/r_app/80_utils_cane_supply.R index ad2cdf5..fa14f64 100644 --- a/r_app/80_utils_cane_supply.R +++ b/r_app/80_utils_cane_supply.R @@ -166,6 +166,59 @@ calculate_status_alert <- function(imminent_prob, age_week, weekly_ci_change, me NA_character_ } +#' Calculate yield prediction for CANE_SUPPLY workflows (Wrapper) +#' +#' This function wraps the shared yield prediction model from 80_utils_common.R +#' to provide CANE_SUPPLY clients (e.g., ANGATA) with ML-based yield forecasting. +#' +#' Uses Random Forest with Forward Feature Selection trained on: +#' - Cumulative Canopy Index (CI) from growth model +#' - Days of Year (DOY) / crop age +#' - CI-per-day (growth velocity) +#' +#' Predicts yields for mature fields (DOY >= 240, ~8 months) into quartiles: +#' - Top 25%: High-yield fields +#' - Average: Mid-range yield fields +#' - Lowest 25%: Lower-yield fields +#' +#' @param field_boundaries_sf SF object with field geometries +#' @param harvesting_data Data frame with harvest history (must have tonnage_ha column) +#' @param cumulative_CI_vals_dir Directory with combined CI RDS files +#' +#' @return List with: +#' - summary: Data frame with field_groups, count, and yield quartile predictions +#' - field_results: Data frame with field-level forecasts (yield_forecast_t_ha in t/ha) +#' +#' @details +#' **Data Requirements:** +#' - harvesting_data must include tonnage_ha column (yield in t/ha) for training +#' - cumulative_CI_vals_dir must contain "All_pivots_Cumulative_CI_quadrant_year_v2.rds" +#' - If either is missing, returns graceful fallback with NA values (not fake numbers) +#' +#' **Integration:** +#' This can be called from calculate_all_field_kpis() in cane_supply workflow to add +#' a new "Yield_Forecast" column to the 22-column KPI output. +#' +#' **Example:** +#' ```r +#' yield_result <- calculate_yield_prediction_kpi_cane_supply( +#' field_boundaries_sf, +#' harvesting_data, +#' file.path(data_dir, "combined_CI") +#' ) +#' # yield_result$summary has quartiles +#' # yield_result$field_results has per-field forecasts +#' ``` +calculate_yield_prediction_kpi_cane_supply <- function(field_boundaries_sf, + harvesting_data, + cumulative_CI_vals_dir) { + + # Call the shared yield prediction function from 80_utils_common.R + result <- calculate_yield_prediction_kpi(field_boundaries_sf, harvesting_data, cumulative_CI_vals_dir) + + return(result) +} + #' Build complete per-field KPI dataframe with all 22 columns #' @param current_stats data.frame with current week statistics from load_or_calculate_weekly_stats diff --git a/r_app/80_utils_common.R b/r_app/80_utils_common.R index 78962e9..29739cf 100644 --- a/r_app/80_utils_common.R +++ b/r_app/80_utils_common.R @@ -8,8 +8,13 @@ # - Field statistics extraction # - Week/year calculations for consistent date handling # - Excel/CSV/RDS export utilities +# - Yield prediction using ML models (Random Forest with Feature Selection) # # Used by: 80_calculate_kpis.R, all client-specific utils files +# +# NOTE: Libraries required by yield prediction (caret, CAST, here) are loaded +# in the main script 80_calculate_kpis.R, not here. This keeps dependencies +# centralized in the orchestrator script. # ============================================================================ # ============================================================================ @@ -590,6 +595,81 @@ extract_planting_dates <- function(harvesting_data, field_boundaries_sf = NULL) }) } +# ============================================================================ +# DATA LOADING HELPERS +# ============================================================================ + +#' Load and validate harvest data from harvest.xlsx +#' +#' Encapsulates harvest data loading with validation, type coercion, and error handling. +#' Returns a data frame with required columns (field, year, tonnage_ha) or an empty +#' data frame with proper structure if loading fails. +#' +#' @param data_dir Path to data directory containing harvest.xlsx +#' +#' @return data.frame with columns: field (character), year (numeric), tonnage_ha (numeric) +#' - On success: data frame with N rows of harvest records +#' - On failure: empty data frame with correct structure (0 rows, 3 columns) +#' +#' @details +#' **File Location**: Expected at `file.path(data_dir, "harvest.xlsx")` +#' +#' **Validation**: +#' - Checks file existence before reading +#' - Validates required columns: field, year, tonnage_ha +#' - Coerces year and tonnage_ha to numeric +#' - Logs status messages for debugging +#' +#' **Error Handling**: +#' - Missing file: Returns empty DF (logs NOTE) +#' - Missing columns: Returns empty DF (logs WARNING) +#' - Read errors: Returns empty DF (logs WARNING) +#' - Always returns valid data frame structure (won't return NULL) +#' +#' **Usage**: +#' ```r +#' harvesting_data <- load_harvest_data(setup$data_dir) +#' # harvesting_data is guaranteed to be a data.frame with 3 columns +#' # even if harvest.xlsx is unavailable or invalid +#' ``` +load_harvest_data <- function(data_dir) { + harvesting_data <- NULL + harvest_file <- file.path(data_dir, "harvest.xlsx") + + if (file.exists(harvest_file)) { + tryCatch({ + harvesting_data <- readxl::read_excel(harvest_file) + + # Ensure required columns are present + required_cols <- c("field", "year", "tonnage_ha") + if (all(required_cols %in% names(harvesting_data))) { + # Convert to data frame and ensure column types + harvesting_data <- as.data.frame(harvesting_data) + harvesting_data$year <- as.numeric(harvesting_data$year) + harvesting_data$tonnage_ha <- as.numeric(harvesting_data$tonnage_ha) + + message(paste(" ✓ Loaded harvest data:", nrow(harvesting_data), "records from harvest.xlsx")) + return(harvesting_data) + } else { + message(paste(" WARNING: harvest.xlsx missing required columns. Expected: field, year, tonnage_ha")) + harvesting_data <- NULL + } + }, error = function(e) { + message(paste(" WARNING: Could not read harvest.xlsx:", e$message)) + }) + } else { + message(paste(" NOTE: harvest.xlsx not found at", harvest_file)) + } + + # Fallback: create empty data frame if loading failed + if (is.null(harvesting_data)) { + message(" WARNING: No harvest data available. TCH yield prediction will use graceful fallback (NA values)") + harvesting_data <- data.frame(field = character(), year = numeric(), tonnage_ha = numeric()) + } + + return(harvesting_data) +} + # ============================================================================ # FIELD STATISTICS EXTRACTION # ============================================================================ @@ -1386,3 +1466,257 @@ prepare_predictions <- function(predictions, newdata) { dplyr::left_join(., newdata, by = c("field", "sub_field", "season")) ) } + +# ============================================================================ +# YIELD PREDICTION KPI (SHARED ML-BASED MODEL FOR ALL CLIENT TYPES) +# ============================================================================ + +#' Helper function for graceful fallback when training data unavailable +#' +#' @param field_boundaries Field boundaries (sf or SpatVector) +#' @return List with summary and field_results (both with NA values) +create_fallback_result <- function(field_boundaries) { + # Convert to SpatVector if needed (for terra::project) + if (!inherits(field_boundaries, "SpatVector")) { + field_boundaries <- terra::vect(field_boundaries) + } + field_boundaries_projected <- terra::project(field_boundaries, "EPSG:6933") # Equal Earth projection + field_areas <- terra::expanse(field_boundaries_projected) / 10000 # Convert m² to hectares + total_area <- sum(field_areas) + + summary_result <- data.frame( + field_groups = c("Top 25%", "Average", "Lowest 25%", "Total area forecasted"), + count = c(0, 0, 0, nrow(field_boundaries)), + value = c(NA_real_, NA_real_, NA_real_, round(total_area, 1)), + stringsAsFactors = FALSE + ) + + field_results <- data.frame( + field = character(0), + sub_field = character(0), + Age_days = numeric(0), + yield_forecast_t_ha = numeric(0), + season = numeric(0), + stringsAsFactors = FALSE + ) + + return(list(summary = summary_result, field_results = field_results)) +} + +#' Calculate yield prediction KPI using Random Forest with Feature Selection +#' +#' Trains a Random Forest model on historical harvest data with cumulative CI, +#' days of year (DOY), and CI-per-day as predictors. Uses CAST::ffs() for +#' Forward Feature Selection. Predicts yields for mature fields (DOY >= 240). +#' +#' @param field_boundaries Field boundaries (sf or SpatVector) +#' @param harvesting_data Data frame with harvest data including tonnage_ha column +#' @param cumulative_CI_vals_dir Directory containing "All_pivots_Cumulative_CI_quadrant_year_v2.rds" +#' +#' @return List with: +#' - summary: Data frame with field_groups, count, value (quartiles and total area) +#' - field_results: Data frame with field-level yield forecasts (yield_forecast_t_ha) +#' +#' @details +#' **Training Data Requirements:** +#' - cumulative_CI_vals_dir must contain "All_pivots_Cumulative_CI_quadrant_year_v2.rds" +#' - harvesting_data must have tonnage_ha column with numeric yield values +#' - Training stops gracefully if either is missing (returns NA values, not fake numbers) +#' +#' **Model Specifications:** +#' - Algorithm: Random Forest (caret + CAST) +#' - Feature Selection: Forward Feature Selection (CAST::ffs) +#' - Cross-validation: 5-fold CV +#' - Predictors: cumulative_CI, DOY, CI_per_day +#' - Mature field threshold: DOY >= 240 (8 months) +#' - Output: Field-level yield forecasts grouped by quartile +#' +#' **Error Handling:** +#' - Missing tonnage_ha: Returns graceful fallback with NA (not zero) values +#' - No training data: Logs WARNING, returns empty field_results +#' - RDS file missing: Returns graceful fallback +#' - Prediction errors: Wrapped in tryCatch, returns fallback on failure +calculate_yield_prediction_kpi <- function(field_boundaries, harvesting_data, cumulative_CI_vals_dir) { + safe_log("Calculating yield prediction KPI using Random Forest with Feature Selection") + + tryCatch({ + # Check if tonnage_ha column is present and has valid data + if (is.null(harvesting_data) || + !("tonnage_ha" %in% names(harvesting_data)) || + all(is.na(harvesting_data$tonnage_ha))) { + safe_log("No harvest data available: lacking tonnage_ha column or all values are NA", "WARNING") + return(create_fallback_result(field_boundaries)) + } + + # Check if CI quadrant RDS file exists + ci_quadrant_path <- file.path(cumulative_CI_vals_dir, "All_pivots_Cumulative_CI_quadrant_year_v2.rds") + if (!file.exists(ci_quadrant_path)) { + safe_log(paste("CI quadrant file not found at:", ci_quadrant_path), "WARNING") + return(create_fallback_result(field_boundaries)) + } + + # Load CI quadrant data and fill missing field/sub_field values + CI_quadrant <- readRDS(ci_quadrant_path) %>% + dplyr::group_by(model) %>% + tidyr::fill(field, sub_field, .direction = "downup") %>% + dplyr::ungroup() + + # Rename year column to season for consistency + harvesting_data_renamed <- harvesting_data %>% dplyr::rename(season = year) + + # Join CI and yield data + CI_and_yield <- dplyr::left_join(CI_quadrant, harvesting_data_renamed, + by = c("field", "sub_field", "season")) %>% + dplyr::group_by(sub_field, season) %>% + dplyr::slice(which.max(DOY)) %>% + dplyr::select(field, sub_field, tonnage_ha, cumulative_CI, DOY, season, sub_area) %>% + dplyr::mutate(CI_per_day = cumulative_CI / DOY) + + # Define predictors and response variables + predictors <- c("cumulative_CI", "DOY", "CI_per_day") + response <- "tonnage_ha" + + # Prepare training dataset (fields with harvest data) + CI_and_yield_test <- CI_and_yield %>% + as.data.frame() %>% + dplyr::filter(!is.na(tonnage_ha)) + + # Check if we have minimum training data + if (nrow(CI_and_yield_test) == 0) { + safe_log("No training data available: no fields with tonnage_ha observations", "WARNING") + return(create_fallback_result(field_boundaries)) + } + + # Pre-clean training data: remove rows with any NAs in predictors or response + # This is required because CAST::ffs doesn't support na.rm parameter + CI_and_yield_test <- CI_and_yield_test %>% + dplyr::filter(!dplyr::if_any(dplyr::all_of(c(predictors, response)), is.na)) + + if (nrow(CI_and_yield_test) == 0) { + safe_log("No complete training data after removing NAs in predictors/response", "WARNING") + return(create_fallback_result(field_boundaries)) + } + + # Prepare prediction dataset (fields without harvest data, mature fields only) + prediction_yields <- CI_and_yield %>% + as.data.frame() %>% + dplyr::filter(is.na(tonnage_ha) & DOY >= 240) # Mature fields only + + # Configure model training parameters + ctrl <- caret::trainControl( + method = "cv", + savePredictions = TRUE, + allowParallel = TRUE, + number = 5, + verboseIter = FALSE + ) + + # Train the model with forward feature selection + set.seed(202) # For reproducibility + safe_log("Training Random Forest model with Forward Feature Selection...") + model_ffs_rf <- CAST::ffs( + CI_and_yield_test[, predictors], + CI_and_yield_test[, response], + method = "rf", + trControl = ctrl, + importance = TRUE, + withinSE = TRUE, + tuneLength = 5 + ) + + # Predict yields on validation data (same as training data for RMSE calculation) + pred_ffs_rf <- prepare_predictions( + stats::predict(model_ffs_rf, newdata = CI_and_yield_test), + CI_and_yield_test + ) + + # Extract cross-validated RMSE from the model object (more honest than in-sample RMSE) + # The CAST::ffs model stores CV results in $results data frame + # We extract the RMSE from the best model (lowest RMSE across folds) + if (!is.null(model_ffs_rf$results) && "RMSE" %in% names(model_ffs_rf$results)) { + # Get minimum RMSE from cross-validation results (best model from feature selection) + rmse_value <- min(model_ffs_rf$results$RMSE, na.rm = TRUE) + safe_log(paste("Yield prediction RMSE (cross-validated):", round(rmse_value, 2), "t/ha")) + } else { + # Fallback: compute in-sample RMSE if CV results unavailable, but label it clearly + rmse_value <- sqrt(mean((pred_ffs_rf$predicted_Tcha - CI_and_yield_test$tonnage_ha)^2, na.rm = TRUE)) + safe_log(paste("Yield prediction RMSE (in-sample/training):", round(rmse_value, 2), "t/ha")) + } + + # Predict yields for current season (mature fields >= 240 days) + if (nrow(prediction_yields) > 0) { + pred_rf_current_season <- prepare_predictions( + stats::predict(model_ffs_rf, newdata = prediction_yields), + prediction_yields + ) %>% + dplyr::filter(Age_days >= 240) %>% + dplyr::select(c("field", "Age_days", "predicted_Tcha", "season")) + } else { + pred_rf_current_season <- data.frame() + } + + # Calculate summary statistics for KPI + if (nrow(pred_rf_current_season) > 0) { + safe_log(paste("Number of fields with yield predictions:", nrow(pred_rf_current_season))) + safe_log(paste("Predicted yield range:", + round(min(pred_rf_current_season$predicted_Tcha, na.rm = TRUE), 1), + "-", + round(max(pred_rf_current_season$predicted_Tcha, na.rm = TRUE), 1), + "t/ha")) + + # Calculate quartiles for grouping + yield_quartiles <- quantile(pred_rf_current_season$predicted_Tcha, + probs = c(0.25, 0.5, 0.75), na.rm = TRUE) + + # Count fields in each quartile + top_25_count <- sum(pred_rf_current_season$predicted_Tcha >= yield_quartiles[3], na.rm = TRUE) + average_count <- sum(pred_rf_current_season$predicted_Tcha >= yield_quartiles[1] & + pred_rf_current_season$predicted_Tcha < yield_quartiles[3], na.rm = TRUE) + lowest_25_count <- sum(pred_rf_current_season$predicted_Tcha < yield_quartiles[1], na.rm = TRUE) + + # Calculate total area + if (!inherits(field_boundaries, "SpatVector")) { + field_boundaries_vect <- terra::vect(field_boundaries) + } else { + field_boundaries_vect <- field_boundaries + } + + # Handle both sf and SpatVector inputs for area calculation + if (inherits(field_boundaries, "sf")) { + field_boundaries_projected <- sf::st_transform(field_boundaries, "EPSG:6933") + field_areas <- sf::st_area(field_boundaries_projected) / 10000 # m² to hectares + } else { + field_boundaries_projected <- terra::project(field_boundaries_vect, "EPSG:6933") + field_areas <- terra::expanse(field_boundaries_projected) / 10000 + } + total_area <- sum(as.numeric(field_areas)) + + safe_log(paste("Total area:", round(total_area, 1), "hectares")) + + # Build summary result + result <- data.frame( + field_groups = c("Top 25%", "Average", "Lowest 25%", "Total area forecasted"), + count = c(top_25_count, average_count, lowest_25_count, nrow(field_boundaries)), + value = c(round(yield_quartiles[3], 1), round(yield_quartiles[2], 1), + round(yield_quartiles[1], 1), round(total_area, 1)), + stringsAsFactors = FALSE + ) + + # Prepare field-level results + field_level_results <- pred_rf_current_season %>% + dplyr::select(field, Age_days, predicted_Tcha, season) %>% + dplyr::rename(yield_forecast_t_ha = predicted_Tcha) + + safe_log("✓ Yield prediction complete") + return(list(summary = result, field_results = field_level_results)) + } else { + safe_log("No fields meet maturity threshold (DOY >= 240) for prediction", "WARNING") + return(list(summary = create_fallback_result(field_boundaries)$summary, + field_results = data.frame())) + } + + }, error = function(e) { + safe_log(paste("Error in yield prediction:", e$message), "ERROR") + return(create_fallback_result(field_boundaries)) + }) +}