From 34159b3003ff01327f443855300713b7d346f664 Mon Sep 17 00:00:00 2001
From: Timon <timon@resiliencebv.com>
Date: Tue, 17 Feb 2026 15:02:30 +0100
Subject: [PATCH] Enhance yield prediction capabilities by integrating Random
 Forest model with Forward Feature Selection; add robust data loading and
 validation for harvesting data across multiple utility scripts.

---
 r_app/80_calculate_kpis.R          |  19 +-
 r_app/80_utils_agronomic_support.R | 166 +++++++-------
 r_app/80_utils_cane_supply.R       |  53 +++++
 r_app/80_utils_common.R            | 334 +++++++++++++++++++++++++++++
 4 files changed, 476 insertions(+), 96 deletions(-)

diff --git a/r_app/80_calculate_kpis.R b/r_app/80_calculate_kpis.R
index 208a373..f73a002 100644
--- a/r_app/80_calculate_kpis.R
+++ b/r_app/80_calculate_kpis.R
@@ -141,6 +141,10 @@ suppressPackageStartupMessages({
   library(writexl)   # For writing Excel outputs (KPI summary tables)
   library(progress)  # For progress bars during field processing
   
+  # ML models (for yield prediction KPI)
+  library(caret)     # For training Random Forest with cross-validation
+  library(CAST)      # For Forward Feature Selection in caret models
+  
   # ML/Analysis (optional - only for harvest model inference)
   tryCatch({
     library(torch)   # For PyTorch model inference (harvest readiness prediction)
@@ -359,11 +363,8 @@ main <- function() {
       stop("ERROR loading field boundaries: ", e$message)
     })
     
-    # Load harvesting data
-    if (!exists("harvesting_data")) {
-      warning("harvesting_data not loaded. TCH KPI will use placeholder values.")
-      harvesting_data <- data.frame(field = character(), year = numeric(), tonnage_ha = numeric())
-    }
+    # Load harvesting data for yield prediction (using common helper function)
+    harvesting_data <- load_harvest_data(setup$data_dir)
     
     # Extract current week/year from end_date
     current_week <- as.numeric(format(end_date, "%V"))
@@ -379,6 +380,7 @@ main <- function() {
       ci_rds_path = NULL,
       harvesting_data = harvesting_data,
       output_dir = reports_dir_kpi,
+      data_dir = setup$data_dir,
       project_dir = project_dir
     )
 
@@ -419,11 +421,8 @@ main <- function() {
       stop("ERROR loading field boundaries: ", e$message)
     })
     
-    # Load harvesting data
-    if (!exists("harvesting_data")) {
-      warning("harvesting_data not loaded. TCH KPI will use placeholder values.")
-      harvesting_data <- data.frame(field = character(), year = numeric(), tonnage_ha = numeric())
-    }
+    # Load harvesting data for yield prediction (using common helper function)
+    harvesting_data <- load_harvest_data(setup$data_dir)
 
     # Extract current week/year from end_date
     current_week <- as.numeric(format(end_date, "%V"))
diff --git a/r_app/80_utils_agronomic_support.R b/r_app/80_utils_agronomic_support.R
index 8e55893..a71ea28 100644
--- a/r_app/80_utils_agronomic_support.R
+++ b/r_app/80_utils_agronomic_support.R
@@ -43,29 +43,6 @@ library(CAST)
 # These are now sourced from common utils and shared by all client types.
 # ============================================================================
 
-#' Prepare harvest predictions and ensure proper alignment with field data
-prepare_predictions <- function(harvest_model, field_data, scenario = "optimistic") {
-  if (is.null(harvest_model) || is.null(field_data)) {
-    return(NULL)
-  }
-  
-  tryCatch({
-    scenario_factor <- switch(scenario,
-                              "pessimistic" = 0.85,
-                              "realistic" = 1.0,
-                              "optimistic" = 1.15,
-                              1.0)
-    
-    predictions <- field_data %>%
-      mutate(tch_forecasted = field_data$mean_ci * scenario_factor)
-    
-    return(predictions)
-  }, error = function(e) {
-    message(paste("Error preparing predictions:", e$message))
-    return(NULL)
-  })
-}
-
 # ============================================================================
 # AURA KPI CALCULATION FUNCTIONS (6 KPIS)
 # ============================================================================
@@ -266,74 +243,91 @@ calculate_area_change_kpi <- function(current_stats, previous_stats) {
 
 #' KPI 3: Calculate TCH forecasted (tonnes of cane per hectare)
 #'
-#' Projects final harvest tonnage based on CI growth trajectory
+#' Projects final harvest tonnage based on historical yield data and CI growth trajectory.
+#' Uses a Random Forest model trained on harvest data to predict yields for mature fields.
+#' Delegates to calculate_yield_prediction_kpi() in 80_utils_common.R.
 #'
-#' @param field_statistics Current field statistics
-#' @param harvesting_data Historical harvest data (with yield observations)
-#' @param field_boundaries_sf Field geometries
+#' @param field_statistics Current field statistics (dataframe with Mean_CI or mean_ci column)
+#' @param harvesting_data Historical harvest data frame (with tonnage_ha column)
+#' @param field_boundaries_sf SF object with field geometries
+#' @param cumulative_CI_vals_dir Directory with combined CI RDS files (optional)
+#' @param data_dir Project data directory (from setup_project_directories or parameters_project.R)
+#'   Used to build cumulative_CI_vals_dir path if not provided directly (optional)
+#' @param project_dir Deprecated: only used if data_dir not provided (optional)
 #'
-#' @return Data frame with field-level TCH forecasts
-calculate_tch_forecasted_kpi <- function(field_statistics, harvesting_data = NULL, field_boundaries_sf = NULL) {
+#' @return Data frame with field-level yield forecasts ready for orchestrator
+#'   Columns: field_idx, tch_forecasted (yields in t/ha)
+calculate_tch_forecasted_kpi <- function(field_statistics, harvesting_data = NULL, 
+                                          field_boundaries_sf = NULL,
+                                          cumulative_CI_vals_dir = NULL,
+                                          data_dir = NULL,
+                                          project_dir = NULL) {
   
-    # Handle both naming conventions (Field_id/Mean_CI vs field_idx/mean_ci)
-  if ("Field_id" %in% names(field_statistics)) {
-    # Add field_idx to match field_boundaries row numbers
-    field_statistics <- field_statistics %>%
-      mutate(field_idx = match(Field_id, field_boundaries_sf$field))
-    mean_ci_col <- "Mean_CI"
-  } else {
-    mean_ci_col <- "mean_ci"
-  }
+  # Use common utils yield prediction function (handles all ML logic)
+  # This replaces the previous linear model (TCH = 50 + CI*10) with proper ML prediction
   
-  # Filter out any fields without a match
-  field_statistics <- field_statistics %>%
-    filter(!is.na(field_idx))
-  
-  if (nrow(field_statistics) == 0) {
-    warning("No fields matched between statistics and boundaries")
+  # Validate required parameters
+  if (is.null(field_boundaries_sf)) {
+    safe_log("field_boundaries_sf is NULL in calculate_tch_forecasted_kpi", "WARNING")
     return(data.frame(
       field_idx = integer(),
-      mean_ci = numeric(),
       tch_forecasted = numeric(),
-      tch_lower_bound = numeric(),
-      tch_upper_bound = numeric(),
-      confidence = character(),
       stringsAsFactors = FALSE
     ))
   }
-
-  result <- data.frame(
-    field_idx = field_statistics$field_idx,
-    mean_ci = field_statistics[[mean_ci_col]],
-    tch_forecasted = NA_real_,
-    tch_lower_bound = NA_real_,
-    tch_upper_bound = NA_real_,
-    confidence = NA_character_,
-    stringsAsFactors = FALSE
-  )
   
-  # Base TCH model: TCH = 50 + (CI * 10)
-  # This is a simplified model; production use should include more variables
-  
-  for (i in seq_len(nrow(result))) {
-    if (is.na(result$mean_ci[i])) {
-      result$confidence[i] <- "No data"
-      next
+  # Determine cumulative CI directory
+  if (is.null(cumulative_CI_vals_dir)) {
+    # Priority 1: Use provided data_dir parameter
+    if (!is.null(data_dir)) {
+      cumulative_CI_vals_dir <- file.path(data_dir, "extracted_ci", "cumulative_vals")
+    } else if (exists("data_dir", envir = .GlobalEnv)) {
+      # Priority 2: Fallback to global data_dir from parameters_project.R
+      cumulative_CI_vals_dir <- file.path(get("data_dir", envir = .GlobalEnv), "extracted_ci", "cumulative_vals")
+    } else {
+      # Priority 3: Last resort - log warning and fail gracefully
+      safe_log("Missing project data directory configuration: provide data_dir parameter or ensure parameters_project.R has set data_dir globally", "WARNING")
+      safe_log("No training data available for yield prediction", "WARNING")
+      return(data.frame(
+        field_idx = integer(),
+        tch_forecasted = numeric(),
+        stringsAsFactors = FALSE
+      ))
     }
-    
-    ci_val <- result$mean_ci[i]
-    
-    # Simple linear model
-    tch_est <- 50 + (ci_val * 10)
-    
-    # Confidence interval based on CI range
-    tch_lower <- tch_est * 0.85
-    tch_upper <- tch_est * 1.15
-    
-    result$tch_forecasted[i] <- round(tch_est, 2)
-    result$tch_lower_bound[i] <- round(tch_lower, 2)
-    result$tch_upper_bound[i] <- round(tch_upper, 2)
-    result$confidence[i] <- "Medium"
+  }
+  
+  # Call the shared yield prediction function from common utils
+  yield_result <- calculate_yield_prediction_kpi(field_boundaries_sf, harvesting_data, cumulative_CI_vals_dir)
+  
+  # Extract field-level results from the list
+  field_results <- yield_result$field_results
+  
+  # Convert to format expected by orchestrator
+  # If no predictions, return empty data frame
+  if (is.null(field_results) || nrow(field_results) == 0) {
+    return(data.frame(
+      field_idx = integer(),
+      tch_forecasted = numeric(),
+      stringsAsFactors = FALSE
+    ))
+  }
+  
+  # Map field names to field_idx using field_boundaries_sf
+  result <- field_results %>%
+    mutate(
+      field_idx = match(field, field_boundaries_sf$field),
+      tch_forecasted = yield_forecast_t_ha
+    ) %>%
+    filter(!is.na(field_idx)) %>%
+    select(field_idx, tch_forecasted)
+  
+  # Ensure result has proper structure even if empty
+  if (nrow(result) == 0) {
+    return(data.frame(
+      field_idx = integer(),
+      tch_forecasted = numeric(),
+      stringsAsFactors = FALSE
+    ))
   }
   
   return(result)
@@ -471,7 +465,7 @@ create_summary_tables <- function(all_kpis) {
       select(field_idx, mean_ci_pct_change, interpretation),
     
     tch_forecast = all_kpis$tch_forecasted %>%
-      select(field_idx, mean_ci, tch_forecasted, tch_lower_bound, tch_upper_bound, confidence),
+      select(field_idx, tch_forecasted),
     
     growth_decline = all_kpis$growth_decline %>%
       select(field_idx, four_week_trend, trend_interpretation, decline_severity),
@@ -479,9 +473,9 @@ create_summary_tables <- function(all_kpis) {
     weed_pressure = all_kpis$weed_presence %>%
       select(field_idx, fragmentation_index, weed_pressure_risk),
     
-    gap_filling = if (!is.null(all_kpis$gap_filling)) {
+    gap_filling = if (!is.null(all_kpis$gap_filling) && nrow(all_kpis$gap_filling) > 0) {
       all_kpis$gap_filling %>%
-        select(field_idx, gap_score, gap_level, mean_ci)
+        select(field_idx, gap_score, gap_level)
     } else {
       NULL
     }
@@ -527,9 +521,7 @@ create_field_detail_table <- function(field_boundaries_sf, all_kpis, current_wee
     ) %>%
     left_join(
       all_kpis$tch_forecasted %>% 
-        select(field_idx, Mean_CI = mean_ci, TCH_Forecasted = tch_forecasted, 
-               TCH_Lower = tch_lower_bound, TCH_Upper = tch_upper_bound, 
-               TCH_Confidence = confidence),
+        select(field_idx, TCH_Forecasted = tch_forecasted),
       by = "field_idx"
     ) %>%
     left_join(
@@ -647,6 +639,7 @@ calculate_all_field_analysis_agronomic_support <- function(
     ci_rds_path = NULL,
     harvesting_data = NULL,
     output_dir = NULL,
+    data_dir = NULL,
     project_dir = NULL
 ) {
   
@@ -794,7 +787,8 @@ calculate_all_field_analysis_agronomic_support <- function(
   }
   
   message("Calculating KPI 3: TCH Forecasted...")
-  tch_kpi <- calculate_tch_forecasted_kpi(current_stats, harvesting_data, field_boundaries_sf)
+  tch_kpi <- calculate_tch_forecasted_kpi(current_stats, harvesting_data, field_boundaries_sf, 
+                                           data_dir = data_dir, project_dir = project_dir)
   
   message("Calculating KPI 4: Growth Decline...")
   growth_decline_kpi <- calculate_growth_decline_kpi(
diff --git a/r_app/80_utils_cane_supply.R b/r_app/80_utils_cane_supply.R
index ad2cdf5..fa14f64 100644
--- a/r_app/80_utils_cane_supply.R
+++ b/r_app/80_utils_cane_supply.R
@@ -166,6 +166,59 @@ calculate_status_alert <- function(imminent_prob, age_week, weekly_ci_change, me
   NA_character_
 }
 
+#' Calculate yield prediction for CANE_SUPPLY workflows (Wrapper)
+#'
+#' This function wraps the shared yield prediction model from 80_utils_common.R
+#' to provide CANE_SUPPLY clients (e.g., ANGATA) with ML-based yield forecasting.
+#' 
+#' Uses Random Forest with Forward Feature Selection trained on:
+#'   - Cumulative Canopy Index (CI) from growth model
+#'   - Days of Year (DOY) / crop age
+#'   - CI-per-day (growth velocity)
+#'
+#' Predicts yields for mature fields (DOY >= 240, ~8 months) into quartiles:
+#'   - Top 25%: High-yield fields
+#'   - Average: Mid-range yield fields
+#'   - Lowest 25%: Lower-yield fields
+#'
+#' @param field_boundaries_sf SF object with field geometries
+#' @param harvesting_data Data frame with harvest history (must have tonnage_ha column)
+#' @param cumulative_CI_vals_dir Directory with combined CI RDS files
+#'
+#' @return List with:
+#'   - summary: Data frame with field_groups, count, and yield quartile predictions
+#'   - field_results: Data frame with field-level forecasts (yield_forecast_t_ha in t/ha)
+#'
+#' @details
+#' **Data Requirements:**
+#'   - harvesting_data must include tonnage_ha column (yield in t/ha) for training
+#'   - cumulative_CI_vals_dir must contain "All_pivots_Cumulative_CI_quadrant_year_v2.rds"
+#'   - If either is missing, returns graceful fallback with NA values (not fake numbers)
+#'
+#' **Integration:**
+#'   This can be called from calculate_all_field_kpis() in cane_supply workflow to add
+#'   a new "Yield_Forecast" column to the 22-column KPI output.
+#'
+#' **Example:**
+#'   ```r
+#'   yield_result <- calculate_yield_prediction_kpi_cane_supply(
+#'     field_boundaries_sf, 
+#'     harvesting_data, 
+#'     file.path(data_dir, "combined_CI")
+#'   )
+#'   # yield_result$summary has quartiles
+#'   # yield_result$field_results has per-field forecasts
+#'   ```
+calculate_yield_prediction_kpi_cane_supply <- function(field_boundaries_sf, 
+                                                       harvesting_data, 
+                                                       cumulative_CI_vals_dir) {
+  
+  # Call the shared yield prediction function from 80_utils_common.R
+  result <- calculate_yield_prediction_kpi(field_boundaries_sf, harvesting_data, cumulative_CI_vals_dir)
+  
+  return(result)
+}
+
 
 #' Build complete per-field KPI dataframe with all 22 columns
 #' @param current_stats data.frame with current week statistics from load_or_calculate_weekly_stats
diff --git a/r_app/80_utils_common.R b/r_app/80_utils_common.R
index 78962e9..29739cf 100644
--- a/r_app/80_utils_common.R
+++ b/r_app/80_utils_common.R
@@ -8,8 +8,13 @@
 #   - Field statistics extraction
 #   - Week/year calculations for consistent date handling
 #   - Excel/CSV/RDS export utilities
+#   - Yield prediction using ML models (Random Forest with Feature Selection)
 #
 # Used by: 80_calculate_kpis.R, all client-specific utils files
+#
+# NOTE: Libraries required by yield prediction (caret, CAST, here) are loaded
+#   in the main script 80_calculate_kpis.R, not here. This keeps dependencies
+#   centralized in the orchestrator script.
 # ============================================================================
 
 # ============================================================================
@@ -590,6 +595,81 @@ extract_planting_dates <- function(harvesting_data, field_boundaries_sf = NULL)
   })
 }
 
+# ============================================================================
+# DATA LOADING HELPERS
+# ============================================================================
+
+#' Load and validate harvest data from harvest.xlsx
+#'
+#' Encapsulates harvest data loading with validation, type coercion, and error handling.
+#' Returns a data frame with required columns (field, year, tonnage_ha) or an empty 
+#' data frame with proper structure if loading fails.
+#'
+#' @param data_dir Path to data directory containing harvest.xlsx
+#'
+#' @return data.frame with columns: field (character), year (numeric), tonnage_ha (numeric)
+#'   - On success: data frame with N rows of harvest records
+#'   - On failure: empty data frame with correct structure (0 rows, 3 columns)
+#'
+#' @details
+#' **File Location**: Expected at `file.path(data_dir, "harvest.xlsx")`
+#'
+#' **Validation**:
+#'   - Checks file existence before reading
+#'   - Validates required columns: field, year, tonnage_ha
+#'   - Coerces year and tonnage_ha to numeric
+#'   - Logs status messages for debugging
+#'
+#' **Error Handling**:
+#'   - Missing file: Returns empty DF (logs NOTE)
+#'   - Missing columns: Returns empty DF (logs WARNING)
+#'   - Read errors: Returns empty DF (logs WARNING)
+#'   - Always returns valid data frame structure (won't return NULL)
+#'
+#' **Usage**:
+#'   ```r
+#'   harvesting_data <- load_harvest_data(setup$data_dir)
+#'   # harvesting_data is guaranteed to be a data.frame with 3 columns
+#'   # even if harvest.xlsx is unavailable or invalid
+#'   ```
+load_harvest_data <- function(data_dir) {
+  harvesting_data <- NULL
+  harvest_file <- file.path(data_dir, "harvest.xlsx")
+  
+  if (file.exists(harvest_file)) {
+    tryCatch({
+      harvesting_data <- readxl::read_excel(harvest_file)
+      
+      # Ensure required columns are present
+      required_cols <- c("field", "year", "tonnage_ha")
+      if (all(required_cols %in% names(harvesting_data))) {
+        # Convert to data frame and ensure column types
+        harvesting_data <- as.data.frame(harvesting_data)
+        harvesting_data$year <- as.numeric(harvesting_data$year)
+        harvesting_data$tonnage_ha <- as.numeric(harvesting_data$tonnage_ha)
+        
+        message(paste("  ✓ Loaded harvest data:", nrow(harvesting_data), "records from harvest.xlsx"))
+        return(harvesting_data)
+      } else {
+        message(paste("  WARNING: harvest.xlsx missing required columns. Expected: field, year, tonnage_ha"))
+        harvesting_data <- NULL
+      }
+    }, error = function(e) {
+      message(paste("  WARNING: Could not read harvest.xlsx:", e$message))
+    })
+  } else {
+    message(paste("  NOTE: harvest.xlsx not found at", harvest_file))
+  }
+  
+  # Fallback: create empty data frame if loading failed
+  if (is.null(harvesting_data)) {
+    message("  WARNING: No harvest data available. TCH yield prediction will use graceful fallback (NA values)")
+    harvesting_data <- data.frame(field = character(), year = numeric(), tonnage_ha = numeric())
+  }
+  
+  return(harvesting_data)
+}
+
 # ============================================================================
 # FIELD STATISTICS EXTRACTION
 # ============================================================================
@@ -1386,3 +1466,257 @@ prepare_predictions <- function(predictions, newdata) {
     dplyr::left_join(., newdata, by = c("field", "sub_field", "season"))
   )
 }
+
+# ============================================================================
+# YIELD PREDICTION KPI (SHARED ML-BASED MODEL FOR ALL CLIENT TYPES)
+# ============================================================================
+
+#' Helper function for graceful fallback when training data unavailable
+#'
+#' @param field_boundaries Field boundaries (sf or SpatVector)
+#' @return List with summary and field_results (both with NA values)
+create_fallback_result <- function(field_boundaries) {
+  # Convert to SpatVector if needed (for terra::project)
+  if (!inherits(field_boundaries, "SpatVector")) {
+    field_boundaries <- terra::vect(field_boundaries)
+  }
+  field_boundaries_projected <- terra::project(field_boundaries, "EPSG:6933") # Equal Earth projection
+  field_areas <- terra::expanse(field_boundaries_projected) / 10000  # Convert m² to hectares
+  total_area <- sum(field_areas)
+
+  summary_result <- data.frame(
+    field_groups = c("Top 25%", "Average", "Lowest 25%", "Total area forecasted"),
+    count = c(0, 0, 0, nrow(field_boundaries)),
+    value = c(NA_real_, NA_real_, NA_real_, round(total_area, 1)),
+    stringsAsFactors = FALSE
+  )
+  
+  field_results <- data.frame(
+    field = character(0),
+    sub_field = character(0),
+    Age_days = numeric(0),
+    yield_forecast_t_ha = numeric(0),
+    season = numeric(0),
+    stringsAsFactors = FALSE
+  )
+
+  return(list(summary = summary_result, field_results = field_results))
+}
+
+#' Calculate yield prediction KPI using Random Forest with Feature Selection
+#'
+#' Trains a Random Forest model on historical harvest data with cumulative CI,
+#' days of year (DOY), and CI-per-day as predictors. Uses CAST::ffs() for
+#' Forward Feature Selection. Predicts yields for mature fields (DOY >= 240).
+#' 
+#' @param field_boundaries Field boundaries (sf or SpatVector)
+#' @param harvesting_data Data frame with harvest data including tonnage_ha column
+#' @param cumulative_CI_vals_dir Directory containing "All_pivots_Cumulative_CI_quadrant_year_v2.rds"
+#'
+#' @return List with:
+#'   - summary: Data frame with field_groups, count, value (quartiles and total area)
+#'   - field_results: Data frame with field-level yield forecasts (yield_forecast_t_ha)
+#'
+#' @details
+#' **Training Data Requirements:**
+#'   - cumulative_CI_vals_dir must contain "All_pivots_Cumulative_CI_quadrant_year_v2.rds"
+#'   - harvesting_data must have tonnage_ha column with numeric yield values
+#'   - Training stops gracefully if either is missing (returns NA values, not fake numbers)
+#'
+#' **Model Specifications:**
+#'   - Algorithm: Random Forest (caret + CAST)
+#'   - Feature Selection: Forward Feature Selection (CAST::ffs)
+#'   - Cross-validation: 5-fold CV
+#'   - Predictors: cumulative_CI, DOY, CI_per_day
+#'   - Mature field threshold: DOY >= 240 (8 months)
+#'   - Output: Field-level yield forecasts grouped by quartile
+#'
+#' **Error Handling:**
+#'   - Missing tonnage_ha: Returns graceful fallback with NA (not zero) values
+#'   - No training data: Logs WARNING, returns empty field_results
+#'   - RDS file missing: Returns graceful fallback
+#'   - Prediction errors: Wrapped in tryCatch, returns fallback on failure
+calculate_yield_prediction_kpi <- function(field_boundaries, harvesting_data, cumulative_CI_vals_dir) {
+  safe_log("Calculating yield prediction KPI using Random Forest with Feature Selection")
+
+  tryCatch({
+    # Check if tonnage_ha column is present and has valid data
+    if (is.null(harvesting_data) || 
+        !("tonnage_ha" %in% names(harvesting_data)) || 
+        all(is.na(harvesting_data$tonnage_ha))) {
+      safe_log("No harvest data available: lacking tonnage_ha column or all values are NA", "WARNING")
+      return(create_fallback_result(field_boundaries))
+    }
+
+    # Check if CI quadrant RDS file exists
+    ci_quadrant_path <- file.path(cumulative_CI_vals_dir, "All_pivots_Cumulative_CI_quadrant_year_v2.rds")
+    if (!file.exists(ci_quadrant_path)) {
+      safe_log(paste("CI quadrant file not found at:", ci_quadrant_path), "WARNING")
+      return(create_fallback_result(field_boundaries))
+    }
+
+    # Load CI quadrant data and fill missing field/sub_field values
+    CI_quadrant <- readRDS(ci_quadrant_path) %>%
+      dplyr::group_by(model) %>%
+      tidyr::fill(field, sub_field, .direction = "downup") %>%
+      dplyr::ungroup()
+
+    # Rename year column to season for consistency
+    harvesting_data_renamed <- harvesting_data %>% dplyr::rename(season = year)
+
+    # Join CI and yield data
+    CI_and_yield <- dplyr::left_join(CI_quadrant, harvesting_data_renamed, 
+                                      by = c("field", "sub_field", "season")) %>%
+      dplyr::group_by(sub_field, season) %>%
+      dplyr::slice(which.max(DOY)) %>%
+      dplyr::select(field, sub_field, tonnage_ha, cumulative_CI, DOY, season, sub_area) %>%
+      dplyr::mutate(CI_per_day = cumulative_CI / DOY)
+
+    # Define predictors and response variables
+    predictors <- c("cumulative_CI", "DOY", "CI_per_day")
+    response <- "tonnage_ha"
+
+    # Prepare training dataset (fields with harvest data)
+    CI_and_yield_test <- CI_and_yield %>%
+      as.data.frame() %>%
+      dplyr::filter(!is.na(tonnage_ha))
+
+    # Check if we have minimum training data
+    if (nrow(CI_and_yield_test) == 0) {
+      safe_log("No training data available: no fields with tonnage_ha observations", "WARNING")
+      return(create_fallback_result(field_boundaries))
+    }
+
+    # Pre-clean training data: remove rows with any NAs in predictors or response
+    # This is required because CAST::ffs doesn't support na.rm parameter
+    CI_and_yield_test <- CI_and_yield_test %>%
+      dplyr::filter(!dplyr::if_any(dplyr::all_of(c(predictors, response)), is.na))
+
+    if (nrow(CI_and_yield_test) == 0) {
+      safe_log("No complete training data after removing NAs in predictors/response", "WARNING")
+      return(create_fallback_result(field_boundaries))
+    }
+
+    # Prepare prediction dataset (fields without harvest data, mature fields only)
+    prediction_yields <- CI_and_yield %>%
+      as.data.frame() %>%
+      dplyr::filter(is.na(tonnage_ha) & DOY >= 240)  # Mature fields only
+
+    # Configure model training parameters
+    ctrl <- caret::trainControl(
+      method = "cv",
+      savePredictions = TRUE,
+      allowParallel = TRUE,
+      number = 5,
+      verboseIter = FALSE
+    )
+
+    # Train the model with forward feature selection
+    set.seed(202) # For reproducibility
+    safe_log("Training Random Forest model with Forward Feature Selection...")
+    model_ffs_rf <- CAST::ffs(
+      CI_and_yield_test[, predictors],
+      CI_and_yield_test[, response],
+      method = "rf",
+      trControl = ctrl,
+      importance = TRUE,
+      withinSE = TRUE,
+      tuneLength = 5
+    )
+
+    # Predict yields on validation data (same as training data for RMSE calculation)
+    pred_ffs_rf <- prepare_predictions(
+      stats::predict(model_ffs_rf, newdata = CI_and_yield_test), 
+      CI_and_yield_test
+    )
+
+    # Extract cross-validated RMSE from the model object (more honest than in-sample RMSE)
+    # The CAST::ffs model stores CV results in $results data frame
+    # We extract the RMSE from the best model (lowest RMSE across folds)
+    if (!is.null(model_ffs_rf$results) && "RMSE" %in% names(model_ffs_rf$results)) {
+      # Get minimum RMSE from cross-validation results (best model from feature selection)
+      rmse_value <- min(model_ffs_rf$results$RMSE, na.rm = TRUE)
+      safe_log(paste("Yield prediction RMSE (cross-validated):", round(rmse_value, 2), "t/ha"))
+    } else {
+      # Fallback: compute in-sample RMSE if CV results unavailable, but label it clearly
+      rmse_value <- sqrt(mean((pred_ffs_rf$predicted_Tcha - CI_and_yield_test$tonnage_ha)^2, na.rm = TRUE))
+      safe_log(paste("Yield prediction RMSE (in-sample/training):", round(rmse_value, 2), "t/ha"))
+    }
+
+    # Predict yields for current season (mature fields >= 240 days)
+    if (nrow(prediction_yields) > 0) {
+      pred_rf_current_season <- prepare_predictions(
+        stats::predict(model_ffs_rf, newdata = prediction_yields), 
+        prediction_yields
+      ) %>%
+        dplyr::filter(Age_days >= 240) %>%
+        dplyr::select(c("field", "Age_days", "predicted_Tcha", "season"))
+    } else {
+      pred_rf_current_season <- data.frame()
+    }
+
+    # Calculate summary statistics for KPI
+    if (nrow(pred_rf_current_season) > 0) {
+      safe_log(paste("Number of fields with yield predictions:", nrow(pred_rf_current_season)))
+      safe_log(paste("Predicted yield range:", 
+                     round(min(pred_rf_current_season$predicted_Tcha, na.rm = TRUE), 1),
+                     "-", 
+                     round(max(pred_rf_current_season$predicted_Tcha, na.rm = TRUE), 1),
+                     "t/ha"))
+
+      # Calculate quartiles for grouping
+      yield_quartiles <- quantile(pred_rf_current_season$predicted_Tcha, 
+                                  probs = c(0.25, 0.5, 0.75), na.rm = TRUE)
+
+      # Count fields in each quartile
+      top_25_count <- sum(pred_rf_current_season$predicted_Tcha >= yield_quartiles[3], na.rm = TRUE)
+      average_count <- sum(pred_rf_current_season$predicted_Tcha >= yield_quartiles[1] & 
+                           pred_rf_current_season$predicted_Tcha < yield_quartiles[3], na.rm = TRUE)
+      lowest_25_count <- sum(pred_rf_current_season$predicted_Tcha < yield_quartiles[1], na.rm = TRUE)
+
+      # Calculate total area
+      if (!inherits(field_boundaries, "SpatVector")) {
+        field_boundaries_vect <- terra::vect(field_boundaries)
+      } else {
+        field_boundaries_vect <- field_boundaries
+      }
+
+      # Handle both sf and SpatVector inputs for area calculation
+      if (inherits(field_boundaries, "sf")) {
+        field_boundaries_projected <- sf::st_transform(field_boundaries, "EPSG:6933")
+        field_areas <- sf::st_area(field_boundaries_projected) / 10000  # m² to hectares
+      } else {
+        field_boundaries_projected <- terra::project(field_boundaries_vect, "EPSG:6933")
+        field_areas <- terra::expanse(field_boundaries_projected) / 10000
+      }
+      total_area <- sum(as.numeric(field_areas))
+
+      safe_log(paste("Total area:", round(total_area, 1), "hectares"))
+
+      # Build summary result
+      result <- data.frame(
+        field_groups = c("Top 25%", "Average", "Lowest 25%", "Total area forecasted"),
+        count = c(top_25_count, average_count, lowest_25_count, nrow(field_boundaries)),
+        value = c(round(yield_quartiles[3], 1), round(yield_quartiles[2], 1), 
+                 round(yield_quartiles[1], 1), round(total_area, 1)),
+        stringsAsFactors = FALSE
+      )
+
+      # Prepare field-level results
+      field_level_results <- pred_rf_current_season %>%
+        dplyr::select(field, Age_days, predicted_Tcha, season) %>%
+        dplyr::rename(yield_forecast_t_ha = predicted_Tcha)
+
+      safe_log("✓ Yield prediction complete")
+      return(list(summary = result, field_results = field_level_results))
+    } else {
+      safe_log("No fields meet maturity threshold (DOY >= 240) for prediction", "WARNING")
+      return(list(summary = create_fallback_result(field_boundaries)$summary, 
+                 field_results = data.frame()))
+    }
+
+  }, error = function(e) {
+    safe_log(paste("Error in yield prediction:", e$message), "ERROR")
+    return(create_fallback_result(field_boundaries))
+  })
+}