Enhance yield prediction capabilities by integrating Random Forest model with Forward Feature Selection; add robust data loading and validation for harvesting data across multiple utility scripts.

This commit is contained in:
Timon 2026-02-17 15:02:30 +01:00
parent 640e239815
commit 34159b3003
4 changed files with 476 additions and 96 deletions

View file

@ -141,6 +141,10 @@ suppressPackageStartupMessages({
library(writexl) # For writing Excel outputs (KPI summary tables)
library(progress) # For progress bars during field processing
# ML models (for yield prediction KPI)
library(caret) # For training Random Forest with cross-validation
library(CAST) # For Forward Feature Selection in caret models
# ML/Analysis (optional - only for harvest model inference)
tryCatch({
library(torch) # For PyTorch model inference (harvest readiness prediction)
@ -359,11 +363,8 @@ main <- function() {
stop("ERROR loading field boundaries: ", e$message)
})
# Load harvesting data
if (!exists("harvesting_data")) {
warning("harvesting_data not loaded. TCH KPI will use placeholder values.")
harvesting_data <- data.frame(field = character(), year = numeric(), tonnage_ha = numeric())
}
# Load harvesting data for yield prediction (using common helper function)
harvesting_data <- load_harvest_data(setup$data_dir)
# Extract current week/year from end_date
current_week <- as.numeric(format(end_date, "%V"))
@ -379,6 +380,7 @@ main <- function() {
ci_rds_path = NULL,
harvesting_data = harvesting_data,
output_dir = reports_dir_kpi,
data_dir = setup$data_dir,
project_dir = project_dir
)
@ -419,11 +421,8 @@ main <- function() {
stop("ERROR loading field boundaries: ", e$message)
})
# Load harvesting data
if (!exists("harvesting_data")) {
warning("harvesting_data not loaded. TCH KPI will use placeholder values.")
harvesting_data <- data.frame(field = character(), year = numeric(), tonnage_ha = numeric())
}
# Load harvesting data for yield prediction (using common helper function)
harvesting_data <- load_harvest_data(setup$data_dir)
# Extract current week/year from end_date
current_week <- as.numeric(format(end_date, "%V"))

View file

@ -43,29 +43,6 @@ library(CAST)
# These are now sourced from common utils and shared by all client types.
# ============================================================================
#' Prepare harvest predictions and ensure proper alignment with field data
prepare_predictions <- function(harvest_model, field_data, scenario = "optimistic") {
if (is.null(harvest_model) || is.null(field_data)) {
return(NULL)
}
tryCatch({
scenario_factor <- switch(scenario,
"pessimistic" = 0.85,
"realistic" = 1.0,
"optimistic" = 1.15,
1.0)
predictions <- field_data %>%
mutate(tch_forecasted = field_data$mean_ci * scenario_factor)
return(predictions)
}, error = function(e) {
message(paste("Error preparing predictions:", e$message))
return(NULL)
})
}
# ============================================================================
# AURA KPI CALCULATION FUNCTIONS (6 KPIS)
# ============================================================================
@ -266,74 +243,91 @@ calculate_area_change_kpi <- function(current_stats, previous_stats) {
#' KPI 3: Calculate TCH forecasted (tonnes of cane per hectare)
#'
#' Projects final harvest tonnage based on CI growth trajectory
#' Projects final harvest tonnage based on historical yield data and CI growth trajectory.
#' Uses a Random Forest model trained on harvest data to predict yields for mature fields.
#' Delegates to calculate_yield_prediction_kpi() in 80_utils_common.R.
#'
#' @param field_statistics Current field statistics
#' @param harvesting_data Historical harvest data (with yield observations)
#' @param field_boundaries_sf Field geometries
#' @param field_statistics Current field statistics (dataframe with Mean_CI or mean_ci column)
#' @param harvesting_data Historical harvest data frame (with tonnage_ha column)
#' @param field_boundaries_sf SF object with field geometries
#' @param cumulative_CI_vals_dir Directory with combined CI RDS files (optional)
#' @param data_dir Project data directory (from setup_project_directories or parameters_project.R)
#' Used to build cumulative_CI_vals_dir path if not provided directly (optional)
#' @param project_dir Deprecated: only used if data_dir not provided (optional)
#'
#' @return Data frame with field-level TCH forecasts
calculate_tch_forecasted_kpi <- function(field_statistics, harvesting_data = NULL, field_boundaries_sf = NULL) {
#' @return Data frame with field-level yield forecasts ready for orchestrator
#' Columns: field_idx, tch_forecasted (yields in t/ha)
calculate_tch_forecasted_kpi <- function(field_statistics, harvesting_data = NULL,
field_boundaries_sf = NULL,
cumulative_CI_vals_dir = NULL,
data_dir = NULL,
project_dir = NULL) {
# Handle both naming conventions (Field_id/Mean_CI vs field_idx/mean_ci)
if ("Field_id" %in% names(field_statistics)) {
# Add field_idx to match field_boundaries row numbers
field_statistics <- field_statistics %>%
mutate(field_idx = match(Field_id, field_boundaries_sf$field))
mean_ci_col <- "Mean_CI"
} else {
mean_ci_col <- "mean_ci"
}
# Use common utils yield prediction function (handles all ML logic)
# This replaces the previous linear model (TCH = 50 + CI*10) with proper ML prediction
# Filter out any fields without a match
field_statistics <- field_statistics %>%
filter(!is.na(field_idx))
if (nrow(field_statistics) == 0) {
warning("No fields matched between statistics and boundaries")
# Validate required parameters
if (is.null(field_boundaries_sf)) {
safe_log("field_boundaries_sf is NULL in calculate_tch_forecasted_kpi", "WARNING")
return(data.frame(
field_idx = integer(),
mean_ci = numeric(),
tch_forecasted = numeric(),
tch_lower_bound = numeric(),
tch_upper_bound = numeric(),
confidence = character(),
stringsAsFactors = FALSE
))
}
result <- data.frame(
field_idx = field_statistics$field_idx,
mean_ci = field_statistics[[mean_ci_col]],
tch_forecasted = NA_real_,
tch_lower_bound = NA_real_,
tch_upper_bound = NA_real_,
confidence = NA_character_,
stringsAsFactors = FALSE
)
# Base TCH model: TCH = 50 + (CI * 10)
# This is a simplified model; production use should include more variables
for (i in seq_len(nrow(result))) {
if (is.na(result$mean_ci[i])) {
result$confidence[i] <- "No data"
next
# Determine cumulative CI directory
if (is.null(cumulative_CI_vals_dir)) {
# Priority 1: Use provided data_dir parameter
if (!is.null(data_dir)) {
cumulative_CI_vals_dir <- file.path(data_dir, "extracted_ci", "cumulative_vals")
} else if (exists("data_dir", envir = .GlobalEnv)) {
# Priority 2: Fallback to global data_dir from parameters_project.R
cumulative_CI_vals_dir <- file.path(get("data_dir", envir = .GlobalEnv), "extracted_ci", "cumulative_vals")
} else {
# Priority 3: Last resort - log warning and fail gracefully
safe_log("Missing project data directory configuration: provide data_dir parameter or ensure parameters_project.R has set data_dir globally", "WARNING")
safe_log("No training data available for yield prediction", "WARNING")
return(data.frame(
field_idx = integer(),
tch_forecasted = numeric(),
stringsAsFactors = FALSE
))
}
ci_val <- result$mean_ci[i]
# Simple linear model
tch_est <- 50 + (ci_val * 10)
# Confidence interval based on CI range
tch_lower <- tch_est * 0.85
tch_upper <- tch_est * 1.15
result$tch_forecasted[i] <- round(tch_est, 2)
result$tch_lower_bound[i] <- round(tch_lower, 2)
result$tch_upper_bound[i] <- round(tch_upper, 2)
result$confidence[i] <- "Medium"
}
# Call the shared yield prediction function from common utils
yield_result <- calculate_yield_prediction_kpi(field_boundaries_sf, harvesting_data, cumulative_CI_vals_dir)
# Extract field-level results from the list
field_results <- yield_result$field_results
# Convert to format expected by orchestrator
# If no predictions, return empty data frame
if (is.null(field_results) || nrow(field_results) == 0) {
return(data.frame(
field_idx = integer(),
tch_forecasted = numeric(),
stringsAsFactors = FALSE
))
}
# Map field names to field_idx using field_boundaries_sf
result <- field_results %>%
mutate(
field_idx = match(field, field_boundaries_sf$field),
tch_forecasted = yield_forecast_t_ha
) %>%
filter(!is.na(field_idx)) %>%
select(field_idx, tch_forecasted)
# Ensure result has proper structure even if empty
if (nrow(result) == 0) {
return(data.frame(
field_idx = integer(),
tch_forecasted = numeric(),
stringsAsFactors = FALSE
))
}
return(result)
@ -471,7 +465,7 @@ create_summary_tables <- function(all_kpis) {
select(field_idx, mean_ci_pct_change, interpretation),
tch_forecast = all_kpis$tch_forecasted %>%
select(field_idx, mean_ci, tch_forecasted, tch_lower_bound, tch_upper_bound, confidence),
select(field_idx, tch_forecasted),
growth_decline = all_kpis$growth_decline %>%
select(field_idx, four_week_trend, trend_interpretation, decline_severity),
@ -479,9 +473,9 @@ create_summary_tables <- function(all_kpis) {
weed_pressure = all_kpis$weed_presence %>%
select(field_idx, fragmentation_index, weed_pressure_risk),
gap_filling = if (!is.null(all_kpis$gap_filling)) {
gap_filling = if (!is.null(all_kpis$gap_filling) && nrow(all_kpis$gap_filling) > 0) {
all_kpis$gap_filling %>%
select(field_idx, gap_score, gap_level, mean_ci)
select(field_idx, gap_score, gap_level)
} else {
NULL
}
@ -527,9 +521,7 @@ create_field_detail_table <- function(field_boundaries_sf, all_kpis, current_wee
) %>%
left_join(
all_kpis$tch_forecasted %>%
select(field_idx, Mean_CI = mean_ci, TCH_Forecasted = tch_forecasted,
TCH_Lower = tch_lower_bound, TCH_Upper = tch_upper_bound,
TCH_Confidence = confidence),
select(field_idx, TCH_Forecasted = tch_forecasted),
by = "field_idx"
) %>%
left_join(
@ -647,6 +639,7 @@ calculate_all_field_analysis_agronomic_support <- function(
ci_rds_path = NULL,
harvesting_data = NULL,
output_dir = NULL,
data_dir = NULL,
project_dir = NULL
) {
@ -794,7 +787,8 @@ calculate_all_field_analysis_agronomic_support <- function(
}
message("Calculating KPI 3: TCH Forecasted...")
tch_kpi <- calculate_tch_forecasted_kpi(current_stats, harvesting_data, field_boundaries_sf)
tch_kpi <- calculate_tch_forecasted_kpi(current_stats, harvesting_data, field_boundaries_sf,
data_dir = data_dir, project_dir = project_dir)
message("Calculating KPI 4: Growth Decline...")
growth_decline_kpi <- calculate_growth_decline_kpi(

View file

@ -166,6 +166,59 @@ calculate_status_alert <- function(imminent_prob, age_week, weekly_ci_change, me
NA_character_
}
#' Calculate yield prediction for CANE_SUPPLY workflows (Wrapper)
#'
#' This function wraps the shared yield prediction model from 80_utils_common.R
#' to provide CANE_SUPPLY clients (e.g., ANGATA) with ML-based yield forecasting.
#'
#' Uses Random Forest with Forward Feature Selection trained on:
#' - Cumulative Canopy Index (CI) from growth model
#' - Days of Year (DOY) / crop age
#' - CI-per-day (growth velocity)
#'
#' Predicts yields for mature fields (DOY >= 240, ~8 months) into quartiles:
#' - Top 25%: High-yield fields
#' - Average: Mid-range yield fields
#' - Lowest 25%: Lower-yield fields
#'
#' @param field_boundaries_sf SF object with field geometries
#' @param harvesting_data Data frame with harvest history (must have tonnage_ha column)
#' @param cumulative_CI_vals_dir Directory with combined CI RDS files
#'
#' @return List with:
#' - summary: Data frame with field_groups, count, and yield quartile predictions
#' - field_results: Data frame with field-level forecasts (yield_forecast_t_ha in t/ha)
#'
#' @details
#' **Data Requirements:**
#' - harvesting_data must include tonnage_ha column (yield in t/ha) for training
#' - cumulative_CI_vals_dir must contain "All_pivots_Cumulative_CI_quadrant_year_v2.rds"
#' - If either is missing, returns graceful fallback with NA values (not fake numbers)
#'
#' **Integration:**
#' This can be called from calculate_all_field_kpis() in cane_supply workflow to add
#' a new "Yield_Forecast" column to the 22-column KPI output.
#'
#' **Example:**
#' ```r
#' yield_result <- calculate_yield_prediction_kpi_cane_supply(
#' field_boundaries_sf,
#' harvesting_data,
#' file.path(data_dir, "combined_CI")
#' )
#' # yield_result$summary has quartiles
#' # yield_result$field_results has per-field forecasts
#' ```
calculate_yield_prediction_kpi_cane_supply <- function(field_boundaries_sf,
harvesting_data,
cumulative_CI_vals_dir) {
# Call the shared yield prediction function from 80_utils_common.R
result <- calculate_yield_prediction_kpi(field_boundaries_sf, harvesting_data, cumulative_CI_vals_dir)
return(result)
}
#' Build complete per-field KPI dataframe with all 22 columns
#' @param current_stats data.frame with current week statistics from load_or_calculate_weekly_stats

View file

@ -8,8 +8,13 @@
# - Field statistics extraction
# - Week/year calculations for consistent date handling
# - Excel/CSV/RDS export utilities
# - Yield prediction using ML models (Random Forest with Feature Selection)
#
# Used by: 80_calculate_kpis.R, all client-specific utils files
#
# NOTE: Libraries required by yield prediction (caret, CAST, here) are loaded
# in the main script 80_calculate_kpis.R, not here. This keeps dependencies
# centralized in the orchestrator script.
# ============================================================================
# ============================================================================
@ -590,6 +595,81 @@ extract_planting_dates <- function(harvesting_data, field_boundaries_sf = NULL)
})
}
# ============================================================================
# DATA LOADING HELPERS
# ============================================================================
#' Load and validate harvest data from harvest.xlsx
#'
#' Encapsulates harvest data loading with validation, type coercion, and error handling.
#' Returns a data frame with required columns (field, year, tonnage_ha) or an empty
#' data frame with proper structure if loading fails.
#'
#' @param data_dir Path to data directory containing harvest.xlsx
#'
#' @return data.frame with columns: field (character), year (numeric), tonnage_ha (numeric)
#' - On success: data frame with N rows of harvest records
#' - On failure: empty data frame with correct structure (0 rows, 3 columns)
#'
#' @details
#' **File Location**: Expected at `file.path(data_dir, "harvest.xlsx")`
#'
#' **Validation**:
#' - Checks file existence before reading
#' - Validates required columns: field, year, tonnage_ha
#' - Coerces year and tonnage_ha to numeric
#' - Logs status messages for debugging
#'
#' **Error Handling**:
#' - Missing file: Returns empty DF (logs NOTE)
#' - Missing columns: Returns empty DF (logs WARNING)
#' - Read errors: Returns empty DF (logs WARNING)
#' - Always returns valid data frame structure (won't return NULL)
#'
#' **Usage**:
#' ```r
#' harvesting_data <- load_harvest_data(setup$data_dir)
#' # harvesting_data is guaranteed to be a data.frame with 3 columns
#' # even if harvest.xlsx is unavailable or invalid
#' ```
load_harvest_data <- function(data_dir) {
harvesting_data <- NULL
harvest_file <- file.path(data_dir, "harvest.xlsx")
if (file.exists(harvest_file)) {
tryCatch({
harvesting_data <- readxl::read_excel(harvest_file)
# Ensure required columns are present
required_cols <- c("field", "year", "tonnage_ha")
if (all(required_cols %in% names(harvesting_data))) {
# Convert to data frame and ensure column types
harvesting_data <- as.data.frame(harvesting_data)
harvesting_data$year <- as.numeric(harvesting_data$year)
harvesting_data$tonnage_ha <- as.numeric(harvesting_data$tonnage_ha)
message(paste(" ✓ Loaded harvest data:", nrow(harvesting_data), "records from harvest.xlsx"))
return(harvesting_data)
} else {
message(paste(" WARNING: harvest.xlsx missing required columns. Expected: field, year, tonnage_ha"))
harvesting_data <- NULL
}
}, error = function(e) {
message(paste(" WARNING: Could not read harvest.xlsx:", e$message))
})
} else {
message(paste(" NOTE: harvest.xlsx not found at", harvest_file))
}
# Fallback: create empty data frame if loading failed
if (is.null(harvesting_data)) {
message(" WARNING: No harvest data available. TCH yield prediction will use graceful fallback (NA values)")
harvesting_data <- data.frame(field = character(), year = numeric(), tonnage_ha = numeric())
}
return(harvesting_data)
}
# ============================================================================
# FIELD STATISTICS EXTRACTION
# ============================================================================
@ -1386,3 +1466,257 @@ prepare_predictions <- function(predictions, newdata) {
dplyr::left_join(., newdata, by = c("field", "sub_field", "season"))
)
}
# ============================================================================
# YIELD PREDICTION KPI (SHARED ML-BASED MODEL FOR ALL CLIENT TYPES)
# ============================================================================
#' Helper function for graceful fallback when training data unavailable
#'
#' @param field_boundaries Field boundaries (sf or SpatVector)
#' @return List with summary and field_results (both with NA values)
create_fallback_result <- function(field_boundaries) {
# Convert to SpatVector if needed (for terra::project)
if (!inherits(field_boundaries, "SpatVector")) {
field_boundaries <- terra::vect(field_boundaries)
}
field_boundaries_projected <- terra::project(field_boundaries, "EPSG:6933") # Equal Earth projection
field_areas <- terra::expanse(field_boundaries_projected) / 10000 # Convert m² to hectares
total_area <- sum(field_areas)
summary_result <- data.frame(
field_groups = c("Top 25%", "Average", "Lowest 25%", "Total area forecasted"),
count = c(0, 0, 0, nrow(field_boundaries)),
value = c(NA_real_, NA_real_, NA_real_, round(total_area, 1)),
stringsAsFactors = FALSE
)
field_results <- data.frame(
field = character(0),
sub_field = character(0),
Age_days = numeric(0),
yield_forecast_t_ha = numeric(0),
season = numeric(0),
stringsAsFactors = FALSE
)
return(list(summary = summary_result, field_results = field_results))
}
#' Calculate yield prediction KPI using Random Forest with Feature Selection
#'
#' Trains a Random Forest model on historical harvest data with cumulative CI,
#' days of year (DOY), and CI-per-day as predictors. Uses CAST::ffs() for
#' Forward Feature Selection. Predicts yields for mature fields (DOY >= 240).
#'
#' @param field_boundaries Field boundaries (sf or SpatVector)
#' @param harvesting_data Data frame with harvest data including tonnage_ha column
#' @param cumulative_CI_vals_dir Directory containing "All_pivots_Cumulative_CI_quadrant_year_v2.rds"
#'
#' @return List with:
#' - summary: Data frame with field_groups, count, value (quartiles and total area)
#' - field_results: Data frame with field-level yield forecasts (yield_forecast_t_ha)
#'
#' @details
#' **Training Data Requirements:**
#' - cumulative_CI_vals_dir must contain "All_pivots_Cumulative_CI_quadrant_year_v2.rds"
#' - harvesting_data must have tonnage_ha column with numeric yield values
#' - Training stops gracefully if either is missing (returns NA values, not fake numbers)
#'
#' **Model Specifications:**
#' - Algorithm: Random Forest (caret + CAST)
#' - Feature Selection: Forward Feature Selection (CAST::ffs)
#' - Cross-validation: 5-fold CV
#' - Predictors: cumulative_CI, DOY, CI_per_day
#' - Mature field threshold: DOY >= 240 (8 months)
#' - Output: Field-level yield forecasts grouped by quartile
#'
#' **Error Handling:**
#' - Missing tonnage_ha: Returns graceful fallback with NA (not zero) values
#' - No training data: Logs WARNING, returns empty field_results
#' - RDS file missing: Returns graceful fallback
#' - Prediction errors: Wrapped in tryCatch, returns fallback on failure
calculate_yield_prediction_kpi <- function(field_boundaries, harvesting_data, cumulative_CI_vals_dir) {
safe_log("Calculating yield prediction KPI using Random Forest with Feature Selection")
tryCatch({
# Check if tonnage_ha column is present and has valid data
if (is.null(harvesting_data) ||
!("tonnage_ha" %in% names(harvesting_data)) ||
all(is.na(harvesting_data$tonnage_ha))) {
safe_log("No harvest data available: lacking tonnage_ha column or all values are NA", "WARNING")
return(create_fallback_result(field_boundaries))
}
# Check if CI quadrant RDS file exists
ci_quadrant_path <- file.path(cumulative_CI_vals_dir, "All_pivots_Cumulative_CI_quadrant_year_v2.rds")
if (!file.exists(ci_quadrant_path)) {
safe_log(paste("CI quadrant file not found at:", ci_quadrant_path), "WARNING")
return(create_fallback_result(field_boundaries))
}
# Load CI quadrant data and fill missing field/sub_field values
CI_quadrant <- readRDS(ci_quadrant_path) %>%
dplyr::group_by(model) %>%
tidyr::fill(field, sub_field, .direction = "downup") %>%
dplyr::ungroup()
# Rename year column to season for consistency
harvesting_data_renamed <- harvesting_data %>% dplyr::rename(season = year)
# Join CI and yield data
CI_and_yield <- dplyr::left_join(CI_quadrant, harvesting_data_renamed,
by = c("field", "sub_field", "season")) %>%
dplyr::group_by(sub_field, season) %>%
dplyr::slice(which.max(DOY)) %>%
dplyr::select(field, sub_field, tonnage_ha, cumulative_CI, DOY, season, sub_area) %>%
dplyr::mutate(CI_per_day = cumulative_CI / DOY)
# Define predictors and response variables
predictors <- c("cumulative_CI", "DOY", "CI_per_day")
response <- "tonnage_ha"
# Prepare training dataset (fields with harvest data)
CI_and_yield_test <- CI_and_yield %>%
as.data.frame() %>%
dplyr::filter(!is.na(tonnage_ha))
# Check if we have minimum training data
if (nrow(CI_and_yield_test) == 0) {
safe_log("No training data available: no fields with tonnage_ha observations", "WARNING")
return(create_fallback_result(field_boundaries))
}
# Pre-clean training data: remove rows with any NAs in predictors or response
# This is required because CAST::ffs doesn't support na.rm parameter
CI_and_yield_test <- CI_and_yield_test %>%
dplyr::filter(!dplyr::if_any(dplyr::all_of(c(predictors, response)), is.na))
if (nrow(CI_and_yield_test) == 0) {
safe_log("No complete training data after removing NAs in predictors/response", "WARNING")
return(create_fallback_result(field_boundaries))
}
# Prepare prediction dataset (fields without harvest data, mature fields only)
prediction_yields <- CI_and_yield %>%
as.data.frame() %>%
dplyr::filter(is.na(tonnage_ha) & DOY >= 240) # Mature fields only
# Configure model training parameters
ctrl <- caret::trainControl(
method = "cv",
savePredictions = TRUE,
allowParallel = TRUE,
number = 5,
verboseIter = FALSE
)
# Train the model with forward feature selection
set.seed(202) # For reproducibility
safe_log("Training Random Forest model with Forward Feature Selection...")
model_ffs_rf <- CAST::ffs(
CI_and_yield_test[, predictors],
CI_and_yield_test[, response],
method = "rf",
trControl = ctrl,
importance = TRUE,
withinSE = TRUE,
tuneLength = 5
)
# Predict yields on validation data (same as training data for RMSE calculation)
pred_ffs_rf <- prepare_predictions(
stats::predict(model_ffs_rf, newdata = CI_and_yield_test),
CI_and_yield_test
)
# Extract cross-validated RMSE from the model object (more honest than in-sample RMSE)
# The CAST::ffs model stores CV results in $results data frame
# We extract the RMSE from the best model (lowest RMSE across folds)
if (!is.null(model_ffs_rf$results) && "RMSE" %in% names(model_ffs_rf$results)) {
# Get minimum RMSE from cross-validation results (best model from feature selection)
rmse_value <- min(model_ffs_rf$results$RMSE, na.rm = TRUE)
safe_log(paste("Yield prediction RMSE (cross-validated):", round(rmse_value, 2), "t/ha"))
} else {
# Fallback: compute in-sample RMSE if CV results unavailable, but label it clearly
rmse_value <- sqrt(mean((pred_ffs_rf$predicted_Tcha - CI_and_yield_test$tonnage_ha)^2, na.rm = TRUE))
safe_log(paste("Yield prediction RMSE (in-sample/training):", round(rmse_value, 2), "t/ha"))
}
# Predict yields for current season (mature fields >= 240 days)
if (nrow(prediction_yields) > 0) {
pred_rf_current_season <- prepare_predictions(
stats::predict(model_ffs_rf, newdata = prediction_yields),
prediction_yields
) %>%
dplyr::filter(Age_days >= 240) %>%
dplyr::select(c("field", "Age_days", "predicted_Tcha", "season"))
} else {
pred_rf_current_season <- data.frame()
}
# Calculate summary statistics for KPI
if (nrow(pred_rf_current_season) > 0) {
safe_log(paste("Number of fields with yield predictions:", nrow(pred_rf_current_season)))
safe_log(paste("Predicted yield range:",
round(min(pred_rf_current_season$predicted_Tcha, na.rm = TRUE), 1),
"-",
round(max(pred_rf_current_season$predicted_Tcha, na.rm = TRUE), 1),
"t/ha"))
# Calculate quartiles for grouping
yield_quartiles <- quantile(pred_rf_current_season$predicted_Tcha,
probs = c(0.25, 0.5, 0.75), na.rm = TRUE)
# Count fields in each quartile
top_25_count <- sum(pred_rf_current_season$predicted_Tcha >= yield_quartiles[3], na.rm = TRUE)
average_count <- sum(pred_rf_current_season$predicted_Tcha >= yield_quartiles[1] &
pred_rf_current_season$predicted_Tcha < yield_quartiles[3], na.rm = TRUE)
lowest_25_count <- sum(pred_rf_current_season$predicted_Tcha < yield_quartiles[1], na.rm = TRUE)
# Calculate total area
if (!inherits(field_boundaries, "SpatVector")) {
field_boundaries_vect <- terra::vect(field_boundaries)
} else {
field_boundaries_vect <- field_boundaries
}
# Handle both sf and SpatVector inputs for area calculation
if (inherits(field_boundaries, "sf")) {
field_boundaries_projected <- sf::st_transform(field_boundaries, "EPSG:6933")
field_areas <- sf::st_area(field_boundaries_projected) / 10000 # m² to hectares
} else {
field_boundaries_projected <- terra::project(field_boundaries_vect, "EPSG:6933")
field_areas <- terra::expanse(field_boundaries_projected) / 10000
}
total_area <- sum(as.numeric(field_areas))
safe_log(paste("Total area:", round(total_area, 1), "hectares"))
# Build summary result
result <- data.frame(
field_groups = c("Top 25%", "Average", "Lowest 25%", "Total area forecasted"),
count = c(top_25_count, average_count, lowest_25_count, nrow(field_boundaries)),
value = c(round(yield_quartiles[3], 1), round(yield_quartiles[2], 1),
round(yield_quartiles[1], 1), round(total_area, 1)),
stringsAsFactors = FALSE
)
# Prepare field-level results
field_level_results <- pred_rf_current_season %>%
dplyr::select(field, Age_days, predicted_Tcha, season) %>%
dplyr::rename(yield_forecast_t_ha = predicted_Tcha)
safe_log("✓ Yield prediction complete")
return(list(summary = result, field_results = field_level_results))
} else {
safe_log("No fields meet maturity threshold (DOY >= 240) for prediction", "WARNING")
return(list(summary = create_fallback_result(field_boundaries)$summary,
field_results = data.frame()))
}
}, error = function(e) {
safe_log(paste("Error in yield prediction:", e$message), "ERROR")
return(create_fallback_result(field_boundaries))
})
}