Enhance yield prediction capabilities by integrating Random Forest model with Forward Feature Selection; add robust data loading and validation for harvesting data across multiple utility scripts.
This commit is contained in:
parent
640e239815
commit
34159b3003
|
|
@ -141,6 +141,10 @@ suppressPackageStartupMessages({
|
|||
library(writexl) # For writing Excel outputs (KPI summary tables)
|
||||
library(progress) # For progress bars during field processing
|
||||
|
||||
# ML models (for yield prediction KPI)
|
||||
library(caret) # For training Random Forest with cross-validation
|
||||
library(CAST) # For Forward Feature Selection in caret models
|
||||
|
||||
# ML/Analysis (optional - only for harvest model inference)
|
||||
tryCatch({
|
||||
library(torch) # For PyTorch model inference (harvest readiness prediction)
|
||||
|
|
@ -359,11 +363,8 @@ main <- function() {
|
|||
stop("ERROR loading field boundaries: ", e$message)
|
||||
})
|
||||
|
||||
# Load harvesting data
|
||||
if (!exists("harvesting_data")) {
|
||||
warning("harvesting_data not loaded. TCH KPI will use placeholder values.")
|
||||
harvesting_data <- data.frame(field = character(), year = numeric(), tonnage_ha = numeric())
|
||||
}
|
||||
# Load harvesting data for yield prediction (using common helper function)
|
||||
harvesting_data <- load_harvest_data(setup$data_dir)
|
||||
|
||||
# Extract current week/year from end_date
|
||||
current_week <- as.numeric(format(end_date, "%V"))
|
||||
|
|
@ -379,6 +380,7 @@ main <- function() {
|
|||
ci_rds_path = NULL,
|
||||
harvesting_data = harvesting_data,
|
||||
output_dir = reports_dir_kpi,
|
||||
data_dir = setup$data_dir,
|
||||
project_dir = project_dir
|
||||
)
|
||||
|
||||
|
|
@ -419,11 +421,8 @@ main <- function() {
|
|||
stop("ERROR loading field boundaries: ", e$message)
|
||||
})
|
||||
|
||||
# Load harvesting data
|
||||
if (!exists("harvesting_data")) {
|
||||
warning("harvesting_data not loaded. TCH KPI will use placeholder values.")
|
||||
harvesting_data <- data.frame(field = character(), year = numeric(), tonnage_ha = numeric())
|
||||
}
|
||||
# Load harvesting data for yield prediction (using common helper function)
|
||||
harvesting_data <- load_harvest_data(setup$data_dir)
|
||||
|
||||
# Extract current week/year from end_date
|
||||
current_week <- as.numeric(format(end_date, "%V"))
|
||||
|
|
|
|||
|
|
@ -43,29 +43,6 @@ library(CAST)
|
|||
# These are now sourced from common utils and shared by all client types.
|
||||
# ============================================================================
|
||||
|
||||
#' Prepare harvest predictions and ensure proper alignment with field data
|
||||
prepare_predictions <- function(harvest_model, field_data, scenario = "optimistic") {
|
||||
if (is.null(harvest_model) || is.null(field_data)) {
|
||||
return(NULL)
|
||||
}
|
||||
|
||||
tryCatch({
|
||||
scenario_factor <- switch(scenario,
|
||||
"pessimistic" = 0.85,
|
||||
"realistic" = 1.0,
|
||||
"optimistic" = 1.15,
|
||||
1.0)
|
||||
|
||||
predictions <- field_data %>%
|
||||
mutate(tch_forecasted = field_data$mean_ci * scenario_factor)
|
||||
|
||||
return(predictions)
|
||||
}, error = function(e) {
|
||||
message(paste("Error preparing predictions:", e$message))
|
||||
return(NULL)
|
||||
})
|
||||
}
|
||||
|
||||
# ============================================================================
|
||||
# AURA KPI CALCULATION FUNCTIONS (6 KPIS)
|
||||
# ============================================================================
|
||||
|
|
@ -266,74 +243,91 @@ calculate_area_change_kpi <- function(current_stats, previous_stats) {
|
|||
|
||||
#' KPI 3: Calculate TCH forecasted (tonnes of cane per hectare)
|
||||
#'
|
||||
#' Projects final harvest tonnage based on CI growth trajectory
|
||||
#' Projects final harvest tonnage based on historical yield data and CI growth trajectory.
|
||||
#' Uses a Random Forest model trained on harvest data to predict yields for mature fields.
|
||||
#' Delegates to calculate_yield_prediction_kpi() in 80_utils_common.R.
|
||||
#'
|
||||
#' @param field_statistics Current field statistics
|
||||
#' @param harvesting_data Historical harvest data (with yield observations)
|
||||
#' @param field_boundaries_sf Field geometries
|
||||
#' @param field_statistics Current field statistics (dataframe with Mean_CI or mean_ci column)
|
||||
#' @param harvesting_data Historical harvest data frame (with tonnage_ha column)
|
||||
#' @param field_boundaries_sf SF object with field geometries
|
||||
#' @param cumulative_CI_vals_dir Directory with combined CI RDS files (optional)
|
||||
#' @param data_dir Project data directory (from setup_project_directories or parameters_project.R)
|
||||
#' Used to build cumulative_CI_vals_dir path if not provided directly (optional)
|
||||
#' @param project_dir Deprecated: only used if data_dir not provided (optional)
|
||||
#'
|
||||
#' @return Data frame with field-level TCH forecasts
|
||||
calculate_tch_forecasted_kpi <- function(field_statistics, harvesting_data = NULL, field_boundaries_sf = NULL) {
|
||||
#' @return Data frame with field-level yield forecasts ready for orchestrator
|
||||
#' Columns: field_idx, tch_forecasted (yields in t/ha)
|
||||
calculate_tch_forecasted_kpi <- function(field_statistics, harvesting_data = NULL,
|
||||
field_boundaries_sf = NULL,
|
||||
cumulative_CI_vals_dir = NULL,
|
||||
data_dir = NULL,
|
||||
project_dir = NULL) {
|
||||
|
||||
# Handle both naming conventions (Field_id/Mean_CI vs field_idx/mean_ci)
|
||||
if ("Field_id" %in% names(field_statistics)) {
|
||||
# Add field_idx to match field_boundaries row numbers
|
||||
field_statistics <- field_statistics %>%
|
||||
mutate(field_idx = match(Field_id, field_boundaries_sf$field))
|
||||
mean_ci_col <- "Mean_CI"
|
||||
} else {
|
||||
mean_ci_col <- "mean_ci"
|
||||
}
|
||||
# Use common utils yield prediction function (handles all ML logic)
|
||||
# This replaces the previous linear model (TCH = 50 + CI*10) with proper ML prediction
|
||||
|
||||
# Filter out any fields without a match
|
||||
field_statistics <- field_statistics %>%
|
||||
filter(!is.na(field_idx))
|
||||
|
||||
if (nrow(field_statistics) == 0) {
|
||||
warning("No fields matched between statistics and boundaries")
|
||||
# Validate required parameters
|
||||
if (is.null(field_boundaries_sf)) {
|
||||
safe_log("field_boundaries_sf is NULL in calculate_tch_forecasted_kpi", "WARNING")
|
||||
return(data.frame(
|
||||
field_idx = integer(),
|
||||
mean_ci = numeric(),
|
||||
tch_forecasted = numeric(),
|
||||
tch_lower_bound = numeric(),
|
||||
tch_upper_bound = numeric(),
|
||||
confidence = character(),
|
||||
stringsAsFactors = FALSE
|
||||
))
|
||||
}
|
||||
|
||||
result <- data.frame(
|
||||
field_idx = field_statistics$field_idx,
|
||||
mean_ci = field_statistics[[mean_ci_col]],
|
||||
tch_forecasted = NA_real_,
|
||||
tch_lower_bound = NA_real_,
|
||||
tch_upper_bound = NA_real_,
|
||||
confidence = NA_character_,
|
||||
stringsAsFactors = FALSE
|
||||
)
|
||||
|
||||
# Base TCH model: TCH = 50 + (CI * 10)
|
||||
# This is a simplified model; production use should include more variables
|
||||
|
||||
for (i in seq_len(nrow(result))) {
|
||||
if (is.na(result$mean_ci[i])) {
|
||||
result$confidence[i] <- "No data"
|
||||
next
|
||||
# Determine cumulative CI directory
|
||||
if (is.null(cumulative_CI_vals_dir)) {
|
||||
# Priority 1: Use provided data_dir parameter
|
||||
if (!is.null(data_dir)) {
|
||||
cumulative_CI_vals_dir <- file.path(data_dir, "extracted_ci", "cumulative_vals")
|
||||
} else if (exists("data_dir", envir = .GlobalEnv)) {
|
||||
# Priority 2: Fallback to global data_dir from parameters_project.R
|
||||
cumulative_CI_vals_dir <- file.path(get("data_dir", envir = .GlobalEnv), "extracted_ci", "cumulative_vals")
|
||||
} else {
|
||||
# Priority 3: Last resort - log warning and fail gracefully
|
||||
safe_log("Missing project data directory configuration: provide data_dir parameter or ensure parameters_project.R has set data_dir globally", "WARNING")
|
||||
safe_log("No training data available for yield prediction", "WARNING")
|
||||
return(data.frame(
|
||||
field_idx = integer(),
|
||||
tch_forecasted = numeric(),
|
||||
stringsAsFactors = FALSE
|
||||
))
|
||||
}
|
||||
|
||||
ci_val <- result$mean_ci[i]
|
||||
|
||||
# Simple linear model
|
||||
tch_est <- 50 + (ci_val * 10)
|
||||
|
||||
# Confidence interval based on CI range
|
||||
tch_lower <- tch_est * 0.85
|
||||
tch_upper <- tch_est * 1.15
|
||||
|
||||
result$tch_forecasted[i] <- round(tch_est, 2)
|
||||
result$tch_lower_bound[i] <- round(tch_lower, 2)
|
||||
result$tch_upper_bound[i] <- round(tch_upper, 2)
|
||||
result$confidence[i] <- "Medium"
|
||||
}
|
||||
|
||||
# Call the shared yield prediction function from common utils
|
||||
yield_result <- calculate_yield_prediction_kpi(field_boundaries_sf, harvesting_data, cumulative_CI_vals_dir)
|
||||
|
||||
# Extract field-level results from the list
|
||||
field_results <- yield_result$field_results
|
||||
|
||||
# Convert to format expected by orchestrator
|
||||
# If no predictions, return empty data frame
|
||||
if (is.null(field_results) || nrow(field_results) == 0) {
|
||||
return(data.frame(
|
||||
field_idx = integer(),
|
||||
tch_forecasted = numeric(),
|
||||
stringsAsFactors = FALSE
|
||||
))
|
||||
}
|
||||
|
||||
# Map field names to field_idx using field_boundaries_sf
|
||||
result <- field_results %>%
|
||||
mutate(
|
||||
field_idx = match(field, field_boundaries_sf$field),
|
||||
tch_forecasted = yield_forecast_t_ha
|
||||
) %>%
|
||||
filter(!is.na(field_idx)) %>%
|
||||
select(field_idx, tch_forecasted)
|
||||
|
||||
# Ensure result has proper structure even if empty
|
||||
if (nrow(result) == 0) {
|
||||
return(data.frame(
|
||||
field_idx = integer(),
|
||||
tch_forecasted = numeric(),
|
||||
stringsAsFactors = FALSE
|
||||
))
|
||||
}
|
||||
|
||||
return(result)
|
||||
|
|
@ -471,7 +465,7 @@ create_summary_tables <- function(all_kpis) {
|
|||
select(field_idx, mean_ci_pct_change, interpretation),
|
||||
|
||||
tch_forecast = all_kpis$tch_forecasted %>%
|
||||
select(field_idx, mean_ci, tch_forecasted, tch_lower_bound, tch_upper_bound, confidence),
|
||||
select(field_idx, tch_forecasted),
|
||||
|
||||
growth_decline = all_kpis$growth_decline %>%
|
||||
select(field_idx, four_week_trend, trend_interpretation, decline_severity),
|
||||
|
|
@ -479,9 +473,9 @@ create_summary_tables <- function(all_kpis) {
|
|||
weed_pressure = all_kpis$weed_presence %>%
|
||||
select(field_idx, fragmentation_index, weed_pressure_risk),
|
||||
|
||||
gap_filling = if (!is.null(all_kpis$gap_filling)) {
|
||||
gap_filling = if (!is.null(all_kpis$gap_filling) && nrow(all_kpis$gap_filling) > 0) {
|
||||
all_kpis$gap_filling %>%
|
||||
select(field_idx, gap_score, gap_level, mean_ci)
|
||||
select(field_idx, gap_score, gap_level)
|
||||
} else {
|
||||
NULL
|
||||
}
|
||||
|
|
@ -527,9 +521,7 @@ create_field_detail_table <- function(field_boundaries_sf, all_kpis, current_wee
|
|||
) %>%
|
||||
left_join(
|
||||
all_kpis$tch_forecasted %>%
|
||||
select(field_idx, Mean_CI = mean_ci, TCH_Forecasted = tch_forecasted,
|
||||
TCH_Lower = tch_lower_bound, TCH_Upper = tch_upper_bound,
|
||||
TCH_Confidence = confidence),
|
||||
select(field_idx, TCH_Forecasted = tch_forecasted),
|
||||
by = "field_idx"
|
||||
) %>%
|
||||
left_join(
|
||||
|
|
@ -647,6 +639,7 @@ calculate_all_field_analysis_agronomic_support <- function(
|
|||
ci_rds_path = NULL,
|
||||
harvesting_data = NULL,
|
||||
output_dir = NULL,
|
||||
data_dir = NULL,
|
||||
project_dir = NULL
|
||||
) {
|
||||
|
||||
|
|
@ -794,7 +787,8 @@ calculate_all_field_analysis_agronomic_support <- function(
|
|||
}
|
||||
|
||||
message("Calculating KPI 3: TCH Forecasted...")
|
||||
tch_kpi <- calculate_tch_forecasted_kpi(current_stats, harvesting_data, field_boundaries_sf)
|
||||
tch_kpi <- calculate_tch_forecasted_kpi(current_stats, harvesting_data, field_boundaries_sf,
|
||||
data_dir = data_dir, project_dir = project_dir)
|
||||
|
||||
message("Calculating KPI 4: Growth Decline...")
|
||||
growth_decline_kpi <- calculate_growth_decline_kpi(
|
||||
|
|
|
|||
|
|
@ -166,6 +166,59 @@ calculate_status_alert <- function(imminent_prob, age_week, weekly_ci_change, me
|
|||
NA_character_
|
||||
}
|
||||
|
||||
#' Calculate yield prediction for CANE_SUPPLY workflows (Wrapper)
|
||||
#'
|
||||
#' This function wraps the shared yield prediction model from 80_utils_common.R
|
||||
#' to provide CANE_SUPPLY clients (e.g., ANGATA) with ML-based yield forecasting.
|
||||
#'
|
||||
#' Uses Random Forest with Forward Feature Selection trained on:
|
||||
#' - Cumulative Canopy Index (CI) from growth model
|
||||
#' - Days of Year (DOY) / crop age
|
||||
#' - CI-per-day (growth velocity)
|
||||
#'
|
||||
#' Predicts yields for mature fields (DOY >= 240, ~8 months) into quartiles:
|
||||
#' - Top 25%: High-yield fields
|
||||
#' - Average: Mid-range yield fields
|
||||
#' - Lowest 25%: Lower-yield fields
|
||||
#'
|
||||
#' @param field_boundaries_sf SF object with field geometries
|
||||
#' @param harvesting_data Data frame with harvest history (must have tonnage_ha column)
|
||||
#' @param cumulative_CI_vals_dir Directory with combined CI RDS files
|
||||
#'
|
||||
#' @return List with:
|
||||
#' - summary: Data frame with field_groups, count, and yield quartile predictions
|
||||
#' - field_results: Data frame with field-level forecasts (yield_forecast_t_ha in t/ha)
|
||||
#'
|
||||
#' @details
|
||||
#' **Data Requirements:**
|
||||
#' - harvesting_data must include tonnage_ha column (yield in t/ha) for training
|
||||
#' - cumulative_CI_vals_dir must contain "All_pivots_Cumulative_CI_quadrant_year_v2.rds"
|
||||
#' - If either is missing, returns graceful fallback with NA values (not fake numbers)
|
||||
#'
|
||||
#' **Integration:**
|
||||
#' This can be called from calculate_all_field_kpis() in cane_supply workflow to add
|
||||
#' a new "Yield_Forecast" column to the 22-column KPI output.
|
||||
#'
|
||||
#' **Example:**
|
||||
#' ```r
|
||||
#' yield_result <- calculate_yield_prediction_kpi_cane_supply(
|
||||
#' field_boundaries_sf,
|
||||
#' harvesting_data,
|
||||
#' file.path(data_dir, "combined_CI")
|
||||
#' )
|
||||
#' # yield_result$summary has quartiles
|
||||
#' # yield_result$field_results has per-field forecasts
|
||||
#' ```
|
||||
calculate_yield_prediction_kpi_cane_supply <- function(field_boundaries_sf,
|
||||
harvesting_data,
|
||||
cumulative_CI_vals_dir) {
|
||||
|
||||
# Call the shared yield prediction function from 80_utils_common.R
|
||||
result <- calculate_yield_prediction_kpi(field_boundaries_sf, harvesting_data, cumulative_CI_vals_dir)
|
||||
|
||||
return(result)
|
||||
}
|
||||
|
||||
|
||||
#' Build complete per-field KPI dataframe with all 22 columns
|
||||
#' @param current_stats data.frame with current week statistics from load_or_calculate_weekly_stats
|
||||
|
|
|
|||
|
|
@ -8,8 +8,13 @@
|
|||
# - Field statistics extraction
|
||||
# - Week/year calculations for consistent date handling
|
||||
# - Excel/CSV/RDS export utilities
|
||||
# - Yield prediction using ML models (Random Forest with Feature Selection)
|
||||
#
|
||||
# Used by: 80_calculate_kpis.R, all client-specific utils files
|
||||
#
|
||||
# NOTE: Libraries required by yield prediction (caret, CAST, here) are loaded
|
||||
# in the main script 80_calculate_kpis.R, not here. This keeps dependencies
|
||||
# centralized in the orchestrator script.
|
||||
# ============================================================================
|
||||
|
||||
# ============================================================================
|
||||
|
|
@ -590,6 +595,81 @@ extract_planting_dates <- function(harvesting_data, field_boundaries_sf = NULL)
|
|||
})
|
||||
}
|
||||
|
||||
# ============================================================================
|
||||
# DATA LOADING HELPERS
|
||||
# ============================================================================
|
||||
|
||||
#' Load and validate harvest data from harvest.xlsx
|
||||
#'
|
||||
#' Encapsulates harvest data loading with validation, type coercion, and error handling.
|
||||
#' Returns a data frame with required columns (field, year, tonnage_ha) or an empty
|
||||
#' data frame with proper structure if loading fails.
|
||||
#'
|
||||
#' @param data_dir Path to data directory containing harvest.xlsx
|
||||
#'
|
||||
#' @return data.frame with columns: field (character), year (numeric), tonnage_ha (numeric)
|
||||
#' - On success: data frame with N rows of harvest records
|
||||
#' - On failure: empty data frame with correct structure (0 rows, 3 columns)
|
||||
#'
|
||||
#' @details
|
||||
#' **File Location**: Expected at `file.path(data_dir, "harvest.xlsx")`
|
||||
#'
|
||||
#' **Validation**:
|
||||
#' - Checks file existence before reading
|
||||
#' - Validates required columns: field, year, tonnage_ha
|
||||
#' - Coerces year and tonnage_ha to numeric
|
||||
#' - Logs status messages for debugging
|
||||
#'
|
||||
#' **Error Handling**:
|
||||
#' - Missing file: Returns empty DF (logs NOTE)
|
||||
#' - Missing columns: Returns empty DF (logs WARNING)
|
||||
#' - Read errors: Returns empty DF (logs WARNING)
|
||||
#' - Always returns valid data frame structure (won't return NULL)
|
||||
#'
|
||||
#' **Usage**:
|
||||
#' ```r
|
||||
#' harvesting_data <- load_harvest_data(setup$data_dir)
|
||||
#' # harvesting_data is guaranteed to be a data.frame with 3 columns
|
||||
#' # even if harvest.xlsx is unavailable or invalid
|
||||
#' ```
|
||||
load_harvest_data <- function(data_dir) {
|
||||
harvesting_data <- NULL
|
||||
harvest_file <- file.path(data_dir, "harvest.xlsx")
|
||||
|
||||
if (file.exists(harvest_file)) {
|
||||
tryCatch({
|
||||
harvesting_data <- readxl::read_excel(harvest_file)
|
||||
|
||||
# Ensure required columns are present
|
||||
required_cols <- c("field", "year", "tonnage_ha")
|
||||
if (all(required_cols %in% names(harvesting_data))) {
|
||||
# Convert to data frame and ensure column types
|
||||
harvesting_data <- as.data.frame(harvesting_data)
|
||||
harvesting_data$year <- as.numeric(harvesting_data$year)
|
||||
harvesting_data$tonnage_ha <- as.numeric(harvesting_data$tonnage_ha)
|
||||
|
||||
message(paste(" ✓ Loaded harvest data:", nrow(harvesting_data), "records from harvest.xlsx"))
|
||||
return(harvesting_data)
|
||||
} else {
|
||||
message(paste(" WARNING: harvest.xlsx missing required columns. Expected: field, year, tonnage_ha"))
|
||||
harvesting_data <- NULL
|
||||
}
|
||||
}, error = function(e) {
|
||||
message(paste(" WARNING: Could not read harvest.xlsx:", e$message))
|
||||
})
|
||||
} else {
|
||||
message(paste(" NOTE: harvest.xlsx not found at", harvest_file))
|
||||
}
|
||||
|
||||
# Fallback: create empty data frame if loading failed
|
||||
if (is.null(harvesting_data)) {
|
||||
message(" WARNING: No harvest data available. TCH yield prediction will use graceful fallback (NA values)")
|
||||
harvesting_data <- data.frame(field = character(), year = numeric(), tonnage_ha = numeric())
|
||||
}
|
||||
|
||||
return(harvesting_data)
|
||||
}
|
||||
|
||||
# ============================================================================
|
||||
# FIELD STATISTICS EXTRACTION
|
||||
# ============================================================================
|
||||
|
|
@ -1386,3 +1466,257 @@ prepare_predictions <- function(predictions, newdata) {
|
|||
dplyr::left_join(., newdata, by = c("field", "sub_field", "season"))
|
||||
)
|
||||
}
|
||||
|
||||
# ============================================================================
|
||||
# YIELD PREDICTION KPI (SHARED ML-BASED MODEL FOR ALL CLIENT TYPES)
|
||||
# ============================================================================
|
||||
|
||||
#' Helper function for graceful fallback when training data unavailable
|
||||
#'
|
||||
#' @param field_boundaries Field boundaries (sf or SpatVector)
|
||||
#' @return List with summary and field_results (both with NA values)
|
||||
create_fallback_result <- function(field_boundaries) {
|
||||
# Convert to SpatVector if needed (for terra::project)
|
||||
if (!inherits(field_boundaries, "SpatVector")) {
|
||||
field_boundaries <- terra::vect(field_boundaries)
|
||||
}
|
||||
field_boundaries_projected <- terra::project(field_boundaries, "EPSG:6933") # Equal Earth projection
|
||||
field_areas <- terra::expanse(field_boundaries_projected) / 10000 # Convert m² to hectares
|
||||
total_area <- sum(field_areas)
|
||||
|
||||
summary_result <- data.frame(
|
||||
field_groups = c("Top 25%", "Average", "Lowest 25%", "Total area forecasted"),
|
||||
count = c(0, 0, 0, nrow(field_boundaries)),
|
||||
value = c(NA_real_, NA_real_, NA_real_, round(total_area, 1)),
|
||||
stringsAsFactors = FALSE
|
||||
)
|
||||
|
||||
field_results <- data.frame(
|
||||
field = character(0),
|
||||
sub_field = character(0),
|
||||
Age_days = numeric(0),
|
||||
yield_forecast_t_ha = numeric(0),
|
||||
season = numeric(0),
|
||||
stringsAsFactors = FALSE
|
||||
)
|
||||
|
||||
return(list(summary = summary_result, field_results = field_results))
|
||||
}
|
||||
|
||||
#' Calculate yield prediction KPI using Random Forest with Feature Selection
|
||||
#'
|
||||
#' Trains a Random Forest model on historical harvest data with cumulative CI,
|
||||
#' days of year (DOY), and CI-per-day as predictors. Uses CAST::ffs() for
|
||||
#' Forward Feature Selection. Predicts yields for mature fields (DOY >= 240).
|
||||
#'
|
||||
#' @param field_boundaries Field boundaries (sf or SpatVector)
|
||||
#' @param harvesting_data Data frame with harvest data including tonnage_ha column
|
||||
#' @param cumulative_CI_vals_dir Directory containing "All_pivots_Cumulative_CI_quadrant_year_v2.rds"
|
||||
#'
|
||||
#' @return List with:
|
||||
#' - summary: Data frame with field_groups, count, value (quartiles and total area)
|
||||
#' - field_results: Data frame with field-level yield forecasts (yield_forecast_t_ha)
|
||||
#'
|
||||
#' @details
|
||||
#' **Training Data Requirements:**
|
||||
#' - cumulative_CI_vals_dir must contain "All_pivots_Cumulative_CI_quadrant_year_v2.rds"
|
||||
#' - harvesting_data must have tonnage_ha column with numeric yield values
|
||||
#' - Training stops gracefully if either is missing (returns NA values, not fake numbers)
|
||||
#'
|
||||
#' **Model Specifications:**
|
||||
#' - Algorithm: Random Forest (caret + CAST)
|
||||
#' - Feature Selection: Forward Feature Selection (CAST::ffs)
|
||||
#' - Cross-validation: 5-fold CV
|
||||
#' - Predictors: cumulative_CI, DOY, CI_per_day
|
||||
#' - Mature field threshold: DOY >= 240 (8 months)
|
||||
#' - Output: Field-level yield forecasts grouped by quartile
|
||||
#'
|
||||
#' **Error Handling:**
|
||||
#' - Missing tonnage_ha: Returns graceful fallback with NA (not zero) values
|
||||
#' - No training data: Logs WARNING, returns empty field_results
|
||||
#' - RDS file missing: Returns graceful fallback
|
||||
#' - Prediction errors: Wrapped in tryCatch, returns fallback on failure
|
||||
calculate_yield_prediction_kpi <- function(field_boundaries, harvesting_data, cumulative_CI_vals_dir) {
|
||||
safe_log("Calculating yield prediction KPI using Random Forest with Feature Selection")
|
||||
|
||||
tryCatch({
|
||||
# Check if tonnage_ha column is present and has valid data
|
||||
if (is.null(harvesting_data) ||
|
||||
!("tonnage_ha" %in% names(harvesting_data)) ||
|
||||
all(is.na(harvesting_data$tonnage_ha))) {
|
||||
safe_log("No harvest data available: lacking tonnage_ha column or all values are NA", "WARNING")
|
||||
return(create_fallback_result(field_boundaries))
|
||||
}
|
||||
|
||||
# Check if CI quadrant RDS file exists
|
||||
ci_quadrant_path <- file.path(cumulative_CI_vals_dir, "All_pivots_Cumulative_CI_quadrant_year_v2.rds")
|
||||
if (!file.exists(ci_quadrant_path)) {
|
||||
safe_log(paste("CI quadrant file not found at:", ci_quadrant_path), "WARNING")
|
||||
return(create_fallback_result(field_boundaries))
|
||||
}
|
||||
|
||||
# Load CI quadrant data and fill missing field/sub_field values
|
||||
CI_quadrant <- readRDS(ci_quadrant_path) %>%
|
||||
dplyr::group_by(model) %>%
|
||||
tidyr::fill(field, sub_field, .direction = "downup") %>%
|
||||
dplyr::ungroup()
|
||||
|
||||
# Rename year column to season for consistency
|
||||
harvesting_data_renamed <- harvesting_data %>% dplyr::rename(season = year)
|
||||
|
||||
# Join CI and yield data
|
||||
CI_and_yield <- dplyr::left_join(CI_quadrant, harvesting_data_renamed,
|
||||
by = c("field", "sub_field", "season")) %>%
|
||||
dplyr::group_by(sub_field, season) %>%
|
||||
dplyr::slice(which.max(DOY)) %>%
|
||||
dplyr::select(field, sub_field, tonnage_ha, cumulative_CI, DOY, season, sub_area) %>%
|
||||
dplyr::mutate(CI_per_day = cumulative_CI / DOY)
|
||||
|
||||
# Define predictors and response variables
|
||||
predictors <- c("cumulative_CI", "DOY", "CI_per_day")
|
||||
response <- "tonnage_ha"
|
||||
|
||||
# Prepare training dataset (fields with harvest data)
|
||||
CI_and_yield_test <- CI_and_yield %>%
|
||||
as.data.frame() %>%
|
||||
dplyr::filter(!is.na(tonnage_ha))
|
||||
|
||||
# Check if we have minimum training data
|
||||
if (nrow(CI_and_yield_test) == 0) {
|
||||
safe_log("No training data available: no fields with tonnage_ha observations", "WARNING")
|
||||
return(create_fallback_result(field_boundaries))
|
||||
}
|
||||
|
||||
# Pre-clean training data: remove rows with any NAs in predictors or response
|
||||
# This is required because CAST::ffs doesn't support na.rm parameter
|
||||
CI_and_yield_test <- CI_and_yield_test %>%
|
||||
dplyr::filter(!dplyr::if_any(dplyr::all_of(c(predictors, response)), is.na))
|
||||
|
||||
if (nrow(CI_and_yield_test) == 0) {
|
||||
safe_log("No complete training data after removing NAs in predictors/response", "WARNING")
|
||||
return(create_fallback_result(field_boundaries))
|
||||
}
|
||||
|
||||
# Prepare prediction dataset (fields without harvest data, mature fields only)
|
||||
prediction_yields <- CI_and_yield %>%
|
||||
as.data.frame() %>%
|
||||
dplyr::filter(is.na(tonnage_ha) & DOY >= 240) # Mature fields only
|
||||
|
||||
# Configure model training parameters
|
||||
ctrl <- caret::trainControl(
|
||||
method = "cv",
|
||||
savePredictions = TRUE,
|
||||
allowParallel = TRUE,
|
||||
number = 5,
|
||||
verboseIter = FALSE
|
||||
)
|
||||
|
||||
# Train the model with forward feature selection
|
||||
set.seed(202) # For reproducibility
|
||||
safe_log("Training Random Forest model with Forward Feature Selection...")
|
||||
model_ffs_rf <- CAST::ffs(
|
||||
CI_and_yield_test[, predictors],
|
||||
CI_and_yield_test[, response],
|
||||
method = "rf",
|
||||
trControl = ctrl,
|
||||
importance = TRUE,
|
||||
withinSE = TRUE,
|
||||
tuneLength = 5
|
||||
)
|
||||
|
||||
# Predict yields on validation data (same as training data for RMSE calculation)
|
||||
pred_ffs_rf <- prepare_predictions(
|
||||
stats::predict(model_ffs_rf, newdata = CI_and_yield_test),
|
||||
CI_and_yield_test
|
||||
)
|
||||
|
||||
# Extract cross-validated RMSE from the model object (more honest than in-sample RMSE)
|
||||
# The CAST::ffs model stores CV results in $results data frame
|
||||
# We extract the RMSE from the best model (lowest RMSE across folds)
|
||||
if (!is.null(model_ffs_rf$results) && "RMSE" %in% names(model_ffs_rf$results)) {
|
||||
# Get minimum RMSE from cross-validation results (best model from feature selection)
|
||||
rmse_value <- min(model_ffs_rf$results$RMSE, na.rm = TRUE)
|
||||
safe_log(paste("Yield prediction RMSE (cross-validated):", round(rmse_value, 2), "t/ha"))
|
||||
} else {
|
||||
# Fallback: compute in-sample RMSE if CV results unavailable, but label it clearly
|
||||
rmse_value <- sqrt(mean((pred_ffs_rf$predicted_Tcha - CI_and_yield_test$tonnage_ha)^2, na.rm = TRUE))
|
||||
safe_log(paste("Yield prediction RMSE (in-sample/training):", round(rmse_value, 2), "t/ha"))
|
||||
}
|
||||
|
||||
# Predict yields for current season (mature fields >= 240 days)
|
||||
if (nrow(prediction_yields) > 0) {
|
||||
pred_rf_current_season <- prepare_predictions(
|
||||
stats::predict(model_ffs_rf, newdata = prediction_yields),
|
||||
prediction_yields
|
||||
) %>%
|
||||
dplyr::filter(Age_days >= 240) %>%
|
||||
dplyr::select(c("field", "Age_days", "predicted_Tcha", "season"))
|
||||
} else {
|
||||
pred_rf_current_season <- data.frame()
|
||||
}
|
||||
|
||||
# Calculate summary statistics for KPI
|
||||
if (nrow(pred_rf_current_season) > 0) {
|
||||
safe_log(paste("Number of fields with yield predictions:", nrow(pred_rf_current_season)))
|
||||
safe_log(paste("Predicted yield range:",
|
||||
round(min(pred_rf_current_season$predicted_Tcha, na.rm = TRUE), 1),
|
||||
"-",
|
||||
round(max(pred_rf_current_season$predicted_Tcha, na.rm = TRUE), 1),
|
||||
"t/ha"))
|
||||
|
||||
# Calculate quartiles for grouping
|
||||
yield_quartiles <- quantile(pred_rf_current_season$predicted_Tcha,
|
||||
probs = c(0.25, 0.5, 0.75), na.rm = TRUE)
|
||||
|
||||
# Count fields in each quartile
|
||||
top_25_count <- sum(pred_rf_current_season$predicted_Tcha >= yield_quartiles[3], na.rm = TRUE)
|
||||
average_count <- sum(pred_rf_current_season$predicted_Tcha >= yield_quartiles[1] &
|
||||
pred_rf_current_season$predicted_Tcha < yield_quartiles[3], na.rm = TRUE)
|
||||
lowest_25_count <- sum(pred_rf_current_season$predicted_Tcha < yield_quartiles[1], na.rm = TRUE)
|
||||
|
||||
# Calculate total area
|
||||
if (!inherits(field_boundaries, "SpatVector")) {
|
||||
field_boundaries_vect <- terra::vect(field_boundaries)
|
||||
} else {
|
||||
field_boundaries_vect <- field_boundaries
|
||||
}
|
||||
|
||||
# Handle both sf and SpatVector inputs for area calculation
|
||||
if (inherits(field_boundaries, "sf")) {
|
||||
field_boundaries_projected <- sf::st_transform(field_boundaries, "EPSG:6933")
|
||||
field_areas <- sf::st_area(field_boundaries_projected) / 10000 # m² to hectares
|
||||
} else {
|
||||
field_boundaries_projected <- terra::project(field_boundaries_vect, "EPSG:6933")
|
||||
field_areas <- terra::expanse(field_boundaries_projected) / 10000
|
||||
}
|
||||
total_area <- sum(as.numeric(field_areas))
|
||||
|
||||
safe_log(paste("Total area:", round(total_area, 1), "hectares"))
|
||||
|
||||
# Build summary result
|
||||
result <- data.frame(
|
||||
field_groups = c("Top 25%", "Average", "Lowest 25%", "Total area forecasted"),
|
||||
count = c(top_25_count, average_count, lowest_25_count, nrow(field_boundaries)),
|
||||
value = c(round(yield_quartiles[3], 1), round(yield_quartiles[2], 1),
|
||||
round(yield_quartiles[1], 1), round(total_area, 1)),
|
||||
stringsAsFactors = FALSE
|
||||
)
|
||||
|
||||
# Prepare field-level results
|
||||
field_level_results <- pred_rf_current_season %>%
|
||||
dplyr::select(field, Age_days, predicted_Tcha, season) %>%
|
||||
dplyr::rename(yield_forecast_t_ha = predicted_Tcha)
|
||||
|
||||
safe_log("✓ Yield prediction complete")
|
||||
return(list(summary = result, field_results = field_level_results))
|
||||
} else {
|
||||
safe_log("No fields meet maturity threshold (DOY >= 240) for prediction", "WARNING")
|
||||
return(list(summary = create_fallback_result(field_boundaries)$summary,
|
||||
field_results = data.frame()))
|
||||
}
|
||||
|
||||
}, error = function(e) {
|
||||
safe_log(paste("Error in yield prediction:", e$message), "ERROR")
|
||||
return(create_fallback_result(field_boundaries))
|
||||
})
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in a new issue