# SmartCane/r_app/80_utils_common.R

# 80_UTILS_COMMON.R
# ============================================================================
# SHARED UTILITY FUNCTIONS FOR ALL CLIENT TYPES (SCRIPT 80)
#
# Contains helper and infrastructure functions used by both AURA and ANGATA workflows:
# - Statistical categorization and calculations
# - Tile operations and data loading
# - Field statistics extraction
# - Week/year calculations for consistent date handling
# - Excel/CSV/RDS export utilities
# - Yield prediction using ML models (Random Forest with Feature Selection)
#
# Used by: 80_calculate_kpis.R, all client-specific utils files
#
# NOTE: Libraries required by yield prediction (caret, CAST, here) are loaded
# in the main script 80_calculate_kpis.R, not here. This keeps dependencies
# centralized in the orchestrator script.
# ============================================================================
# ============================================================================
# LOAD PROJECT CONFIGURATION (Guard against re-sourcing)
# ============================================================================
# Ensure parameters_project.R has been sourced to provide global configuration
# (PROJECT, data_dir, field_boundaries_path, etc.). Use a sentinel to avoid double-sourcing.
# Source the project configuration exactly once; the PROJECT global acts as
# the sentinel, so re-sourcing this utilities file is a no-op.
if (!exists("PROJECT", envir = .GlobalEnv)) {
tryCatch({
# Preferred: resolve the path relative to the project root via here().
source(here::here("r_app", "parameters_project.R"))
}, error = function(e) {
# Fallback: try relative path if here() doesn't work
tryCatch({
source("parameters_project.R")
}, error = function(e2) {
# Last resort: warn but continue; the caller is expected to have set the
# required globals (PROJECT, data_dir, ...) itself.
warning(paste("Could not source parameters_project.R:", e2$message,
"- using defaults or expecting caller to set PROJECT/data_dir"))
})
})
}
# ============================================================================
# CONSTANTS (from 80_calculate_kpis.R)
# ============================================================================
# Four-week trend thresholds: bounds on the average weekly change in mean CI
# (consumed by categorize_four_week_trend).
FOUR_WEEK_TREND_STRONG_GROWTH_MIN <- 0.5
FOUR_WEEK_TREND_GROWTH_MIN <- 0.1
FOUR_WEEK_TREND_GROWTH_MAX <- 0.5
FOUR_WEEK_TREND_NO_GROWTH_RANGE <- 0.1
FOUR_WEEK_TREND_DECLINE_MAX <- -0.1
FOUR_WEEK_TREND_DECLINE_MIN <- -0.5
FOUR_WEEK_TREND_STRONG_DECLINE_MAX <- -0.5
# CV Trend thresholds (8-week slope interpretation; consumed by
# categorize_cv_slope).
# NOTE(review): CV_SLOPE_STRONG_IMPROVEMENT_MIN, CV_SLOPE_IMPROVEMENT_MAX,
# CV_SLOPE_PATCHINESS_MIN and CV_SLOPE_SEVERE_MIN are never referenced by
# categorize_cv_slope — confirm whether additional categories were intended.
CV_SLOPE_STRONG_IMPROVEMENT_MIN <- -0.03
CV_SLOPE_IMPROVEMENT_MIN <- -0.02
CV_SLOPE_IMPROVEMENT_MAX <- -0.01
CV_SLOPE_HOMOGENOUS_MIN <- -0.01
CV_SLOPE_HOMOGENOUS_MAX <- 0.01
CV_SLOPE_PATCHINESS_MIN <- 0.01
CV_SLOPE_PATCHINESS_MAX <- 0.02
CV_SLOPE_SEVERE_MIN <- 0.02
# Percentile calculations: bounds of the reported CI percentile range.
CI_PERCENTILE_LOW <- 0.10
CI_PERCENTILE_HIGH <- 0.90
# Phase definitions (used by get_phase_by_age). Note the age ranges overlap at
# the boundaries (weeks 4-6 and week 39); get_phase_by_age resolves overlaps
# by returning the FIRST matching row.
PHASE_DEFINITIONS <- data.frame(
phase = c("Germination", "Tillering", "Grand Growth", "Maturation"),
age_start = c(0, 4, 17, 39),
age_end = c(6, 16, 39, 200),
stringsAsFactors = FALSE
)
# ============================================================================
# WEEK/YEAR CALCULATION HELPERS (Consistent across all scripts)
# ============================================================================
#' Calculate week and year for a given lookback offset
#'
#' Walks backwards through ISO 8601 week numbers, wrapping across year
#' boundaries (e.g. week 01/2026 -> week 52/2025). Unlike a fixed "+52" wrap,
#' this correctly handles 53-week ISO years (e.g. 2020).
#'
#' @param current_week ISO week number (1-53)
#' @param current_year ISO week year (from format(..., "%G"))
#' @param offset_weeks Number of weeks to go back (0 = current week, 1 = previous week, etc.)
#'
#' @return List with: week (ISO week number), year (ISO week year)
#'
#' @details
#' This is the authoritative week/year calculation function.
#' Used by:
#' - load_historical_field_data() - to find RDS/CSV files for 4-week lookback
#' - Script 80 main - to calculate previous week with year wrapping
#' - Any other script needing to walk backwards through weeks
#'
#' Example: Week 01/2026, offset=1 -> returns list(week=52, year=2025)
#' Example: Week 01/2021, offset=1 -> returns list(week=53, year=2020)
calculate_target_week_and_year <- function(current_week, current_year, offset_weeks = 0) {
  # Dec 28 always falls in the last ISO week of its year, so its "%V" value
  # (52 or 53) is exactly the number of ISO weeks in that year.
  iso_weeks_in_year <- function(year) {
    as.integer(format(as.Date(sprintf("%d-12-28", year)), "%V"))
  }
  target_week <- current_week - offset_weeks
  target_year <- current_year
  # BUG FIX: the previous implementation always added 52 when wrapping, which
  # mislabels lookbacks that cross into a 53-week ISO year (e.g. 2020).
  while (target_week < 1) {
    target_year <- target_year - 1
    target_week <- target_week + iso_weeks_in_year(target_year)
  }
  list(week = target_week, year = target_year)
}
# ============================================================================
# TILE-AWARE HELPER FUNCTIONS
# ============================================================================
#' Get tile IDs whose bounding boxes intersect a field's bounding box
#'
#' @param field_geom Field geometry: an sf object or a terra SpatVector.
#' @param tile_grid data.frame with columns id, xmin, xmax, ymin, ymax.
#' @param field_id Optional field identifier (not used in the computation;
#'   kept for API compatibility with callers).
#' @return Numeric vector of intersecting tile ids (possibly empty).
get_tile_ids_for_field <- function(field_geom, tile_grid, field_id = NULL) {
  # Derive the field's bounding box from either supported geometry type.
  if (inherits(field_geom, "sf")) {
    bb <- sf::st_bbox(field_geom)
    fx_min <- bb["xmin"]
    fx_max <- bb["xmax"]
    fy_min <- bb["ymin"]
    fy_max <- bb["ymax"]
  } else if (inherits(field_geom, "SpatVector")) {
    bb <- terra::ext(field_geom)
    fx_min <- bb$xmin
    fx_max <- bb$xmax
    fy_min <- bb$ymin
    fy_max <- bb$ymax
  } else {
    stop("field_geom must be sf or terra::vect object")
  }
  # A tile overlaps the field bbox unless it lies entirely to one side;
  # expressed positively via De Morgan on the original exclusion test.
  overlaps <- tile_grid$xmax >= fx_min &
    tile_grid$xmin <= fx_max &
    tile_grid$ymax >= fy_min &
    tile_grid$ymin <= fy_max
  as.numeric(tile_grid$id[overlaps])
}
#' Load and merge tiles for a specific field
#'
#' Reads the weekly tile GeoTIFFs named week_WW_YYYY_TT.tif for the given tile
#' ids, extracts band 5 from each, and mosaics multiple tiles with fun = "max".
#' NOTE(review): the CI band index (5) is hard-coded here — confirm the band
#' order of the weekly tile products before changing tile generation.
#'
#' @param field_geom Field geometry (currently unused in this body; tile
#'   selection is driven entirely by tile_ids).
#' @param tile_ids Numeric vector of tile ids (e.g. from get_tile_ids_for_field).
#' @param week_num ISO week number encoded in the tile filename.
#' @param year Year encoded in the tile filename.
#' @param mosaic_dir Directory containing the tile files.
#' @return A single SpatRaster holding the (merged) CI band, or NULL when no
#'   tile could be loaded.
load_tiles_for_field <- function(field_geom, tile_ids, week_num, year, mosaic_dir) {
if (length(tile_ids) == 0) {
return(NULL)
}
tiles_list <- list()
for (tile_id in sort(tile_ids)) {
tile_filename <- sprintf("week_%02d_%d_%02d.tif", week_num, year, tile_id)
tile_path <- file.path(mosaic_dir, tile_filename)
if (file.exists(tile_path)) {
tryCatch({
tile_rast <- terra::rast(tile_path)
# Band 5 is treated as the CI band (see NOTE above).
ci_band <- terra::subset(tile_rast, 5)
tiles_list[[length(tiles_list) + 1]] <- ci_band
}, error = function(e) {
# Unreadable tiles are skipped; the remaining tiles still get merged.
message(paste(" Warning: Could not load tile", tile_id, ":", e$message))
})
}
}
if (length(tiles_list) == 0) {
return(NULL)
}
if (length(tiles_list) == 1) {
return(tiles_list[[1]])
} else {
tryCatch({
# Mosaic overlapping tiles, keeping the maximum CI value per pixel.
rsrc <- terra::sprc(tiles_list)
merged <- terra::mosaic(rsrc, fun = "max")
return(merged)
}, error = function(e) {
# Degrade gracefully: fall back to the first tile rather than failing.
message(paste(" Warning: Could not merge tiles:", e$message))
return(tiles_list[[1]])
})
}
}
#' Build tile grid from available tile files
#'
#' Scans mosaic_dir (or an auto-detected grid-size subdirectory such as "5x5")
#' for tiles named week_WW_YYYY_TT.tif and records each tile's id and extent.
#'
#' @param mosaic_dir Directory containing tile files, possibly nested one
#'   level deep in an "NxN" subdirectory.
#' @param week_num ISO week number.
#' @param year Year.
#' @return list with:
#'   - tile_grid: data.frame(id, xmin, xmax, ymin, ymax), one row per tile
#'   - mosaic_dir: resolved directory actually scanned
#'   - grid_size: detected "NxN" string, or NA when none was found
build_tile_grid <- function(mosaic_dir, week_num, year) {
  # Handle grid-size subdirectories (e.g., weekly_tile_max/5x5/)
  detected_grid_size <- NA
  if (dir.exists(mosaic_dir)) {
    subfolders <- list.dirs(mosaic_dir, full.names = FALSE, recursive = FALSE)
    grid_patterns <- grep("^\\d+x\\d+$", subfolders, value = TRUE)
    if (length(grid_patterns) > 0) {
      detected_grid_size <- grid_patterns[1]
      mosaic_dir <- file.path(mosaic_dir, detected_grid_size)
      message(paste(" Using grid-size subdirectory:", detected_grid_size))
    }
  }
  tile_pattern <- sprintf("week_%02d_%d_([0-9]{2})\\.tif", week_num, year)
  tile_files <- list.files(mosaic_dir, pattern = tile_pattern, full.names = TRUE)
  if (length(tile_files) == 0) {
    stop(paste("No tile files found for week", week_num, year, "in", mosaic_dir))
  }
  rows <- list()  # collect rows, bind once (avoids rbind-in-loop)
  for (tile_file in tile_files) {
    tryCatch({
      fname <- basename(tile_file)
      if (grepl("_([0-9]{2})\\.tif$", fname)) {
        # BUG FIX: the previous code did sub("_|\\.tif", "", "_NN.tif"), which
        # only removes the FIRST match ("_"), leaving "NN.tif" — so
        # as.integer() returned NA for every tile id. Extract the id via the
        # capture group instead.
        tile_id <- as.integer(sub("^.*_([0-9]{2})\\.tif$", "\\1", fname))
        tile_ext <- terra::ext(terra::rast(tile_file))
        rows[[length(rows) + 1]] <- data.frame(
          id = tile_id,
          xmin = tile_ext$xmin,
          xmax = tile_ext$xmax,
          ymin = tile_ext$ymin,
          ymax = tile_ext$ymax,
          stringsAsFactors = FALSE
        )
      }
    }, error = function(e) {
      message(paste(" Warning: Could not process tile", basename(tile_file), ":", e$message))
    })
  }
  if (length(rows) == 0) {
    stop("Could not extract extents from any tile files")
  }
  list(
    tile_grid = do.call(rbind, rows),
    mosaic_dir = mosaic_dir,
    grid_size = detected_grid_size
  )
}
# ============================================================================
# STATISTICAL CATEGORIZATION FUNCTIONS
# ============================================================================
#' Categorize four-week CI trend
#'
#' Labels the average week-over-week change of a vector of mean CI values.
#'
#' @param ci_values_list Numeric vector of weekly mean CI values, oldest first.
#' @return One of "strong growth", "growth", "no growth", "decline",
#'   "strong decline", or NA_character_ when fewer than 2 valid values exist.
categorize_four_week_trend <- function(ci_values_list) {
  if (is.null(ci_values_list) || length(ci_values_list) < 2) {
    return(NA_character_)
  }
  ci_values_list <- ci_values_list[!is.na(ci_values_list)]
  if (length(ci_values_list) < 2) {
    return(NA_character_)
  }
  # Average of the week-over-week differences.
  avg_weekly_change <- mean(diff(ci_values_list), na.rm = TRUE)
  if (avg_weekly_change >= FOUR_WEEK_TREND_STRONG_GROWTH_MIN) {
    return("strong growth")
  } else if (avg_weekly_change >= FOUR_WEEK_TREND_GROWTH_MIN &&
             avg_weekly_change < FOUR_WEEK_TREND_GROWTH_MAX) {
    return("growth")
  } else if (abs(avg_weekly_change) <= FOUR_WEEK_TREND_NO_GROWTH_RANGE) {
    return("no growth")
  } else if (avg_weekly_change <= FOUR_WEEK_TREND_DECLINE_MAX &&
             avg_weekly_change > FOUR_WEEK_TREND_DECLINE_MIN) {
    # BUG FIX: this branch previously compared against DECLINE_MIN (-0.5) and
    # STRONG_DECLINE_MAX (-0.5), i.e. `x <= -0.5 && x > -0.5`, which can never
    # be TRUE — moderate declines were mislabeled "no growth".
    return("decline")
  } else if (avg_weekly_change <= FOUR_WEEK_TREND_STRONG_DECLINE_MAX) {
    # <= (not <) so a change of exactly -0.5 is "strong decline" rather than
    # falling through to "no growth".
    return("strong decline")
  } else {
    return("no growth")
  }
}
#' Map a clear-pixel percentage onto its reporting interval label
#'
#' @param cloud_pct_clear Percentage of cloud-free pixels (0-100) or NA.
#' @return Interval label ("0-10%" ... "90-95%", "95-100%"), or NA_character_.
round_cloud_to_intervals <- function(cloud_pct_clear) {
  if (is.na(cloud_pct_clear)) {
    return(NA_character_)
  }
  # Upper bound of each bucket; findInterval counts how many bounds are <= x,
  # which indexes directly into the label vector (half-open [lo, hi) buckets).
  upper_bounds <- c(10, 20, 30, 40, 50, 60, 70, 80, 90, 95)
  interval_labels <- c("0-10%", "10-20%", "20-30%", "30-40%", "40-50%",
                       "50-60%", "60-70%", "70-80%", "80-90%", "90-95%",
                       "95-100%")
  interval_labels[findInterval(cloud_pct_clear, upper_bounds) + 1]
}
#' Format the 10th-90th percentile range of CI values as "lo-hi"
#'
#' @param ci_values Numeric vector of CI pixel values; NAs are dropped.
#' @return String "p10-p90" with one decimal per bound, or NA_character_ when
#'   no valid values remain.
get_ci_percentiles <- function(ci_values) {
  valid <- ci_values[!is.na(ci_values)]
  if (length(valid) == 0) {
    return(NA_character_)
  }
  bounds <- quantile(valid, probs = c(CI_PERCENTILE_LOW, CI_PERCENTILE_HIGH),
                     na.rm = TRUE)
  sprintf("%.1f-%.1f", bounds[[1]], bounds[[2]])
}
#' Week-over-week CV change (current minus previous), rounded to 4 decimals
#'
#' @param cv_current Current week's coefficient of variation.
#' @param cv_previous Previous week's coefficient of variation.
#' @return Rounded difference, or NA_real_ when either input is NA.
calculate_cv_trend <- function(cv_current, cv_previous) {
  if (anyNA(c(cv_current, cv_previous))) {
    return(NA_real_)
  }
  round(cv_current - cv_previous, 4)
}
#' Net CI change over the lookback window (last valid value minus first)
#'
#' @param mean_ci_values Numeric vector of weekly mean CI values, oldest first;
#'   NAs are ignored.
#' @return Difference rounded to 2 decimals, or NA_real_ with < 2 valid values.
calculate_four_week_trend <- function(mean_ci_values) {
  valid <- mean_ci_values[!is.na(mean_ci_values)]
  if (length(valid) < 2) {
    return(NA_real_)
  }
  round(valid[length(valid)] - valid[1], 2)
}
#' Categorize CV slope (8-week regression) into field uniformity interpretation
#'
#' @param slope Slope of the CV ~ week regression
#'   (see calculate_cv_trend_long_term).
#' @return "Excellent uniformity", "Homogenous growth", "Minor patchiness",
#'   "Severe fragmentation", or NA_character_ for NA input.
#'
#' NOTE(review): CV_SLOPE_STRONG_IMPROVEMENT_MIN, CV_SLOPE_IMPROVEMENT_MAX,
#' CV_SLOPE_PATCHINESS_MIN and CV_SLOPE_SEVERE_MIN are defined in this file
#' but never used here — confirm whether extra categories were intended.
categorize_cv_slope <- function(slope) {
  if (is.na(slope)) {
    return(NA_character_)
  }
  if (slope <= CV_SLOPE_IMPROVEMENT_MIN) {
    return("Excellent uniformity")
  }
  # Merged two previously duplicated branches: slope < -0.01 and
  # -0.01 <= slope <= 0.01 both returned "Homogenous growth".
  if (slope <= CV_SLOPE_HOMOGENOUS_MAX) {
    return("Homogenous growth")
  }
  if (slope <= CV_SLOPE_PATCHINESS_MAX) {
    return("Minor patchiness")
  }
  "Severe fragmentation"
}
#' 8-week CV trend: slope of a simple linear regression of CV on week index
#'
#' @param cv_values Numeric vector of weekly CV values, oldest first; NAs are
#'   dropped before fitting.
#' @return Slope rounded to 4 decimals, or NA_real_ when fewer than 2 valid
#'   values exist or the fit fails.
calculate_cv_trend_long_term <- function(cv_values) {
  cv_clean <- cv_values[!is.na(cv_values)]
  if (length(cv_clean) < 2) {
    return(NA_real_)
  }
  weeks <- seq_along(cv_clean)
  tryCatch({
    fit <- lm(cv_clean ~ weeks)
    round(as.numeric(coef(fit)[["weeks"]]), 4)
  }, error = function(e) NA_real_)
}
#' Calculate Gap Filling Score KPI
#'
#' For each field, the gap score is the percentage of valid pixels whose CI
#' falls below (median - 1 * sd) for that field.
#'
#' NOTE(review): callers and log messages describe this as the "2σ method",
#' but the implemented threshold is median - 1*sd. Confirm which is intended
#' before changing the multiplier — scores would shift substantially.
#'
#' @param ci_raster Current week CI raster (single band)
#' @param field_boundaries Field boundaries (sf or SpatVector)
#' @return List with summary data frame and field-level results data frame
calculate_gap_filling_kpi <- function(ci_raster, field_boundaries) {
  # Accept both sf and SpatVector inputs.
  if (!inherits(field_boundaries, "SpatVector")) {
    field_boundaries_vect <- terra::vect(field_boundaries)
  } else {
    field_boundaries_vect <- field_boundaries
  }
  n_fields <- nrow(field_boundaries)
  rows <- vector("list", n_fields)  # preallocate instead of rbind-in-loop
  for (i in seq_len(n_fields)) {
    field_name <- if ("field" %in% names(field_boundaries)) field_boundaries$field[i] else NA_character_
    sub_field_name <- if ("sub_field" %in% names(field_boundaries)) field_boundaries$sub_field[i] else NA_character_
    # Extract CI values using helper function
    ci_values <- extract_ci_values(ci_raster, field_boundaries_vect[i])
    valid_values <- ci_values[!is.na(ci_values) & is.finite(ci_values)]
    if (length(valid_values) > 1) {
      # Threshold: one standard deviation below the median (see NOTE above).
      outlier_threshold <- median(valid_values) - sd(valid_values)
      gap_score <- round(mean(valid_values < outlier_threshold) * 100, 2)
      # Classify gap severity
      gap_level <- dplyr::case_when(
        gap_score < 10 ~ "Minimal",
        gap_score < 25 ~ "Moderate",
        gap_score >= 25 ~ "Significant",
        TRUE ~ NA_character_
      )
      rows[[i]] <- data.frame(
        field = field_name,
        sub_field = sub_field_name,
        gap_level = gap_level,
        gap_score = gap_score,
        mean_ci = mean(valid_values),
        outlier_threshold = outlier_threshold
      )
    } else {
      # Too few valid pixels: record the field with NA metrics.
      rows[[i]] <- data.frame(
        field = field_name,
        sub_field = sub_field_name,
        gap_level = NA_character_,
        gap_score = NA_real_,
        mean_ci = NA_real_,
        outlier_threshold = NA_real_
      )
    }
  }
  field_results <- dplyr::bind_rows(rows)
  # Distribution of fields across gap severity classes.
  gap_summary <- field_results %>%
    dplyr::group_by(gap_level) %>%
    dplyr::summarise(field_count = dplyr::n(), .groups = 'drop') %>%
    dplyr::mutate(percent = round((field_count / sum(field_count)) * 100, 1))
  list(summary = gap_summary, field_results = field_results)
}
#' Calculate gap filling scores for all per-field mosaics (wrapper)
#'
#' Iterates over individual per-field mosaic TIFFs, runs
#' calculate_gap_filling_kpi() on each, and averages gap scores per Field_id.
#' NOTE(review): the log message advertises a "2σ method", but
#' calculate_gap_filling_kpi() thresholds at median - 1*sd — confirm which is
#' intended.
#'
#' @param per_field_files Character vector of paths to per-field mosaic TIFFs;
#'   the field id is taken from each file's parent directory name.
#' @param field_boundaries_sf sf object with field geometries; must carry a
#'   `field` column matching those directory names.
#' @return data.frame with Field_id and gap_score columns, or NULL when no
#'   scores could be calculated.
calculate_gap_scores <- function(per_field_files, field_boundaries_sf) {
message("\nCalculating gap filling scores (2σ method)...")
message(paste(" Using per-field mosaics for", length(per_field_files), "fields"))
# One sf subset per field id, keyed by the `field` column for O(1) lookup.
field_boundaries_by_id <- split(field_boundaries_sf, field_boundaries_sf$field)
# Per-file worker: every branch returns a 1-row data.frame(Field_id,
# gap_score) so bind_rows() below always succeeds.
process_gap_for_field <- function(field_file) {
# Field id = parent directory name of the mosaic file.
field_id <- basename(dirname(field_file))
field_bounds <- field_boundaries_by_id[[field_id]]
if (is.null(field_bounds) || nrow(field_bounds) == 0) {
return(data.frame(Field_id = field_id, gap_score = NA_real_))
}
tryCatch({
field_raster <- terra::rast(field_file)
ci_band_name <- "CI"
# Mosaics without a named CI band yield an NA score for the field.
if (!(ci_band_name %in% names(field_raster))) {
return(data.frame(Field_id = field_id, gap_score = NA_real_))
}
field_ci_band <- field_raster[[ci_band_name]]
names(field_ci_band) <- "CI"
gap_result <- calculate_gap_filling_kpi(field_ci_band, field_bounds)
if (is.null(gap_result) || is.null(gap_result$field_results) || nrow(gap_result$field_results) == 0) {
return(data.frame(Field_id = field_id, gap_score = NA_real_))
}
gap_scores <- gap_result$field_results
gap_scores$Field_id <- gap_scores$field
gap_scores <- gap_scores[, c("Field_id", "gap_score")]
# Collapse sub-field rows into a single mean score per field.
stats::aggregate(gap_score ~ Field_id, data = gap_scores, FUN = function(x) mean(x, na.rm = TRUE))
}, error = function(e) {
# A failed field is reported but does not abort the batch.
message(paste(" WARNING: Gap score failed for field", field_id, ":", e$message))
data.frame(Field_id = field_id, gap_score = NA_real_)
})
}
# Process fields sequentially with progress bar
message(" Processing gap scores for ", length(per_field_files), " fields...")
pb <- utils::txtProgressBar(min = 0, max = length(per_field_files), style = 3, width = 50)
results_list <- lapply(seq_along(per_field_files), function(idx) {
result <- process_gap_for_field(per_field_files[[idx]])
utils::setTxtProgressBar(pb, idx)
result
})
close(pb)
gap_scores_df <- dplyr::bind_rows(results_list)
if (!is.null(gap_scores_df) && nrow(gap_scores_df) > 0) {
# A field may appear multiple times (several files); average its scores.
gap_scores_df <- gap_scores_df %>%
dplyr::group_by(Field_id) %>%
dplyr::summarise(gap_score = mean(gap_score, na.rm = TRUE), .groups = "drop")
message(paste(" ✓ Calculated gap scores for", nrow(gap_scores_df), "fields"))
# Guard against all-NA values which would produce Inf/-Inf warnings
if (any(is.finite(gap_scores_df$gap_score))) {
min_score <- round(min(gap_scores_df$gap_score, na.rm = TRUE), 2)
max_score <- round(max(gap_scores_df$gap_score, na.rm = TRUE), 2)
message(paste(" Gap score range:", min_score, "-", max_score, "%"))
} else {
message(" Gap score range: All values are NA (no valid gap scores)")
}
} else {
message(" WARNING: No gap scores calculated from per-field mosaics")
gap_scores_df <- NULL
}
return(gap_scores_df)
}
# ============================================================================
# HELPER FUNCTIONS
# ============================================================================
#' Get crop phase by age in weeks
#'
#' Looks age_weeks up in PHASE_DEFINITIONS; since the defined ranges overlap
#' at their boundaries, the FIRST matching row wins (row order:
#' Germination, Tillering, Grand Growth, Maturation).
#'
#' @param age_weeks Crop age in weeks, or NA.
#' @return Phase name, "Unknown" when no range matches, NA_character_ for NA.
get_phase_by_age <- function(age_weeks) {
  if (is.na(age_weeks)) {
    return(NA_character_)
  }
  hits <- which(age_weeks >= PHASE_DEFINITIONS$age_start &
                age_weeks <= PHASE_DEFINITIONS$age_end)
  if (length(hits) > 0) {
    PHASE_DEFINITIONS$phase[hits[1]]
  } else {
    "Unknown"
  }
}
#' Get status trigger based on CI values and field age
#'
#' Rules are evaluated in priority order and the first match wins:
#' germination checks (age 0-6), harvest readiness (age >= 45), whole-field
#' stress, strong recovery, growth on track, maturation progressing.
#'
#' @param ci_values Numeric vector of CI pixel values for the field.
#' @param ci_change Week-over-week change in mean CI (may be NA).
#' @param age_weeks Crop age in weeks (may be NA).
#' @return Trigger label, or NA_character_ when no rule fires.
get_status_trigger <- function(ci_values, ci_change, age_weeks) {
  if (is.na(age_weeks) || length(ci_values) == 0) {
    return(NA_character_)
  }
  ci_values <- ci_values[!is.na(ci_values)]
  if (length(ci_values) == 0) {
    return(NA_character_)
  }
  n_px <- length(ci_values)
  share_above_2 <- 100 * sum(ci_values > 2) / n_px
  share_at_least_2 <- 100 * sum(ci_values >= 2) / n_px
  mean_ci <- mean(ci_values, na.rm = TRUE)
  # Relative dispersion of the field; 0 when the mean is non-positive.
  ci_cv <- if (mean_ci > 0) sd(ci_values) / mean_ci else 0
  # Germination window: first six weeks after planting.
  if (age_weeks >= 0 && age_weeks <= 6) {
    if (share_at_least_2 >= 70) return("germination_complete")
    if (share_above_2 > 10) return("germination_started")
  }
  if (age_weeks >= 45) return("harvest_ready")
  has_change <- !is.na(ci_change)
  # A uniform (low-CV) drop across the whole field signals stress.
  if (age_weeks > 6 && has_change && ci_change < -1.5 && ci_cv < 0.25) {
    return("stress_detected_whole_field")
  }
  if (age_weeks > 6 && has_change && ci_change > 1.5) {
    return("strong_recovery")
  }
  if (age_weeks >= 4 && age_weeks < 39 && has_change && ci_change > 0.2) {
    return("growth_on_track")
  }
  if (age_weeks >= 39 && age_weeks < 45 && mean_ci > 3.5) {
    return("maturation_progressing")
  }
  NA_character_
}
#' Extract planting dates from harvesting data
#'
#' Uses the most recent season_start per field as that field's planting date.
#' When no harvest data exists, returns an all-NA frame (if boundaries are
#' supplied, so downstream joins still work) or NULL otherwise.
#'
#' @param harvesting_data data.frame with at least field and season_start.
#' @param field_boundaries_sf Optional object with a `field` column.
#' @return data.frame(field_id, planting_date) or NULL.
extract_planting_dates <- function(harvesting_data, field_boundaries_sf = NULL) {
  if (is.null(harvesting_data) || nrow(harvesting_data) == 0) {
    message("Warning: No harvesting data available - planting dates will be NA.")
    if (is.null(field_boundaries_sf)) {
      return(NULL)
    }
    return(data.frame(
      field_id = field_boundaries_sf$field,
      planting_date = rep(as.Date(NA), nrow(field_boundaries_sf)),
      stringsAsFactors = FALSE
    ))
  }
  tryCatch({
    # Most recent season per field = planting date of the current cycle.
    ranked <- dplyr::arrange(harvesting_data, field, dplyr::desc(season_start))
    latest <- dplyr::distinct(ranked, field, .keep_all = TRUE)
    latest <- dplyr::select(latest, field, season_start)
    latest <- dplyr::rename(latest, field_id = field, planting_date = season_start)
    latest <- dplyr::filter(latest, !is.na(planting_date))
    planting_dates <- as.data.frame(latest)
    message(paste("Extracted planting dates for", nrow(planting_dates), "fields from harvest.xlsx"))
    planting_dates
  }, error = function(e) {
    message(paste("Error extracting planting dates:", e$message))
    NULL
  })
}
# ============================================================================
# DATA LOADING HELPERS
# ============================================================================
#' Load and validate harvest data from harvest.xlsx
#'
#' Encapsulates harvest data loading with validation, type coercion, and error handling.
#' Returns a data frame with required columns (field, year, tonnage_ha) or an empty
#' data frame with proper structure if loading fails.
#'
#' @param data_dir Path to data directory containing harvest.xlsx
#'
#' @return data.frame with columns: field (character), year (numeric), tonnage_ha (numeric)
#' - On success: data frame with N rows of harvest records
#' - On failure: empty data frame with correct structure (0 rows, 3 columns)
#'
#' @details
#' **File Location**: Expected at `file.path(data_dir, "harvest.xlsx")`
#'
#' **Validation**:
#' - Checks file existence before reading
#' - Validates required columns: field, year, tonnage_ha
#' - Coerces year and tonnage_ha to numeric
#' - Logs status messages for debugging
#'
#' **Error Handling**:
#' - Missing file: Returns empty DF (logs NOTE)
#' - Missing columns: Returns empty DF (logs WARNING)
#' - Read errors: Returns empty DF (logs WARNING)
#' - Always returns valid data frame structure (won't return NULL)
#'
#' **Usage**:
#' ```r
#' harvesting_data <- load_harvest_data(setup$data_dir)
#' # harvesting_data is guaranteed to be a data.frame with 3 columns
#' # even if harvest.xlsx is unavailable or invalid
#' ```
load_harvest_data <- function(data_dir) {
  harvest_file <- file.path(data_dir, "harvest.xlsx")
  required_cols <- c("field", "year", "tonnage_ha")
  harvesting_data <- NULL
  if (!file.exists(harvest_file)) {
    message(paste(" NOTE: harvest.xlsx not found at", harvest_file))
  } else {
    tryCatch({
      raw <- readxl::read_excel(harvest_file)
      if (all(required_cols %in% names(raw))) {
        raw <- as.data.frame(raw)
        # CRITICAL: keep field as character so leading zeros survive ("01", "02").
        raw$field <- as.character(raw$field)
        raw$year <- as.numeric(raw$year)
        raw$tonnage_ha <- as.numeric(raw$tonnage_ha)
        harvesting_data <- raw
        message(paste(" ✓ Loaded harvest data:", nrow(harvesting_data), "records from harvest.xlsx"))
      } else {
        message(paste(" WARNING: harvest.xlsx missing required columns. Expected: field, year, tonnage_ha"))
      }
    }, error = function(e) {
      message(paste(" WARNING: Could not read harvest.xlsx:", e$message))
    })
  }
  # Fallback: an empty, correctly-typed frame so callers never see NULL.
  if (is.null(harvesting_data)) {
    message(" WARNING: No harvest data available. TCH yield prediction will use graceful fallback (NA values)")
    harvesting_data <- data.frame(
      field = character(),    # character so leading zeros survive when data is added
      year = numeric(),
      tonnage_ha = numeric(),
      stringsAsFactors = FALSE
    )
  }
  harvesting_data
}
# ============================================================================
# FIELD STATISTICS EXTRACTION
# ============================================================================
#' Extract CI statistics for all fields from a single CI raster band
#'
#' Runs one terra::extract over all field polygons, then summarises the pixel
#' values per field (mean, CV, 10th/90th percentiles, min/max, pixel counts).
#'
#' @param ci_band Single-band raster of CI values.
#' @param field_boundaries_sf sf object; row order defines field_idx.
#' @return data.frame with one row per field.
extract_field_statistics_from_ci <- function(ci_band, field_boundaries_sf) {
  extract_result <- terra::extract(ci_band, field_boundaries_sf)
  # Placeholder row for fields with no valid pixels. NOTE: pixel_count_total
  # is reported as 0 here even when NA-only pixels were extracted (preserves
  # the original behavior).
  empty_row <- function(idx) {
    data.frame(
      field_idx = idx, mean_ci = NA_real_, cv = NA_real_,
      p10 = NA_real_, p90 = NA_real_, min_ci = NA_real_, max_ci = NA_real_,
      pixel_count_valid = 0, pixel_count_total = 0,
      stringsAsFactors = FALSE
    )
  }
  stats_for_field <- function(idx) {
    raw <- extract_result[extract_result$ID == idx, 2]
    vals <- as.numeric(raw[!is.na(raw)])
    if (length(vals) == 0) {
      return(empty_row(idx))
    }
    mean_val <- mean(vals, na.rm = TRUE)
    data.frame(
      field_idx = idx,
      mean_ci = mean_val,
      # CV is undefined for non-positive means.
      cv = if (mean_val > 0) sd(vals, na.rm = TRUE) / mean_val else NA_real_,
      p10 = quantile(vals, probs = CI_PERCENTILE_LOW, na.rm = TRUE)[[1]],
      p90 = quantile(vals, probs = CI_PERCENTILE_HIGH, na.rm = TRUE)[[1]],
      min_ci = min(vals, na.rm = TRUE),
      max_ci = max(vals, na.rm = TRUE),
      pixel_count_valid = length(vals),
      pixel_count_total = nrow(extract_result[extract_result$ID == idx, ]),
      stringsAsFactors = FALSE
    )
  }
  dplyr::bind_rows(lapply(seq_len(nrow(field_boundaries_sf)), stats_for_field))
}
# ============================================================================
# EXPORT FUNCTIONS (USED BY ALL CLIENTS)
# ============================================================================
#' Generate summary statistics from field analysis data
#'
#' Builds a display-oriented two-column summary table with section-header rows.
#' NOTE: the Acreage column mixes numbers and strings (e.g. "3 fields"), so R
#' coerces the whole column to character; section headers carry NA.
#'
#' @param field_df data.frame with Acreage, Phase, Status_trigger and
#'   Cloud_category columns.
#' @return data.frame(Category, Acreage) with 24 rows.
generate_field_analysis_summary <- function(field_df) {
  message("Generating summary statistics...")
  # Small closures replace twenty near-identical sum() expressions.
  acreage_where <- function(mask) round(sum(field_df$Acreage[mask], na.rm = TRUE), 2)
  phase_acreage <- function(p) acreage_where(field_df$Phase == p)
  trigger_acreage <- function(t) acreage_where(field_df$Status_trigger == t)
  cloud_acreage <- function(cc) acreage_where(field_df$Cloud_category == cc)
  cloud_fields <- function(cc) paste0(sum(field_df$Cloud_category == cc, na.rm = TRUE), " fields")
  data.frame(
    Category = c(
      "--- PHASE DISTRIBUTION ---",
      "Germination", "Tillering", "Grand Growth", "Maturation", "Unknown phase",
      "--- STATUS TRIGGERS ---",
      "Harvest ready", "Stress detected", "Strong recovery", "Growth on track",
      "Germination complete", "Germination started", "No trigger",
      "--- CLOUD COVERAGE (FIELDS) ---",
      "Clear view", "Partial coverage", "No image available",
      "--- CLOUD COVERAGE (ACREAGE) ---",
      "Clear view", "Partial coverage", "No image available",
      "--- TOTAL ---",
      "Total Acreage"
    ),
    Acreage = c(
      NA,
      phase_acreage("Germination"),
      phase_acreage("Tillering"),
      phase_acreage("Grand Growth"),
      phase_acreage("Maturation"),
      phase_acreage("Unknown"),
      NA,
      trigger_acreage("harvest_ready"),
      trigger_acreage("stress_detected_whole_field"),
      trigger_acreage("strong_recovery"),
      trigger_acreage("growth_on_track"),
      trigger_acreage("germination_complete"),
      trigger_acreage("germination_started"),
      acreage_where(is.na(field_df$Status_trigger)),
      NA,
      cloud_fields("Clear view"),
      cloud_fields("Partial coverage"),
      cloud_fields("No image available"),
      NA,
      cloud_acreage("Clear view"),
      cloud_acreage("Partial coverage"),
      cloud_acreage("No image available"),
      NA,
      round(sum(field_df$Acreage, na.rm = TRUE), 2)
    ),
    stringsAsFactors = FALSE
  )
}
#' Export field analysis to Excel, CSV, and RDS formats
#'
#' Writes three artifacts to reports_dir, each named
#' "<project>_field_analysis_weekWW_YYYY.<ext>":
#' - .xlsx with a "Field Data" sheet and, when summary_df is non-NULL, a
#'   "Summary" sheet
#' - .rds bundling both tables plus metadata (week, year, project, timestamp)
#' - .csv of the field table only
#' Numeric columns are rounded to 2 decimals before export.
#'
#' NOTE(review): write_xlsx() and write_csv() are called unqualified — the
#' caller must have the providing packages attached (per the file header,
#' dependencies are loaded by 80_calculate_kpis.R); confirm writexl/readr are
#' among them.
#'
#' @param field_df Per-field analysis data frame.
#' @param summary_df Summary table (may be NULL; then no Summary sheet).
#' @param project_dir Project name used as the filename prefix.
#' @param current_week Week number stamped into filenames and metadata.
#' @param year Year stamped into filenames and metadata.
#' @param reports_dir Output directory (created if missing).
#' @return list(excel = path, rds = path, csv = path)
export_field_analysis_excel <- function(field_df, summary_df, project_dir, current_week, year, reports_dir) {
message("Exporting per-field analysis to Excel, CSV, and RDS...")
field_df_rounded <- field_df %>%
mutate(across(where(is.numeric), ~ round(., 2)))
# Handle NULL summary_df
summary_df_rounded <- if (!is.null(summary_df)) {
summary_df %>%
mutate(across(where(is.numeric), ~ round(., 2)))
} else {
NULL
}
output_dir <- file.path(reports_dir)
if (!dir.exists(output_dir)) {
dir.create(output_dir, recursive = TRUE)
}
excel_filename <- paste0(project_dir, "_field_analysis_week", sprintf("%02d_%d", current_week, year), ".xlsx")
excel_path <- file.path(output_dir, excel_filename)
# mustWork = FALSE: the file does not exist yet at this point.
excel_path <- normalizePath(excel_path, winslash = "\\", mustWork = FALSE)
# Build sheets list dynamically
sheets <- list(
"Field Data" = field_df_rounded
)
if (!is.null(summary_df_rounded)) {
sheets[["Summary"]] <- summary_df_rounded
}
write_xlsx(sheets, excel_path)
message(paste("✓ Field analysis Excel exported to:", excel_path))
# RDS bundle: both tables plus provenance metadata for downstream scripts.
kpi_data <- list(
field_analysis = field_df_rounded,
field_analysis_summary = summary_df_rounded,
metadata = list(
current_week = current_week,
year = year,
project = project_dir,
created_at = Sys.time()
)
)
rds_filename <- paste0(project_dir, "_field_analysis_week", sprintf("%02d_%d", current_week, year), ".rds")
rds_path <- file.path(output_dir, rds_filename)
saveRDS(kpi_data, rds_path)
message(paste("✓ Field analysis RDS exported to:", rds_path))
csv_filename <- paste0(project_dir, "_field_analysis_week", sprintf("%02d_%d", current_week, year), ".csv")
csv_path <- file.path(output_dir, csv_filename)
write_csv(field_df_rounded, csv_path)
message(paste("✓ Field analysis CSV exported to:", csv_path))
return(list(excel = excel_path, rds = rds_path, csv = csv_path))
}
# ============================================================================
# ADDITIONAL CRITICAL FUNCTIONS FROM 80_weekly_stats_utils.R (REQUIRED BY 80_calculate_kpis.R)
# ============================================================================
#' Calculate statistics for all fields from weekly mosaics
#'
#' Scans per-field subdirectories of mosaic_dir for week_WW_YYYY.tif files,
#' extracts the "CI" band per field, and computes summary statistics
#' (mean, CV, ranges, percentile string, cloud coverage category).
#'
#' @param field_boundaries_sf sf object with a `field` column matching the
#'   per-field subdirectory names.
#' @param week_num ISO week number encoded in the mosaic filenames.
#' @param year Year encoded in the mosaic filenames.
#' @param mosaic_dir Root directory containing one subdirectory per field.
#' @param report_date Currently unused in this body; kept for API
#'   compatibility with callers.
#' @return data.frame with one row per successfully processed field
#'   (Field_id, Mean_CI, CV, CI_range, CI_Percentiles, Pct_pixels_CI_gte_2,
#'   Cloud_pct_clear, Cloud_category). Stops if no field can be processed.
calculate_field_statistics <- function(field_boundaries_sf, week_num, year,
mosaic_dir, report_date = Sys.Date()) {
message(paste("Calculating statistics for all fields - Week", week_num, year))
# Per-field mode: look in per-field subdirectories
single_file_pattern <- sprintf("week_%02d_%d\\.tif", week_num, year)
# Find all field subdirectories with mosaics for this week
field_dirs <- list.dirs(mosaic_dir, full.names = FALSE, recursive = FALSE)
field_dirs <- field_dirs[field_dirs != ""]
per_field_files <- list()
for (field in field_dirs) {
field_mosaic_dir <- file.path(mosaic_dir, field)
files <- list.files(field_mosaic_dir, pattern = single_file_pattern, full.names = TRUE)
if (length(files) > 0) {
per_field_files[[field]] <- files[1] # Take first match for this field
}
}
if (length(per_field_files) == 0) {
stop(paste("No per-field mosaic files found for week", week_num, year, "in", mosaic_dir))
}
message(paste(" Found", length(per_field_files), "per-field mosaic file(s) for week", week_num))
results_list <- list()
# Initialize progress bar
pb <- progress::progress_bar$new(
format = " [:bar] :percent | Field :current/:total",
total = length(per_field_files),
width = 60
)
# Process each field's mosaic
for (field_idx in seq_along(per_field_files)) {
pb$tick() # Update progress bar
field_name <- names(per_field_files)[field_idx]
field_file <- per_field_files[[field_name]]
tryCatch({
current_rast <- terra::rast(field_file)
# The mosaic must expose a band literally named "CI".
ci_band <- current_rast[["CI"]]
if (is.null(ci_band) || !inherits(ci_band, "SpatRaster")) {
message(paste(" [SKIP] Field", field_name, "- CI band not found"))
next
}
# Extract CI values for this field
field_boundary <- field_boundaries_sf[field_boundaries_sf$field == field_name, ]
if (nrow(field_boundary) == 0) {
message(paste(" [SKIP] Field", field_name, "- not in field boundaries"))
next
}
# na.rm = FALSE keeps NA pixels so cloud coverage can be derived below.
extracted <- terra::extract(ci_band, field_boundary, na.rm = FALSE)
if (nrow(extracted) == 0 || all(is.na(extracted$CI))) {
message(paste(" [SKIP] Field", field_name, "- no CI values found"))
next
}
ci_vals <- extracted$CI[!is.na(extracted$CI)]
if (length(ci_vals) == 0) {
next
}
# Calculate statistics
mean_ci <- mean(ci_vals, na.rm = TRUE)
ci_std <- sd(ci_vals, na.rm = TRUE)
# CV is undefined for non-positive means.
cv <- if (mean_ci > 0) ci_std / mean_ci else NA_real_
range_min <- min(ci_vals, na.rm = TRUE)
range_max <- max(ci_vals, na.rm = TRUE)
range_str <- sprintf("%.1f-%.1f", range_min, range_max)
ci_percentiles_str <- get_ci_percentiles(ci_vals)
num_pixels_total <- length(ci_vals)
num_pixels_gte_2 <- sum(ci_vals >= 2)
pct_pixels_gte_2 <- if (num_pixels_total > 0) round((num_pixels_gte_2 / num_pixels_total) * 100, 1) else 0
# Cloud coverage: share of extracted pixels that carry data (non-NA).
num_total <- nrow(extracted)
num_data <- sum(!is.na(extracted$CI))
pct_clear <- if (num_total > 0) round((num_data / num_total) * 100, 1) else 0
cloud_cat <- if (num_data == 0) "No image available"
else if (pct_clear >= 95) "Clear view"
else "Partial coverage"
# Add to results
results_list[[length(results_list) + 1]] <- data.frame(
Field_id = field_name,
Mean_CI = round(mean_ci, 2),
CV = round(cv * 100, 2),
CI_range = range_str,
CI_Percentiles = ci_percentiles_str,
Pct_pixels_CI_gte_2 = pct_pixels_gte_2,
Cloud_pct_clear = pct_clear,
Cloud_category = cloud_cat,
stringsAsFactors = FALSE
)
}, error = function(e) {
# A failing field is logged and skipped; the batch continues.
message(paste(" [ERROR] Field", field_name, ":", e$message))
})
}
if (length(results_list) == 0) {
stop(paste("No fields processed successfully for week", week_num))
}
stats_df <- dplyr::bind_rows(results_list)
message(paste(" ✓ Successfully calculated statistics for", nrow(stats_df), "fields"))
return(stats_df)
}
#' Load weekly field statistics from the RDS cache, computing them on a miss
#'
#' Checks reports_dir/field_stats for a cached RDS of this project/week/year.
#' On a cache miss, computes the statistics from the per-field mosaics and
#' persists both an RDS and a CSV copy before returning.
#'
#' @param week_num,year Week and year to load.
#' @param project_dir Project prefix used in the cache filenames.
#' @param field_boundaries_sf Field boundaries forwarded to the calculator.
#' @param mosaic_dir Directory of per-field weekly mosaics.
#' @param reports_dir Root directory holding the "field_stats" cache folder.
#' @param report_date Forwarded to calculate_field_statistics().
#' @return Data frame of per-field statistics for the requested week.
load_or_calculate_weekly_stats <- function(week_num, year, project_dir, field_boundaries_sf,
                                           mosaic_dir, reports_dir, report_date = Sys.Date()) {
  stem <- sprintf("%s_field_stats_week%02d_%d", project_dir, week_num, year)
  cache_dir <- file.path(reports_dir, "field_stats")
  rds_path <- file.path(cache_dir, paste0(stem, ".rds"))
  # Cache hit: return the previously computed statistics unchanged
  if (file.exists(rds_path)) {
    message(paste("Loading cached statistics from:", basename(rds_path)))
    return(readRDS(rds_path))
  }
  # Cache miss: compute from the weekly mosaics, then persist RDS + CSV
  message(paste("Cached RDS not found, calculating statistics from tiles for week", week_num))
  stats_df <- calculate_field_statistics(field_boundaries_sf, week_num, year, mosaic_dir, report_date)
  if (!dir.exists(cache_dir)) {
    dir.create(cache_dir, recursive = TRUE, showWarnings = FALSE)
  }
  saveRDS(stats_df, rds_path)
  message(paste("Saved weekly statistics RDS:", basename(rds_path)))
  csv_path <- file.path(cache_dir, paste0(stem, ".csv"))
  readr::write_csv(stats_df, csv_path)
  message(paste("Saved weekly statistics CSV:", basename(csv_path)))
  return(stats_df)
}
#' Load historical field-analysis CSVs for the last `num_weeks` weeks
#'
#' Walks back from the current week (lookback 0) and loads each week's
#' "<project>_field_analysis_weekWW_YYYY.csv" from kpi_reports_dir. Weeks
#' whose file is missing or unreadable are skipped (unreadable ones are
#' logged).
#'
#' @param project_dir Project prefix used in the CSV filenames.
#' @param current_week,current_year Starting ISO week/year.
#' @param kpi_reports_dir Directory containing the weekly analysis CSVs.
#' @param num_weeks How many weeks to look back (default 4).
#' @param auto_generate,field_boundaries_sf,daily_vals_dir Currently unused;
#'   retained for interface compatibility with existing callers.
#' @return List of list(week, year, data) entries for each week that loaded
#'   (most-recent first, no gaps), or NULL when nothing could be loaded.
load_historical_field_data <- function(project_dir, current_week, current_year, kpi_reports_dir, num_weeks = 4, auto_generate = TRUE, field_boundaries_sf = NULL, daily_vals_dir = NULL) {
  historical_data <- list()
  loaded_weeks <- c()
  missing_weeks <- c()
  for (lookback in 0:(num_weeks - 1)) {
    target <- calculate_target_week_and_year(current_week, current_year, offset_weeks = lookback)
    target_week <- target$week
    target_year <- target$year
    week_label <- paste0("week", sprintf("%02d_%d", target_week, target_year))
    csv_path <- file.path(kpi_reports_dir,
                          paste0(project_dir, "_field_analysis_", week_label, ".csv"))
    if (file.exists(csv_path)) {
      tryCatch({
        data <- readr::read_csv(csv_path, show_col_types = FALSE,
                                col_types = readr::cols(.default = readr::col_character()))
        # BUGFIX: append instead of assigning at index lookback + 1.
        # Positional assignment left NULL holes in the list (and an inflated
        # length()/"Loaded N weeks" count) whenever an earlier week was
        # missing but a later week loaded.
        historical_data[[length(historical_data) + 1]] <- list(
          week = target_week,
          year = target_year,
          data = data
        )
        loaded_weeks <- c(loaded_weeks, week_label)
      }, error = function(e) {
        message(paste(" Warning: Could not load week", target_week, "/", target_year, ":", e$message))
        missing_weeks <<- c(missing_weeks, week_label)
      })
    } else {
      missing_weeks <- c(missing_weeks, week_label)
    }
  }
  if (length(historical_data) == 0) {
    message(paste("Error: No historical field data found"))
    return(NULL)
  }
  message(paste("✓ Loaded", length(historical_data), "weeks of historical data:",
                paste(loaded_weeks, collapse = ", ")))
  return(historical_data)
}
#' Calculate KPI trend columns for the current week's field statistics
#'
#' Adds Weekly_ci_change, CV_Trend_Short_Term, Four_week_trend,
#' CV_Trend_Long_Term and nmr_of_weeks_analysed to `current_stats`, using the
#' previous week's stats plus cached historical field_stats RDS files.
#'
#' @param current_stats Data frame with Field_id, Mean_CI and CV columns.
#' @param prev_stats Optional previous-week stats (same columns); NULL or
#'   zero rows leaves all trend columns at their NA defaults.
#' @param project_dir Project prefix used in cached RDS filenames.
#' @param reports_dir Reports root containing "field_stats"/"field_analysis".
#' @param current_week,year ISO week number and year of `current_stats`.
#' @return `current_stats` with the five trend columns populated where
#'   sufficient history exists.
calculate_kpi_trends <- function(current_stats, prev_stats = NULL,
                                 project_dir = NULL, reports_dir = NULL,
                                 current_week = NULL, year = NULL) {
  message("Calculating KPI trends from current and previous week data")
  # Defaults: NA = "not computable"; every field starts at 1 week analysed
  current_stats$Weekly_ci_change <- NA_real_
  current_stats$CV_Trend_Short_Term <- NA_real_
  current_stats$Four_week_trend <- NA_real_
  current_stats$CV_Trend_Long_Term <- NA_real_
  current_stats$nmr_of_weeks_analysed <- 1L
  if (is.null(prev_stats) || nrow(prev_stats) == 0) {
    message(" No previous week data available - using defaults")
    return(current_stats)
  }
  message(paste(" prev_stats has", nrow(prev_stats), "rows and", ncol(prev_stats), "columns"))
  # Row index of each field in prev_stats, keyed by Field_id
  prev_lookup <- setNames(seq_len(nrow(prev_stats)), prev_stats$Field_id)
  # Most recent field_analysis CSV feeds the nmr_of_weeks_analysed counter.
  # Note: all columns are read as character (see col_types), so values must
  # be coerced before any arithmetic.
  prev_field_analysis <- NULL
  tryCatch({
    analysis_dir <- file.path(reports_dir, "field_analysis")
    if (dir.exists(analysis_dir)) {
      analysis_files <- list.files(analysis_dir, pattern = "_field_analysis_week.*\\.csv$", full.names = TRUE)
      if (length(analysis_files) > 0) {
        recent_file <- analysis_files[which.max(file.info(analysis_files)$mtime)]
        prev_field_analysis <- readr::read_csv(recent_file, show_col_types = FALSE,
                                               col_types = readr::cols(.default = readr::col_character()),
                                               col_select = c(Field_id, nmr_of_weeks_analysed, Phase))
      }
    }
  }, error = function(e) {
    message(paste(" Note: Could not load previous field_analysis for nmr_weeks tracking:", e$message))
  })
  if (!is.null(prev_field_analysis) && nrow(prev_field_analysis) > 0) {
    message(paste(" Using previous field_analysis to track nmr_of_weeks_analysed"))
  }
  # Shared loader for cached weekly field_stats RDS files. Week numbers wrap
  # at year boundaries using the same 52-week assumption as the rest of the
  # caching layer (filenames are generated the same way on write).
  load_cached_weeks <- function(max_lookback, verbose) {
    loaded <- list()
    for (lookback in seq_len(max_lookback)) {
      target_week <- current_week - lookback
      target_year <- year
      if (target_week < 1) {
        target_week <- target_week + 52
        target_year <- target_year - 1
      }
      rds_filename <- sprintf("%s_field_stats_week%02d_%d.rds", project_dir, target_week, target_year)
      rds_path <- file.path(reports_dir, "field_stats", rds_filename)
      if (file.exists(rds_path)) {
        tryCatch({
          stats_data <- readRDS(rds_path)
          loaded[[length(loaded) + 1]] <- list(week = target_week, stats = stats_data)
        }, error = function(e) {
          if (verbose) {
            message(paste(" Warning: Could not load week", target_week, ":", e$message))
          }
        })
      }
    }
    loaded
  }
  historical_4weeks <- list()
  historical_8weeks <- list()
  if (!is.null(project_dir) && !is.null(reports_dir) && !is.null(current_week)) {
    message(" Loading historical field_stats for 4-week and 8-week trends...")
    historical_4weeks <- load_cached_weeks(4, verbose = TRUE)
    historical_8weeks <- load_cached_weeks(8, verbose = FALSE)
    if (length(historical_4weeks) > 0) {
      message(paste(" Loaded", length(historical_4weeks), "weeks for 4-week trend"))
    }
    if (length(historical_8weeks) > 0) {
      message(paste(" Loaded", length(historical_8weeks), "weeks for 8-week CV trend"))
    }
  }
  cv_trends_calculated <- 0
  four_week_trends_calculated <- 0
  cv_long_term_calculated <- 0
  for (i in seq_len(nrow(current_stats))) {
    field_id <- current_stats$Field_id[i]
    prev_idx <- prev_lookup[field_id]
    if (!is.na(prev_idx) && prev_idx > 0 && prev_idx <= nrow(prev_stats)) {
      prev_row <- prev_stats[prev_idx, , drop = FALSE]
      # Week-over-week change in mean CI
      prev_ci <- prev_row$Mean_CI[1]
      if (!is.na(prev_ci) && !is.na(current_stats$Mean_CI[i])) {
        current_stats$Weekly_ci_change[i] <- round(current_stats$Mean_CI[i] - prev_ci, 2)
      }
      # Short-term CV trend vs last week
      prev_cv <- prev_row$CV[1]
      if (!is.na(prev_cv) && !is.na(current_stats$CV[i])) {
        current_stats$CV_Trend_Short_Term[i] <- calculate_cv_trend(current_stats$CV[i], prev_cv)
        cv_trends_calculated <- cv_trends_calculated + 1
      }
      # 4-week CI trend: history ordered oldest-to-newest, then current value
      if (length(historical_4weeks) > 0) {
        ci_values_4week <- numeric()
        for (hist_idx in rev(seq_along(historical_4weeks))) {
          hist_data <- historical_4weeks[[hist_idx]]$stats
          hist_field <- which(hist_data$Field_id == field_id)
          if (length(hist_field) > 0 && !is.na(hist_data$Mean_CI[hist_field[1]])) {
            ci_values_4week <- c(ci_values_4week, hist_data$Mean_CI[hist_field[1]])
          }
        }
        ci_values_4week <- c(ci_values_4week, current_stats$Mean_CI[i])
        if (length(ci_values_4week) >= 2) {
          current_stats$Four_week_trend[i] <- calculate_four_week_trend(ci_values_4week)
          four_week_trends_calculated <- four_week_trends_calculated + 1
        }
      }
      # 8-week CV trend (slope over up to 8 historical weeks + current)
      if (length(historical_8weeks) > 0) {
        cv_values_8week <- numeric()
        for (hist_idx in rev(seq_along(historical_8weeks))) {
          hist_data <- historical_8weeks[[hist_idx]]$stats
          hist_field <- which(hist_data$Field_id == field_id)
          if (length(hist_field) > 0 && !is.na(hist_data$CV[hist_field[1]])) {
            cv_values_8week <- c(cv_values_8week, hist_data$CV[hist_field[1]])
          }
        }
        cv_values_8week <- c(cv_values_8week, current_stats$CV[i])
        if (length(cv_values_8week) >= 2) {
          current_stats$CV_Trend_Long_Term[i] <- calculate_cv_trend_long_term(cv_values_8week)
          cv_long_term_calculated <- cv_long_term_calculated + 1
        }
      }
      # Carry the weeks-analysed counter forward from the last field_analysis.
      # BUGFIX: the CSV is read with all-character columns, so the value must
      # be coerced to integer before adding 1L (previously this raised
      # "non-numeric argument to binary operator").
      if (!is.null(prev_field_analysis) && nrow(prev_field_analysis) > 0) {
        prev_analysis_row <- prev_field_analysis %>% dplyr::filter(Field_id == field_id)
        if (nrow(prev_analysis_row) > 0) {
          prev_nmr_weeks_analysis <- suppressWarnings(as.integer(prev_analysis_row$nmr_of_weeks_analysed[1]))
          if (!is.na(prev_nmr_weeks_analysis)) {
            current_stats$nmr_of_weeks_analysed[i] <- prev_nmr_weeks_analysis + 1L
          } else {
            current_stats$nmr_of_weeks_analysed[i] <- 1L
          }
        }
      }
    }
  }
  message(paste(" ✓ Calculated CV_Trend_Short_Term:", cv_trends_calculated, "fields"))
  message(paste(" ✓ Calculated Four_week_trend:", four_week_trends_calculated, "fields"))
  message(paste(" ✓ Calculated CV_Trend_Long_Term:", cv_long_term_calculated, "fields"))
  return(current_stats)
}
# ============================================================================
# INTERNAL HELPER FUNCTIONS (from 80_kpi_utils.R) - DO NOT DELETE
# ============================================================================
# Moran's I thresholds used by calculate_spatial_autocorrelation() to label
# the degree of spatial clustering of CI values within a field.
MORAN_THRESHOLD_HIGH <- 0.95 # Very strong clustering (problematic patterns)
MORAN_THRESHOLD_MODERATE <- 0.85 # Moderate clustering
MORAN_THRESHOLD_LOW <- 0.7 # Normal field continuity
#' Calculate coefficient of variation (sd / mean) for uniformity assessment
#'
#' @param values Numeric vector; NAs and non-finite values are dropped first.
#' @return CV as a proportion (not percent), or NA_real_ when fewer than two
#'   finite values remain, or when the mean is zero (CV is undefined there).
calculate_cv <- function(values) {
  values <- values[!is.na(values) & is.finite(values)]
  # Need at least two observations for a meaningful sd
  if (length(values) < 2) return(NA_real_)
  mean_val <- mean(values)
  # Guard against division by zero: return NA instead of Inf/NaN, matching
  # the mean > 0 guard used by the weekly statistics calculation.
  if (mean_val == 0) return(NA_real_)
  sd(values) / mean_val
}
#' Calculate the share of pixels whose value increased, decreased, or stayed
#' the same between two equal-length value vectors.
#'
#' @param current_values Numeric vector of current-period values.
#' @param previous_values Numeric vector of previous-period values (must be
#'   the same length as `current_values`).
#' @return list(positive_pct, negative_pct, stable_pct) in percent, or all-NA
#'   when the lengths differ or fewer than two valid differences remain.
calculate_change_percentages <- function(current_values, previous_values) {
  na_result <- list(positive_pct = NA, negative_pct = NA, stable_pct = NA)
  # Mismatched lengths mean the vectors are not comparable pixel-by-pixel
  if (length(current_values) != length(previous_values)) {
    return(na_result)
  }
  deltas <- current_values - previous_values
  deltas <- deltas[!is.na(deltas) & is.finite(deltas)]
  n_valid <- length(deltas)
  if (n_valid < 2) {
    return(na_result)
  }
  list(
    positive_pct = sum(deltas > 0) / n_valid * 100,
    negative_pct = sum(deltas < 0) / n_valid * 100,
    stable_pct = sum(deltas == 0) / n_valid * 100
  )
}
#' Calculate spatial autocorrelation (Moran's I) of CI values within a field
#'
#' Crops and masks the CI raster to the field boundary, converts the
#' remaining cells to points, and runs a k-nearest-neighbour Moran's I test
#' via spdep. Significant positive values indicate spatial clustering of
#' similar CI values.
#'
#' @param ci_raster SpatRaster containing the CI values.
#' @param field_boundary Boundary geometry used to crop/mask the raster.
#' @return list(morans_i, p_value, interpretation); interpretation is one of
#'   "insufficient_data", "random", "very_strong_clustering",
#'   "strong_clustering", "normal_continuity", "weak_clustering",
#'   "dispersed", "low_autocorrelation", or "error" when the test fails.
calculate_spatial_autocorrelation <- function(ci_raster, field_boundary) {
  tryCatch({
    field_raster <- terra::crop(ci_raster, field_boundary)
    field_raster <- terra::mask(field_raster, field_boundary)
    raster_points <- terra::as.points(field_raster, na.rm = TRUE)
    # Moran's I is unreliable with very few observations
    if (length(raster_points) < 10) {
      return(list(morans_i = NA, p_value = NA, interpretation = "insufficient_data"))
    }
    points_sf <- sf::st_as_sf(raster_points)
    coords <- sf::st_coordinates(points_sf)
    # k scales with the point count (~1 neighbour per 10 points), clamped to [4, 8]
    k_neighbors <- min(8, max(4, floor(nrow(coords) / 10)))
    knn_nb <- spdep::knearneigh(coords, k = k_neighbors)
    knn_listw <- spdep::nb2listw(spdep::knn2nb(knn_nb), style = "W", zero.policy = TRUE)
    # First attribute column of the points object holds the raster values
    ci_values <- points_sf[[1]]
    moran_result <- spdep::moran.test(ci_values, knn_listw, zero.policy = TRUE)
    morans_i <- moran_result$estimate[1]
    p_value <- moran_result$p.value
    # Non-significant results (p > 0.05) are labelled "random" regardless of
    # the statistic's magnitude; otherwise band by the MORAN_THRESHOLD_*
    # constants defined above.
    interpretation <- if (is.na(morans_i)) {
      "insufficient_data"
    } else if (p_value > 0.05) {
      "random"
    } else if (morans_i > MORAN_THRESHOLD_HIGH) {
      "very_strong_clustering"
    } else if (morans_i > MORAN_THRESHOLD_MODERATE) {
      "strong_clustering"
    } else if (morans_i > MORAN_THRESHOLD_LOW) {
      "normal_continuity"
    } else if (morans_i > 0.3) {
      "weak_clustering"
    } else if (morans_i < -0.3) {
      "dispersed"
    } else {
      "low_autocorrelation"
    }
    return(list(morans_i = morans_i, p_value = p_value, interpretation = interpretation))
  }, error = function(e) {
    warning(paste("Error calculating spatial autocorrelation:", e$message))
    return(list(morans_i = NA, p_value = NA, interpretation = "error"))
  })
}
#' Extract CI values from a (possibly multi-band) raster over a field
#'
#' @param ci_raster SpatRaster to extract from.
#' @param field_vect Field geometry passed to terra::extract().
#' @return Vector of extracted CI values: the "CI" column when present,
#'   otherwise the last column (terra puts its ID column first), otherwise
#'   the only column available.
extract_ci_values <- function(ci_raster, field_vect) {
  vals <- terra::extract(ci_raster, field_vect, fun = NULL)
  # Prefer the explicitly named CI column
  if ("CI" %in% names(vals)) {
    return(vals[, "CI"])
  }
  # Fall back to the last column (skips the leading ID column), or the sole
  # column when there is only one
  if (ncol(vals) > 1) {
    return(vals[, ncol(vals)])
  }
  vals[, 1]
}
#' Calculate current and previous week numbers using ISO 8601
#'
#' Uses %V (ISO week 01-53) and %G (ISO week-based year) so dates at year
#' boundaries are attributed to the correct week-year.
#'
#' @param report_date A Date (default: today) for which to compute the weeks.
#' @return list(current_week, previous_week, current_year, previous_year),
#'   all numeric.
calculate_week_numbers <- function(report_date = Sys.Date()) {
  current_week <- as.numeric(format(report_date, "%V"))
  current_year <- as.numeric(format(report_date, "%G"))
  # BUGFIX: derive the previous week from the date seven days earlier instead
  # of assuming 52 weeks per year -- ISO years can have 53 weeks (e.g. 2020),
  # so week 1 of 2021 must roll back to week 53 of 2020, not week 52.
  previous_date <- report_date - 7
  previous_week <- as.numeric(format(previous_date, "%V"))
  previous_year <- as.numeric(format(previous_date, "%G"))
  return(list(
    current_week = current_week,
    previous_week = previous_week,
    current_year = current_year,
    previous_year = previous_year
  ))
}
#' Load the CI raster for a single field (single-file or per-field layout)
#'
#' For the per-field layout (signalled by the marker attributes set by
#' load_weekly_ci_mosaic()), reads the field's own weekly mosaic and returns
#' band 5 when available, else band 1. For the single-file layout, crops and
#' masks the shared raster to the field boundary when one is supplied.
#'
#' @param ci_raster_or_obj Shared SpatRaster, or the per-field marker object.
#' @param field_name Field name, i.e. the per-field subdirectory name.
#' @param field_vect Optional field geometry for cropping (single-file mode).
#' @return A SpatRaster with the field's CI values, or NULL when the
#'   per-field mosaic is missing or unreadable.
load_field_ci_raster <- function(ci_raster_or_obj, field_name, field_vect = NULL) {
  per_field_flag <- attr(ci_raster_or_obj, "is_per_field")
  if (is.null(per_field_flag) || !per_field_flag) {
    # Single-file layout: clip to the field boundary if one was provided
    if (is.null(field_vect)) {
      return(ci_raster_or_obj)
    }
    return(terra::crop(ci_raster_or_obj, field_vect, mask = TRUE))
  }
  # Per-field layout: locate this field's weekly mosaic on disk
  mosaic_path <- file.path(attr(ci_raster_or_obj, "per_field_dir"),
                           field_name,
                           attr(ci_raster_or_obj, "week_file"))
  if (!file.exists(mosaic_path)) {
    return(NULL)
  }
  tryCatch({
    field_mosaic <- terra::rast(mosaic_path)
    # Band 5 holds CI in the standard mosaic layout; fall back to band 1
    if (terra::nlyr(field_mosaic) >= 5) field_mosaic[[5]] else field_mosaic[[1]]
  }, error = function(e) NULL)
}
#' Locate the weekly CI mosaic (single shared file or per-field directories)
#'
#' First tries a single "week_WW_YYYY.tif" directly inside mosaic_dir and
#' returns its band 5 named "CI". If that file is absent (or unreadable),
#' scans the field subdirectories for per-field mosaics and, when at least
#' one exists, returns a 1x1 placeholder raster carrying the attributes that
#' load_field_ci_raster() uses to locate each field's own file.
#'
#' @param week_num,year Week and year identifying the mosaic file.
#' @param mosaic_dir Directory containing either the single mosaic or the
#'   per-field subdirectories.
#' @return A CI SpatRaster, the per-field marker raster, or NULL.
load_weekly_ci_mosaic <- function(week_num, year, mosaic_dir) {
  week_file <- sprintf("week_%02d_%d.tif", week_num, year)
  single_path <- file.path(mosaic_dir, week_file)
  # Layout 1: one mosaic covering all fields; band 5 is the CI band
  if (file.exists(single_path)) {
    single_result <- tryCatch({
      ci_layer <- terra::rast(single_path)[[5]]
      names(ci_layer) <- "CI"
      ci_layer
    }, error = function(e) NULL)
    if (!is.null(single_result)) {
      return(single_result)
    }
    # Unreadable single file: fall through to the per-field search below
  }
  # Layout 2: one mosaic per field inside subdirectories of mosaic_dir
  if (dir.exists(mosaic_dir)) {
    subdirs <- list.dirs(mosaic_dir, full.names = FALSE, recursive = FALSE)
    subdirs <- subdirs[subdirs != ""]
    if (any(file.exists(file.path(mosaic_dir, subdirs, week_file)))) {
      # Placeholder raster tagged so load_field_ci_raster() can resolve each
      # field's own mosaic path lazily
      marker <- terra::rast(nrow = 1, ncol = 1, vals = NA)
      attr(marker, "per_field_dir") <- mosaic_dir
      attr(marker, "week_file") <- week_file
      attr(marker, "is_per_field") <- TRUE
      return(marker)
    }
  }
  NULL
}
#' Reshape raw model predictions into a tidy per-field data frame
#'
#' @param predictions Numeric vector of predicted yields (t/ha), as returned
#'   by stats::predict() for the trained caret/CAST model.
#' @param newdata Data frame the predictions were made for; must contain
#'   sub_field, field, DAH, cumulative_CI and season columns.
#' @return `newdata` left-joined (by field/sub_field/season) with the rounded
#'   predicted_Tcha column and Age_days (= DAH).
#' @note The rename(predicted_Tcha = ".") relies on magrittr's %>% naming the
#'   single column "." when a bare vector is piped into as.data.frame() --
#'   presumably this would break under the native |> pipe. TODO confirm
#'   before changing the pipe operator here.
prepare_predictions <- function(predictions, newdata) {
  return(predictions %>%
    as.data.frame() %>%
    dplyr::rename(predicted_Tcha = ".") %>%
    dplyr::mutate(
      sub_field = newdata$sub_field,
      field = newdata$field,
      Age_days = newdata$DAH,
      total_CI = round(newdata$cumulative_CI, 0),
      predicted_Tcha = round(predicted_Tcha, 0),
      season = newdata$season
    ) %>%
    dplyr::select(field, sub_field, Age_days, predicted_Tcha, season) %>%
    dplyr::left_join(., newdata, by = c("field", "sub_field", "season"))
  )
}
# ============================================================================
# YIELD PREDICTION KPI (SHARED ML-BASED MODEL FOR ALL CLIENT TYPES)
# ============================================================================
#' Helper function for graceful fallback when training data unavailable
#'
#' Produces the same structure as calculate_yield_prediction_kpi() but with
#' NA yield values (never fabricated numbers); only the total area is real.
#'
#' @param field_boundaries Field boundaries (sf or SpatVector)
#' @return List with summary and field_results (both with NA values)
create_fallback_result <- function(field_boundaries) {
  # Convert to SpatVector if needed (for terra::project)
  if (!inherits(field_boundaries, "SpatVector")) {
    field_boundaries <- terra::vect(field_boundaries)
  }
  # EPSG:6933 is an equal-area cylindrical CRS, so polygon areas are accurate
  field_boundaries_projected <- terra::project(field_boundaries, "EPSG:6933")
  field_areas <- terra::expanse(field_boundaries_projected) / 10000 # Convert m² to hectares
  total_area <- sum(field_areas)
  # Same row layout as the real KPI summary, with NA yields
  summary_result <- data.frame(
    field_groups = c("Top 25%", "Average", "Lowest 25%", "Total area forecasted"),
    count = c(0, 0, 0, nrow(field_boundaries)),
    value = c(NA_real_, NA_real_, NA_real_, round(total_area, 1)),
    stringsAsFactors = FALSE
  )
  # Empty field-level results with the expected column names/types
  field_results <- data.frame(
    field = character(0),
    sub_field = character(0),
    Age_days = numeric(0),
    yield_forecast_t_ha = numeric(0),
    season = numeric(0),
    stringsAsFactors = FALSE
  )
  return(list(summary = summary_result, field_results = field_results))
}
#' Calculate yield prediction KPI using Random Forest with Feature Selection
#'
#' Trains a Random Forest model on historical harvest data with cumulative CI,
#' days after harvest (DAH), and CI-per-day as predictors. Uses CAST::ffs() for
#' Forward Feature Selection. Predicts yields for mature fields (DAH >= DAH_MATURITY_THRESHOLD).
#'
#' @param field_boundaries Field boundaries (sf or SpatVector)
#' @param harvesting_data Data frame with harvest data including tonnage_ha column
#' @param cumulative_CI_vals_dir Directory containing "All_pivots_Cumulative_CI_quadrant_year_v2.rds"
#'
#' @return List with:
#'   - summary: Data frame with field_groups, count, value (quartiles and total area)
#'   - field_results: Data frame with field-level yield forecasts (yield_forecast_t_ha)
#'
#' @details
#' **Training Data Requirements:**
#' - cumulative_CI_vals_dir must contain "All_pivots_Cumulative_CI_quadrant_year_v2.rds"
#' - harvesting_data must have tonnage_ha column with numeric yield values
#' - Training stops gracefully if either is missing (returns NA values, not fake numbers)
#'
#' **Model Specifications:**
#' - Algorithm: Random Forest (caret + CAST)
#' - Feature Selection: Forward Feature Selection (CAST::ffs)
#' - Cross-validation: 5-fold CV
#' - Predictors: cumulative_CI, DAH, CI_per_day
#' - Mature field threshold: DAH >= DAH_MATURITY_THRESHOLD (8 months, ~240 days)
#' - Output: Field-level yield forecasts grouped by quartile
#'
#' **Error Handling:**
#' - Missing tonnage_ha: Returns graceful fallback with NA (not zero) values
#' - No training data: Logs WARNING, returns empty field_results
#' - RDS file missing: Returns graceful fallback
#' - Prediction errors: Wrapped in tryCatch, returns fallback on failure
calculate_yield_prediction_kpi <- function(field_boundaries, harvesting_data, cumulative_CI_vals_dir) {
  safe_log("Calculating yield prediction KPI using Random Forest with Feature Selection")
  tryCatch({
    # Check if tonnage_ha column is present and has valid data
    if (is.null(harvesting_data) ||
        !("tonnage_ha" %in% names(harvesting_data)) ||
        all(is.na(harvesting_data$tonnage_ha))) {
      safe_log("No harvest data available: lacking tonnage_ha column or all values are NA", "WARNING")
      return(create_fallback_result(field_boundaries))
    }
    # Check if CI quadrant RDS file exists
    ci_quadrant_path <- file.path(cumulative_CI_vals_dir, "All_pivots_Cumulative_CI_quadrant_year_v2.rds")
    if (!file.exists(ci_quadrant_path)) {
      safe_log(paste("CI quadrant file not found at:", ci_quadrant_path), "WARNING")
      return(create_fallback_result(field_boundaries))
    }
    # Load CI quadrant data and fill missing field/sub_field values
    CI_quadrant <- readRDS(ci_quadrant_path) %>%
      dplyr::group_by(model) %>%
      tidyr::fill(field, sub_field, .direction = "downup") %>%
      dplyr::ungroup()
    # Rename year column to season for consistency
    harvesting_data_renamed <- harvesting_data %>% dplyr::rename(season = year)
    # Join CI and yield data; slice(which.max(DAH)) keeps the most mature
    # observation per (sub_field, season) group.
    # NOTE(review): the result stays grouped by (sub_field, season) until the
    # as.data.frame() calls below.
    CI_and_yield <- dplyr::left_join(CI_quadrant, harvesting_data_renamed,
                                     by = c("field", "sub_field", "season")) %>%
      dplyr::group_by(sub_field, season) %>%
      dplyr::slice(which.max(DAH)) %>%
      dplyr::select(field, sub_field, tonnage_ha, cumulative_CI, DAH, season, sub_area) %>%
      dplyr::mutate(CI_per_day = ifelse(DAH > 0, cumulative_CI / DAH, NA_real_))
    # Define predictors and response variables
    predictors <- c("cumulative_CI", "DAH", "CI_per_day")
    response <- "tonnage_ha"
    # Prepare training dataset (fields with harvest data)
    CI_and_yield_test <- CI_and_yield %>%
      as.data.frame() %>%
      dplyr::filter(!is.na(tonnage_ha))
    # Check if we have minimum training data
    if (nrow(CI_and_yield_test) == 0) {
      safe_log("No training data available: no fields with tonnage_ha observations", "WARNING")
      return(create_fallback_result(field_boundaries))
    }
    # Pre-clean training data: remove rows with any NAs in predictors or response
    # This is required because CAST::ffs doesn't support na.rm parameter
    CI_and_yield_test <- CI_and_yield_test %>%
      dplyr::filter(!dplyr::if_any(dplyr::all_of(c(predictors, response)), is.na))
    if (nrow(CI_and_yield_test) == 0) {
      safe_log("No complete training data after removing NAs in predictors/response", "WARNING")
      return(create_fallback_result(field_boundaries))
    }
    # Prepare prediction dataset (fields without harvest data, mature fields only).
    # DAH_MATURITY_THRESHOLD is expected as a global (parameters_project.R);
    # ~240 days per the roxygen above -- TODO confirm.
    prediction_yields <- CI_and_yield %>%
      as.data.frame() %>%
      dplyr::filter(is.na(tonnage_ha) & DAH >= DAH_MATURITY_THRESHOLD) # Mature fields only
    # Configure model training parameters
    ctrl <- caret::trainControl(
      method = "cv",
      savePredictions = TRUE,
      allowParallel = TRUE,
      number = 5,
      verboseIter = FALSE
    )
    # Train the model with forward feature selection
    set.seed(202) # For reproducibility
    safe_log("Training Random Forest model with Forward Feature Selection...")
    model_ffs_rf <- CAST::ffs(
      CI_and_yield_test[, predictors],
      CI_and_yield_test[, response],
      method = "rf",
      trControl = ctrl,
      importance = TRUE,
      withinSE = TRUE,
      tuneLength = 5
    )
    # Predict yields on validation data (same as training data for RMSE calculation)
    pred_ffs_rf <- prepare_predictions(
      stats::predict(model_ffs_rf, newdata = CI_and_yield_test),
      CI_and_yield_test
    )
    # Extract cross-validated RMSE from the model object (more honest than in-sample RMSE)
    # The CAST::ffs model stores CV results in $results data frame
    # We extract the RMSE from the best model (lowest RMSE across folds)
    if (!is.null(model_ffs_rf$results) && "RMSE" %in% names(model_ffs_rf$results)) {
      # Get minimum RMSE from cross-validation results (best model from feature selection)
      rmse_value <- min(model_ffs_rf$results$RMSE, na.rm = TRUE)
      safe_log(paste("Yield prediction RMSE (cross-validated):", round(rmse_value, 2), "t/ha"))
    } else {
      # Fallback: compute in-sample RMSE if CV results unavailable, but label it clearly
      rmse_value <- sqrt(mean((pred_ffs_rf$predicted_Tcha - CI_and_yield_test$tonnage_ha)^2, na.rm = TRUE))
      safe_log(paste("Yield prediction RMSE (in-sample/training):", round(rmse_value, 2), "t/ha"))
    }
    # Predict yields for current season (mature fields >= DAH_MATURITY_THRESHOLD days)
    if (nrow(prediction_yields) > 0) {
      pred_rf_current_season <- prepare_predictions(
        stats::predict(model_ffs_rf, newdata = prediction_yields),
        prediction_yields
      ) %>%
        dplyr::filter(Age_days >= DAH_MATURITY_THRESHOLD) %>%
        dplyr::select(c("field", "Age_days", "predicted_Tcha", "season"))
    } else {
      pred_rf_current_season <- data.frame()
    }
    # Calculate summary statistics for KPI
    if (nrow(pred_rf_current_season) > 0) {
      safe_log(paste("Number of fields with yield predictions:", nrow(pred_rf_current_season)))
      safe_log(paste("Predicted yield range:",
                     round(min(pred_rf_current_season$predicted_Tcha, na.rm = TRUE), 1),
                     "-",
                     round(max(pred_rf_current_season$predicted_Tcha, na.rm = TRUE), 1),
                     "t/ha"))
      # Calculate quartiles for grouping
      yield_quartiles <- quantile(pred_rf_current_season$predicted_Tcha,
                                  probs = c(0.25, 0.5, 0.75), na.rm = TRUE)
      # Count fields in each quartile
      top_25_count <- sum(pred_rf_current_season$predicted_Tcha >= yield_quartiles[3], na.rm = TRUE)
      average_count <- sum(pred_rf_current_season$predicted_Tcha >= yield_quartiles[1] &
                             pred_rf_current_season$predicted_Tcha < yield_quartiles[3], na.rm = TRUE)
      lowest_25_count <- sum(pred_rf_current_season$predicted_Tcha < yield_quartiles[1], na.rm = TRUE)
      # Calculate total area
      # NOTE(review): field_boundaries_vect is only used by the non-sf branch
      # below; the conversion is redundant for sf inputs.
      if (!inherits(field_boundaries, "SpatVector")) {
        field_boundaries_vect <- terra::vect(field_boundaries)
      } else {
        field_boundaries_vect <- field_boundaries
      }
      # Handle both sf and SpatVector inputs for area calculation
      # (EPSG:6933 is an equal-area CRS, so areas are accurate)
      if (inherits(field_boundaries, "sf")) {
        field_boundaries_projected <- sf::st_transform(field_boundaries, "EPSG:6933")
        field_areas <- sf::st_area(field_boundaries_projected) / 10000 # m² to hectares
      } else {
        field_boundaries_projected <- terra::project(field_boundaries_vect, "EPSG:6933")
        field_areas <- terra::expanse(field_boundaries_projected) / 10000
      }
      total_area <- sum(as.numeric(field_areas))
      safe_log(paste("Total area:", round(total_area, 1), "hectares"))
      # Build summary result
      result <- data.frame(
        field_groups = c("Top 25%", "Average", "Lowest 25%", "Total area forecasted"),
        count = c(top_25_count, average_count, lowest_25_count, nrow(field_boundaries)),
        value = c(round(yield_quartiles[3], 1), round(yield_quartiles[2], 1),
                  round(yield_quartiles[1], 1), round(total_area, 1)),
        stringsAsFactors = FALSE
      )
      # Prepare field-level results
      field_level_results <- pred_rf_current_season %>%
        dplyr::select(field, Age_days, predicted_Tcha, season) %>%
        dplyr::rename(yield_forecast_t_ha = predicted_Tcha)
      safe_log("✓ Yield prediction complete")
      return(list(summary = result, field_results = field_level_results))
    } else {
      safe_log(paste("No fields meet maturity threshold (DAH >=", DAH_MATURITY_THRESHOLD, ") for prediction"), "WARNING")
      return(list(summary = create_fallback_result(field_boundaries)$summary,
                  field_results = data.frame()))
    }
  }, error = function(e) {
    safe_log(paste("Error in yield prediction:", e$message), "ERROR")
    return(create_fallback_result(field_boundaries))
  })
}