# 80_UTILS_COMMON.R
# ============================================================================
# SHARED UTILITY FUNCTIONS FOR ALL CLIENT TYPES (SCRIPT 80)
#
# Contains helper and infrastructure functions used by both AURA and ANGATA
# workflows:
#   - Statistical categorization and calculations
#   - Tile operations and data loading
#   - Field statistics extraction
#   - Week/year calculations for consistent date handling
#   - Excel/CSV/RDS export utilities
#   - Yield prediction using ML models (Random Forest with Feature Selection)
#
# Used by: 80_calculate_kpis.R, all client-specific utils files
#
# NOTE: Libraries required by yield prediction (caret, CAST, here) are loaded
# in the main script 80_calculate_kpis.R, not here. This keeps dependencies
# centralized in the orchestrator script.
# ============================================================================

# ============================================================================
# CONSTANTS (from 80_calculate_kpis.R)
# ============================================================================

# Four-week trend thresholds: bounds on the *average weekly CI change*
# consumed by categorize_four_week_trend. Units: CI points per week.
FOUR_WEEK_TREND_STRONG_GROWTH_MIN <- 0.5   # avg change >= 0.5 -> "strong growth"
FOUR_WEEK_TREND_GROWTH_MIN <- 0.1          # lower bound of the "growth" band
FOUR_WEEK_TREND_GROWTH_MAX <- 0.5          # upper bound of the "growth" band
FOUR_WEEK_TREND_NO_GROWTH_RANGE <- 0.1     # |avg change| <= 0.1 -> "no growth"
FOUR_WEEK_TREND_DECLINE_MAX <- -0.1        # upper bound of the "decline" band
FOUR_WEEK_TREND_DECLINE_MIN <- -0.5        # lower bound of the "decline" band
FOUR_WEEK_TREND_STRONG_DECLINE_MAX <- -0.5 # at/below this -> "strong decline"

# CV Trend thresholds (8-week slope interpretation, consumed by
# categorize_cv_slope).
# NOTE(review): several of these bounds (STRONG_IMPROVEMENT_MIN,
# IMPROVEMENT_MAX, PATCHINESS_MIN, SEVERE_MIN) are not referenced by the
# visible categorizer — confirm the intended band layout.
CV_SLOPE_STRONG_IMPROVEMENT_MIN <- -0.03
CV_SLOPE_IMPROVEMENT_MIN <- -0.02
CV_SLOPE_IMPROVEMENT_MAX <- -0.01
CV_SLOPE_HOMOGENOUS_MIN <- -0.01
CV_SLOPE_HOMOGENOUS_MAX <- 0.01
CV_SLOPE_PATCHINESS_MIN <- 0.01
CV_SLOPE_PATCHINESS_MAX <- 0.02
CV_SLOPE_SEVERE_MIN <- 0.02

# Percentile bounds used for CI confidence ranges (10th / 90th percentile)
CI_PERCENTILE_LOW <- 0.10
CI_PERCENTILE_HIGH <- 0.90

# Phase definitions (used by get_phase_by_age). Ages are in weeks.
# NOTE(review): the age ranges overlap (e.g. weeks 4-6 fall in both
# "Germination" and "Tillering", week 39 in both "Grand Growth" and
# "Maturation"); get_phase_by_age returns the FIRST matching row.
PHASE_DEFINITIONS <- data.frame(
  phase = c("Germination", "Tillering", "Grand Growth", "Maturation"),
  age_start = c(0, 4, 17, 39),
  age_end = c(6, 16, 39, 200),
  stringsAsFactors = FALSE
)
# ============================================================================
# WEEK/YEAR CALCULATION HELPERS (Consistent across all scripts)
# ============================================================================

#' Calculate week and year for a given lookback offset
#'
#' Handles ISO 8601 week numbering with proper year wrapping when crossing
#' year boundaries (e.g., week 01/2026 -> week 52/2025). ISO week-years
#' contain 52 or 53 weeks; each year wrapped into is sized via the ISO week
#' of Dec 28, which always falls in the last week of its year.
#'
#' @param current_week ISO week number (1-53)
#' @param current_year ISO week year (from format(..., "%G"))
#' @param offset_weeks Number of weeks to go back (0 = current week,
#'   1 = previous week, etc.)
#'
#' @return List with: week (ISO week number), year (ISO week year)
#'
#' @details
#' This is the authoritative week/year calculation function.
#' Used by:
#'   - load_historical_field_data() - to find RDS/CSV files for 4-week lookback
#'   - Script 80 main - to calculate previous week with year wrapping
#'   - Any other script needing to walk backwards through weeks
#'
#' Example: Week 01/2026, offset=1 -> returns list(week=52, year=2025)
calculate_target_week_and_year <- function(current_week, current_year, offset_weeks = 0) {
  target_week <- current_week - offset_weeks
  target_year <- current_year

  # Wrap into earlier years while the offset pushed us before week 1.
  # FIX: previously every year was assumed to have 52 weeks; 53-week ISO
  # years (e.g. 2015, 2020) were walked back to the wrong week number.
  while (target_week < 1) {
    target_year <- target_year - 1
    # Dec 28 is always in the last ISO week of its year (ISO 8601 rule)
    weeks_in_year <- as.integer(format(as.Date(sprintf("%d-12-28", target_year)), "%V"))
    target_week <- target_week + weeks_in_year
  }

  return(list(week = target_week, year = target_year))
}

# ============================================================================
# TILE-AWARE HELPER FUNCTIONS
# ============================================================================

#' Get tile IDs that intersect with a field geometry
#'
#' Performs a bounding-box overlap test between the field and every tile.
#'
#' @param field_geom Field geometry (sf object or terra SpatVector)
#' @param tile_grid data.frame with columns id, xmin, xmax, ymin, ymax
#'   (as produced by build_tile_grid)
#' @param field_id Optional field identifier (unused; kept for interface
#'   compatibility with existing callers)
#' @return Numeric vector of tile IDs whose extents overlap the field bbox
get_tile_ids_for_field <- function(field_geom, tile_grid, field_id = NULL) {
  if (inherits(field_geom, "sf")) {
    field_bbox <- sf::st_bbox(field_geom)
    field_xmin <- field_bbox["xmin"]
    field_xmax <- field_bbox["xmax"]
    field_ymin <- field_bbox["ymin"]
    field_ymax <- field_bbox["ymax"]
  } else if (inherits(field_geom, "SpatVector")) {
    field_bbox <- terra::ext(field_geom)
    field_xmin <- field_bbox$xmin
    field_xmax <- field_bbox$xmax
    field_ymin <- field_bbox$ymin
    field_ymax <- field_bbox$ymax
  } else {
    stop("field_geom must be sf or terra::vect object")
  }

  # Keep every tile that is NOT entirely left/right/below/above the field
  intersecting_tiles <- tile_grid$id[
    !(tile_grid$xmax < field_xmin | tile_grid$xmin > field_xmax |
        tile_grid$ymax < field_ymin | tile_grid$ymin > field_ymax)
  ]

  return(as.numeric(intersecting_tiles))
}

#' Load and merge tiles for a specific field
#'
#' Reads the CI band (band 5) from each existing tile GeoTIFF for the given
#' week/year and mosaics overlapping tiles with fun = "max".
#'
#' @param field_geom Field geometry (currently only used by callers for
#'   tile selection; not referenced here)
#' @param tile_ids Numeric vector of tile IDs to load
#' @param week_num ISO week number
#' @param year Year used in the tile filename
#' @param mosaic_dir Directory containing week_WW_YYYY_ID.tif files
#' @return A SpatRaster (single merged CI band), or NULL when no tile loads
load_tiles_for_field <- function(field_geom, tile_ids, week_num, year, mosaic_dir) {
  if (length(tile_ids) == 0) {
    return(NULL)
  }

  tiles_list <- list()
  for (tile_id in sort(tile_ids)) {
    tile_filename <- sprintf("week_%02d_%d_%02d.tif", week_num, year, tile_id)
    tile_path <- file.path(mosaic_dir, tile_filename)
    if (file.exists(tile_path)) {
      tryCatch({
        tile_rast <- terra::rast(tile_path)
        ci_band <- terra::subset(tile_rast, 5)  # band 5 = CI
        tiles_list[[length(tiles_list) + 1]] <- ci_band
      }, error = function(e) {
        message(paste(" Warning: Could not load tile", tile_id, ":", e$message))
      })
    }
  }

  if (length(tiles_list) == 0) {
    return(NULL)
  }
  if (length(tiles_list) == 1) {
    return(tiles_list[[1]])
  } else {
    tryCatch({
      rsrc <- terra::sprc(tiles_list)
      merged <- terra::mosaic(rsrc, fun = "max")
      return(merged)
    }, error = function(e) {
      # Best-effort fallback: a single tile is better than nothing
      message(paste(" Warning: Could not merge tiles:", e$message))
      return(tiles_list[[1]])
    })
  }
}

#' Build tile grid from available tile files
#'
#' Scans mosaic_dir for week_WW_YYYY_ID.tif tiles and records each tile's
#' extent. Handles optional grid-size subdirectories (e.g. "5x5").
#'
#' @param mosaic_dir Directory with tile GeoTIFFs (or NxN subdirectories)
#' @param week_num ISO week number
#' @param year Year used in the tile filename
#' @return List with: tile_grid (data.frame id/xmin/xmax/ymin/ymax),
#'   mosaic_dir (possibly redirected into the grid subdirectory),
#'   grid_size (detected "NxN" string or NA)
build_tile_grid <- function(mosaic_dir, week_num, year) {
  # Handle grid-size subdirectories (e.g., weekly_tile_max/5x5/)
  detected_grid_size <- NA
  if (dir.exists(mosaic_dir)) {
    subfolders <- list.dirs(mosaic_dir, full.names = FALSE, recursive = FALSE)
    grid_patterns <- grep("^\\d+x\\d+$", subfolders, value = TRUE)
    if (length(grid_patterns) > 0) {
      detected_grid_size <- grid_patterns[1]
      mosaic_dir <- file.path(mosaic_dir, detected_grid_size)
      message(paste(" Using grid-size subdirectory:", detected_grid_size))
    }
  }

  tile_pattern <- sprintf("week_%02d_%d_([0-9]{2})\\.tif", week_num, year)
  tile_files <- list.files(mosaic_dir, pattern = tile_pattern, full.names = TRUE)
  if (length(tile_files) == 0) {
    stop(paste("No tile files found for week", week_num, year, "in", mosaic_dir))
  }

  # Collect one row per readable tile, then bind once (avoids rbind-in-loop)
  rows <- list()
  for (tile_file in tile_files) {
    tryCatch({
      matches <- regmatches(basename(tile_file),
                            regexpr("_([0-9]{2})\\.tif$", basename(tile_file)))
      if (length(matches) > 0) {
        # FIX: sub("_|\\.tif", ...) only removed the leading "_" (sub
        # replaces the FIRST match), leaving "NN.tif", so as.integer()
        # produced NA for every tile id. Extract the captured digits.
        tile_id <- as.integer(sub("^_([0-9]{2})\\.tif$", "\\1", matches[1]))
        tile_rast <- terra::rast(tile_file)
        tile_ext <- terra::ext(tile_rast)
        rows[[length(rows) + 1]] <- data.frame(
          id = tile_id,
          xmin = tile_ext$xmin,
          xmax = tile_ext$xmax,
          ymin = tile_ext$ymin,
          ymax = tile_ext$ymax,
          stringsAsFactors = FALSE
        )
      }
    }, error = function(e) {
      message(paste(" Warning: Could not process tile", basename(tile_file), ":", e$message))
    })
  }

  if (length(rows) == 0) {
    stop("Could not extract extents from any tile files")
  }
  tile_grid <- do.call(rbind, rows)

  return(list(
    tile_grid = tile_grid,
    mosaic_dir = mosaic_dir,
    grid_size = detected_grid_size
  ))
}

# ============================================================================
# STATISTICAL CATEGORIZATION FUNCTIONS
# ============================================================================

#' Categorize four-week CI trend
#'
#' Averages week-to-week CI changes and maps the result onto the
#' FOUR_WEEK_TREND_* bands.
#'
#' @param ci_values_list Numeric vector of mean CI values, oldest first
#' @return One of "strong growth", "growth", "no growth", "decline",
#'   "strong decline"; NA_character_ when fewer than 2 non-NA values
categorize_four_week_trend <- function(ci_values_list) {
  if (is.null(ci_values_list) || length(ci_values_list) < 2) {
    return(NA_character_)
  }
  ci_values_list <- ci_values_list[!is.na(ci_values_list)]
  if (length(ci_values_list) < 2) {
    return(NA_character_)
  }

  weekly_changes <- diff(ci_values_list)
  avg_weekly_change <- mean(weekly_changes, na.rm = TRUE)

  if (avg_weekly_change >= FOUR_WEEK_TREND_STRONG_GROWTH_MIN) {
    return("strong growth")
  } else if (avg_weekly_change >= FOUR_WEEK_TREND_GROWTH_MIN &&
             avg_weekly_change < FOUR_WEEK_TREND_GROWTH_MAX) {
    return("growth")
  } else if (abs(avg_weekly_change) <= FOUR_WEEK_TREND_NO_GROWTH_RANGE) {
    return("no growth")
  } else if (avg_weekly_change <= FOUR_WEEK_TREND_DECLINE_MAX &&
             avg_weekly_change > FOUR_WEEK_TREND_DECLINE_MIN) {
    # FIX: the decline band previously compared against DECLINE_MIN and
    # STRONG_DECLINE_MAX (both -0.5) — a condition that can never be TRUE —
    # so moderate declines in (-0.5, -0.1) fell through to "no growth".
    return("decline")
  } else if (avg_weekly_change <= FOUR_WEEK_TREND_STRONG_DECLINE_MAX) {
    # FIX: "<=" (was "<") so exactly -0.5 counts as strong decline instead
    # of falling through to the "no growth" fallback.
    return("strong decline")
  } else {
    return("no growth")  # defensive fallback (unreachable for finite input)
  }
}

#' Round cloud coverage to interval categories
#'
#' @param cloud_pct_clear Percentage of clear pixels (0-100), or NA
#' @return Interval label such as "0-10%", "90-95%", "95-100%"; NA on NA input
round_cloud_to_intervals <- function(cloud_pct_clear) {
  if (is.na(cloud_pct_clear)) {
    return(NA_character_)
  }
  if (cloud_pct_clear < 10) return("0-10%")
  if (cloud_pct_clear < 20) return("10-20%")
  if (cloud_pct_clear < 30) return("20-30%")
  if (cloud_pct_clear < 40) return("30-40%")
  if (cloud_pct_clear < 50) return("40-50%")
  if (cloud_pct_clear < 60) return("50-60%")
  if (cloud_pct_clear < 70) return("60-70%")
  if (cloud_pct_clear < 80) return("70-80%")
  if (cloud_pct_clear < 90) return("80-90%")
  if (cloud_pct_clear < 95) return("90-95%")
  return("95-100%")
}

#' Get CI percentile range (10th to 90th)
#'
#' @param ci_values Numeric vector of pixel CI values
#' @return Formatted "p10-p90" string, or NA_character_ when no valid values
get_ci_percentiles <- function(ci_values) {
  if (is.null(ci_values) || length(ci_values) == 0) {
    return(NA_character_)
  }
  ci_values <- ci_values[!is.na(ci_values)]
  if (length(ci_values) == 0) {
    return(NA_character_)
  }
  p10 <- quantile(ci_values, CI_PERCENTILE_LOW, na.rm = TRUE)
  p90 <- quantile(ci_values, CI_PERCENTILE_HIGH, na.rm = TRUE)
  return(sprintf("%.1f-%.1f", p10, p90))
}

#' Calculate short-term CV trend (current week vs previous week)
#'
#' @param cv_current Current-week coefficient of variation
#' @param cv_previous Previous-week coefficient of variation
#' @return Difference (current - previous) rounded to 4 decimals; NA when
#'   either input is NA
calculate_cv_trend <- function(cv_current, cv_previous) {
  if (is.na(cv_current) || is.na(cv_previous)) {
    return(NA_real_)
  }
  return(round(cv_current - cv_previous, 4))
}

#' Calculate four-week CI trend
#'
#' Total change between the first and last non-NA mean CI values.
#'
#' @param mean_ci_values Numeric vector of weekly mean CI values, oldest first
#' @return last - first, rounded to 2 decimals; NA when < 2 non-NA values
calculate_four_week_trend <- function(mean_ci_values) {
  if (is.null(mean_ci_values) || length(mean_ci_values) == 0) {
    return(NA_real_)
  }
  ci_clean <- mean_ci_values[!is.na(mean_ci_values)]
  if (length(ci_clean) < 2) {
    return(NA_real_)
  }
  trend <- ci_clean[length(ci_clean)] - ci_clean[1]
  return(round(trend, 2))
}
#' Categorize CV slope (8-week regression) into field uniformity interpretation
#'
#' Maps an 8-week CV regression slope onto a uniformity label using the
#' CV_SLOPE_* thresholds.
#'
#' NOTE(review): the two middle branches both return "Homogenous growth", so
#' the (-0.02, -0.01) band defined by CV_SLOPE_IMPROVEMENT_MIN/MAX has no
#' distinct label, and CV_SLOPE_STRONG_IMPROVEMENT_MIN / CV_SLOPE_SEVERE_MIN
#' are never referenced — confirm the intended band layout before changing.
#'
#' @param slope Numeric regression slope (CV per week), or NA
#' @return Uniformity label, or NA_character_ on NA input
categorize_cv_slope <- function(slope) {
  if (is.na(slope)) {
    return(NA_character_)
  }
  if (slope <= CV_SLOPE_IMPROVEMENT_MIN) {
    return("Excellent uniformity")
  } else if (slope < CV_SLOPE_HOMOGENOUS_MIN) {
    return("Homogenous growth")
  } else if (slope <= CV_SLOPE_HOMOGENOUS_MAX) {
    return("Homogenous growth")
  } else if (slope <= CV_SLOPE_PATCHINESS_MAX) {
    return("Minor patchiness")
  } else {
    return("Severe fragmentation")
  }
}

#' Calculate 8-week CV trend via linear regression slope
#'
#' Fits lm(cv ~ week_index) over the non-NA CV values and returns the slope.
#'
#' @param cv_values Numeric vector of weekly CV values, oldest first
#' @return Slope rounded to 4 decimals; NA_real_ when < 2 non-NA values or
#'   when the fit errors
calculate_cv_trend_long_term <- function(cv_values) {
  if (is.null(cv_values) || length(cv_values) == 0) {
    return(NA_real_)
  }
  cv_clean <- cv_values[!is.na(cv_values)]
  if (length(cv_clean) < 2) {
    return(NA_real_)
  }
  # Week index is simply 1..n over the retained values; gaps from dropped
  # NA weeks are not accounted for in the regression
  weeks <- seq_along(cv_clean)
  tryCatch({
    lm_fit <- lm(cv_clean ~ weeks)
    slope <- coef(lm_fit)["weeks"]
    return(round(as.numeric(slope), 4))
  }, error = function(e) {
    return(NA_real_)
  })
}

#' Calculate Gap Filling Score KPI (2σ method)
#'
#' For each field, the gap score is the percentage of valid pixels whose CI
#' lies more than 2 standard deviations below the field median.
#'
#' Relies on extract_ci_values(), a helper defined elsewhere in this project
#' (presumably returns a numeric vector of pixel CI values — confirm).
#'
#' @param ci_raster Current week CI raster (single band)
#' @param field_boundaries Field boundaries (sf or SpatVector)
#' @return List with summary data frame (counts/percent per gap_level) and
#'   field-level results data frame
calculate_gap_filling_kpi <- function(ci_raster, field_boundaries) {
  # Handle both sf and SpatVector inputs
  if (!inherits(field_boundaries, "SpatVector")) {
    field_boundaries_vect <- terra::vect(field_boundaries)
  } else {
    field_boundaries_vect <- field_boundaries
  }

  field_results <- data.frame()
  for (i in seq_len(nrow(field_boundaries))) {
    # Field identifiers are optional columns on the boundaries object
    field_name <- if ("field" %in% names(field_boundaries)) field_boundaries$field[i] else NA_character_
    sub_field_name <- if ("sub_field" %in% names(field_boundaries)) field_boundaries$sub_field[i] else NA_character_
    field_vect <- field_boundaries_vect[i]

    # Extract CI values using helper function
    ci_values <- extract_ci_values(ci_raster, field_vect)
    valid_values <- ci_values[!is.na(ci_values) & is.finite(ci_values)]

    if (length(valid_values) > 1) {
      # Gap score using 2σ below median to detect outliers
      median_ci <- median(valid_values)
      sd_ci <- sd(valid_values)
      outlier_threshold <- median_ci - (2 * sd_ci)
      low_ci_pixels <- sum(valid_values < outlier_threshold)
      total_pixels <- length(valid_values)
      gap_score <- round((low_ci_pixels / total_pixels) * 100, 2)
      # Classify gap severity
      gap_level <- dplyr::case_when(
        gap_score < 10 ~ "Minimal",
        gap_score < 25 ~ "Moderate",
        TRUE ~ "Significant"
      )
      field_results <- rbind(field_results, data.frame(
        field = field_name,
        sub_field = sub_field_name,
        gap_level = gap_level,
        gap_score = gap_score,
        mean_ci = mean(valid_values),
        outlier_threshold = outlier_threshold
      ))
    } else {
      # Fewer than 2 valid pixels: emit an NA row so the field still appears
      field_results <- rbind(field_results, data.frame(
        field = field_name,
        sub_field = sub_field_name,
        gap_level = NA_character_,
        gap_score = NA_real_,
        mean_ci = NA_real_,
        outlier_threshold = NA_real_
      ))
    }
  }

  # Summarize results
  gap_summary <- field_results %>%
    dplyr::group_by(gap_level) %>%
    dplyr::summarise(field_count = n(), .groups = 'drop') %>%
    dplyr::mutate(percent = round((field_count / sum(field_count)) * 100, 1))

  return(list(summary = gap_summary, field_results = field_results))
}

#' Calculate gap filling scores for all per-field mosaics (wrapper)
#'
#' This wrapper handles per-field mosaic structure by iterating over
#' individual field files and calling the basic KPI function. The field id
#' is taken from the parent directory name of each mosaic file.
#'
#' @param per_field_files Character vector of paths to per-field mosaic TIFFs
#' @param field_boundaries_sf sf object with field geometries
#' @return data.frame with Field_id and gap_score columns, or NULL when
#'   nothing could be scored
calculate_gap_scores <- function(per_field_files, field_boundaries_sf) {
  message("\nCalculating gap filling scores (2σ method)...")
  message(paste(" Using per-field mosaics for", length(per_field_files), "fields"))

  field_boundaries_by_id <- split(field_boundaries_sf, field_boundaries_sf$field)

  # Per-file worker: returns a one-row data.frame (NA gap_score on failure)
  process_gap_for_field <- function(field_file) {
    field_id <- basename(dirname(field_file))
    field_bounds <- field_boundaries_by_id[[field_id]]
    if (is.null(field_bounds) || nrow(field_bounds) == 0) {
      return(data.frame(Field_id = field_id, gap_score = NA_real_))
    }
    tryCatch({
      field_raster <- terra::rast(field_file)
      ci_band_name <- "CI"
      if (!(ci_band_name %in% names(field_raster))) {
        return(data.frame(Field_id = field_id, gap_score = NA_real_))
      }
      field_ci_band <- field_raster[[ci_band_name]]
      names(field_ci_band) <- "CI"
      gap_result <- calculate_gap_filling_kpi(field_ci_band, field_bounds)
      if (is.null(gap_result) || is.null(gap_result$field_results) || nrow(gap_result$field_results) == 0) {
        return(data.frame(Field_id = field_id, gap_score = NA_real_))
      }
      gap_scores <- gap_result$field_results
      gap_scores$Field_id <- gap_scores$field
      gap_scores <- gap_scores[, c("Field_id", "gap_score")]
      # Average sub-field scores into one score per field id
      stats::aggregate(gap_score ~ Field_id, data = gap_scores, FUN = function(x) mean(x, na.rm = TRUE))
    }, error = function(e) {
      message(paste(" WARNING: Gap score failed for field", field_id, ":", e$message))
      data.frame(Field_id = field_id, gap_score = NA_real_)
    })
  }

  # Process fields sequentially with progress bar
  message(" Processing gap scores for ", length(per_field_files), " fields...")
  pb <- utils::txtProgressBar(min = 0, max = length(per_field_files), style = 3, width = 50)
  results_list <- lapply(seq_along(per_field_files), function(idx) {
    result <- process_gap_for_field(per_field_files[[idx]])
    utils::setTxtProgressBar(pb, idx)
    result
  })
  close(pb)

  gap_scores_df <- dplyr::bind_rows(results_list)
  if (!is.null(gap_scores_df) && nrow(gap_scores_df) > 0) {
    # Collapse any duplicate Field_id rows (multiple files per field)
    gap_scores_df <- gap_scores_df %>%
      dplyr::group_by(Field_id) %>%
      dplyr::summarise(gap_score = mean(gap_score, na.rm = TRUE), .groups = "drop")
    message(paste(" ✓ Calculated gap scores for", nrow(gap_scores_df), "fields"))
    message(paste(" Gap score range:", round(min(gap_scores_df$gap_score, na.rm=TRUE), 2), "-", round(max(gap_scores_df$gap_score, na.rm=TRUE), 2), "%"))
  } else {
    message(" WARNING: No gap scores calculated from per-field mosaics")
    gap_scores_df <- NULL
  }

  return(gap_scores_df)
}
# ============================================================================
# HELPER FUNCTIONS
# ============================================================================

#' Get crop phase by age in weeks
#'
#' Looks the age up in PHASE_DEFINITIONS and returns the FIRST matching
#' phase (the defined age ranges overlap at their boundaries).
#'
#' @param age_weeks Crop age in weeks, or NA
#' @return Phase name, "Unknown" when no range matches, NA_character_ on NA
get_phase_by_age <- function(age_weeks) {
  if (is.na(age_weeks)) return(NA_character_)
  for (i in seq_len(nrow(PHASE_DEFINITIONS))) {
    if (age_weeks >= PHASE_DEFINITIONS$age_start[i] && age_weeks <= PHASE_DEFINITIONS$age_end[i]) {
      return(PHASE_DEFINITIONS$phase[i])
    }
  }
  return("Unknown")
}

#' Get status trigger based on CI values and field age
#'
#' Applies an ordered cascade of rules; the first rule that fires wins.
#' Order matters: germination checks (age <= 6) run before harvest_ready
#' (age >= 45), which runs before the stress/recovery/growth rules.
#'
#' @param ci_values Numeric vector of pixel CI values for the field
#' @param ci_change Week-over-week mean CI change (may be NA)
#' @param age_weeks Crop age in weeks (may be NA)
#' @return Trigger string (e.g. "harvest_ready"), or NA_character_ when no
#'   rule fires or inputs are unusable
get_status_trigger <- function(ci_values, ci_change, age_weeks) {
  if (is.na(age_weeks) || length(ci_values) == 0) return(NA_character_)
  ci_values <- ci_values[!is.na(ci_values)]
  if (length(ci_values) == 0) return(NA_character_)

  # Share of pixels above / at-or-above the CI = 2 germination threshold
  pct_above_2 <- sum(ci_values > 2) / length(ci_values) * 100
  pct_at_or_above_2 <- sum(ci_values >= 2) / length(ci_values) * 100
  # Coefficient of variation; only consumed by the whole-field stress rule
  ci_cv <- if (mean(ci_values, na.rm = TRUE) > 0) sd(ci_values) / mean(ci_values, na.rm = TRUE) else 0
  mean_ci <- mean(ci_values, na.rm = TRUE)

  if (age_weeks >= 0 && age_weeks <= 6) {
    if (pct_at_or_above_2 >= 70) {
      return("germination_complete")
    } else if (pct_above_2 > 10) {
      return("germination_started")
    }
  }
  if (age_weeks >= 45) {
    return("harvest_ready")
  }
  # Whole-field stress: a large drop with low spatial variability (uniform)
  if (age_weeks > 6 && !is.na(ci_change) && ci_change < -1.5 && ci_cv < 0.25) {
    return("stress_detected_whole_field")
  }
  if (age_weeks > 6 && !is.na(ci_change) && ci_change > 1.5) {
    return("strong_recovery")
  }
  if (age_weeks >= 4 && age_weeks < 39 && !is.na(ci_change) && ci_change > 0.2) {
    return("growth_on_track")
  }
  if (age_weeks >= 39 && age_weeks < 45 && mean_ci > 3.5) {
    return("maturation_progressing")
  }
  return(NA_character_)
}

#' Extract planting dates from harvesting data
#'
#' Takes the most recent season_start per field as its planting date.
#' Assumes harvesting_data has "field" and "season_start" columns — confirm
#' against the harvest.xlsx schema used by the caller.
#'
#' @param harvesting_data data.frame of harvest records (may be NULL/empty)
#' @param field_boundaries_sf Optional sf object used to emit an all-NA
#'   fallback frame when no harvest data is available
#' @return data.frame with field_id and planting_date columns, or NULL
extract_planting_dates <- function(harvesting_data, field_boundaries_sf = NULL) {
  if (is.null(harvesting_data) || nrow(harvesting_data) == 0) {
    message("Warning: No harvesting data available - planting dates will be NA.")
    if (!is.null(field_boundaries_sf)) {
      # Fallback: one NA planting date per known field boundary
      return(data.frame(
        field_id = field_boundaries_sf$field,
        planting_date = rep(as.Date(NA), nrow(field_boundaries_sf)),
        stringsAsFactors = FALSE
      ))
    }
    return(NULL)
  }
  tryCatch({
    # Most recent season_start per field (desc sort + distinct keeps first)
    planting_dates <- harvesting_data %>%
      arrange(field, desc(season_start)) %>%
      distinct(field, .keep_all = TRUE) %>%
      select(field, season_start) %>%
      rename(field_id = field, planting_date = season_start) %>%
      filter(!is.na(planting_date)) %>%
      as.data.frame()
    message(paste("Extracted planting dates for", nrow(planting_dates), "fields from harvest.xlsx"))
    return(planting_dates)
  }, error = function(e) {
    message(paste("Error extracting planting dates:", e$message))
    return(NULL)
  })
}

# ============================================================================
# DATA LOADING HELPERS
# ============================================================================
#' Load and validate harvest data from harvest.xlsx
#'
#' Encapsulates harvest data loading with validation, type coercion, and
#' error handling. Always returns a valid data frame (never NULL).
#'
#' @param data_dir Path to data directory containing harvest.xlsx
#'
#' @return data.frame with columns: field (character), year (numeric),
#'   tonnage_ha (numeric). On failure: empty data frame, same 3 columns.
#'
#' @details
#' - Missing file: returns empty DF (logs NOTE)
#' - Missing columns: returns empty DF (logs WARNING)
#' - Read errors: returns empty DF (logs WARNING)
load_harvest_data <- function(data_dir) {
  harvesting_data <- NULL
  harvest_file <- file.path(data_dir, "harvest.xlsx")
  if (file.exists(harvest_file)) {
    tryCatch({
      harvesting_data <- readxl::read_excel(harvest_file)
      # Ensure required columns are present
      required_cols <- c("field", "year", "tonnage_ha")
      if (all(required_cols %in% names(harvesting_data))) {
        # Convert to data frame and ensure column types
        harvesting_data <- as.data.frame(harvesting_data)
        harvesting_data$year <- as.numeric(harvesting_data$year)
        harvesting_data$tonnage_ha <- as.numeric(harvesting_data$tonnage_ha)
        message(paste(" ✓ Loaded harvest data:", nrow(harvesting_data), "records from harvest.xlsx"))
        # Early return on success; all failure paths fall through below
        return(harvesting_data)
      } else {
        message(paste(" WARNING: harvest.xlsx missing required columns. 
Expected: field, year, tonnage_ha"))
        harvesting_data <- NULL
      }
    }, error = function(e) {
      message(paste(" WARNING: Could not read harvest.xlsx:", e$message))
    })
  } else {
    message(paste(" NOTE: harvest.xlsx not found at", harvest_file))
  }
  # Fallback: create empty data frame if loading failed
  if (is.null(harvesting_data)) {
    message(" WARNING: No harvest data available. TCH yield prediction will use graceful fallback (NA values)")
    harvesting_data <- data.frame(field = character(), year = numeric(), tonnage_ha = numeric())
  }
  return(harvesting_data)
}

# ============================================================================
# FIELD STATISTICS EXTRACTION
# ============================================================================

#' Extract CI statistics for all fields from a single CI raster band
#'
#' Runs terra::extract once for all boundaries, then summarises per field.
#'
#' @param ci_band Single-band SpatRaster of CI values
#' @param field_boundaries_sf sf object of field polygons (row order defines
#'   field_idx in the result)
#' @return data.frame with one row per field: mean_ci, cv, p10, p90,
#'   min_ci, max_ci, pixel_count_valid, pixel_count_total
extract_field_statistics_from_ci <- function(ci_band, field_boundaries_sf) {
  extract_result <- terra::extract(ci_band, field_boundaries_sf)
  stats_list <- list()
  for (field_idx in seq_len(nrow(field_boundaries_sf))) {
    # Column 2 holds the extracted band values (column 1 is the polygon ID)
    field_pixels <- extract_result[extract_result$ID == field_idx, 2]
    pixels <- as.numeric(field_pixels[!is.na(field_pixels)])
    if (length(pixels) == 0) {
      # No valid pixels: emit an all-NA row so the field keeps its index
      stats_list[[field_idx]] <- data.frame(
        field_idx = field_idx,
        mean_ci = NA_real_,
        cv = NA_real_,
        p10 = NA_real_,
        p90 = NA_real_,
        min_ci = NA_real_,
        max_ci = NA_real_,
        pixel_count_valid = 0,
        pixel_count_total = 0,
        stringsAsFactors = FALSE
      )
      next
    }
    mean_val <- mean(pixels, na.rm = TRUE)
    # CV only defined for positive mean
    cv_val <- if (mean_val > 0) sd(pixels, na.rm = TRUE) / mean_val else NA_real_
    p10_val <- quantile(pixels, probs = CI_PERCENTILE_LOW, na.rm = TRUE)[[1]]
    p90_val <- quantile(pixels, probs = CI_PERCENTILE_HIGH, na.rm = TRUE)[[1]]
    min_val <- min(pixels, na.rm = TRUE)
    max_val <- max(pixels, na.rm = TRUE)
    stats_list[[field_idx]] <- data.frame(
      field_idx = field_idx,
      mean_ci = mean_val,
      cv = cv_val,
      p10 = p10_val,
      p90 = p90_val,
      min_ci = min_val,
      max_ci = max_val,
      pixel_count_valid = length(pixels),
      pixel_count_total = nrow(extract_result[extract_result$ID == field_idx, ]),
      stringsAsFactors = FALSE
    )
  }
  return(dplyr::bind_rows(stats_list))
}

# ============================================================================
# EXPORT FUNCTIONS (USED BY ALL CLIENTS)
# ============================================================================

#' Generate summary statistics from field analysis data
#'
#' Builds the acreage-by-phase / status-trigger / cloud-coverage summary
#' table exported next to the per-field analysis.
#'
#' NOTE(review): the Acreage column mixes numbers, NA separators and
#' "N fields" strings, so the whole column is coerced to character by
#' data.frame(); total_fields is computed but never used.
#'
#' @param field_df Per-field analysis data frame (Acreage, Phase,
#'   Status_trigger, Cloud_category columns)
#' @return data.frame with Category and Acreage columns
generate_field_analysis_summary <- function(field_df) {
  message("Generating summary statistics...")
  total_acreage <- sum(field_df$Acreage, na.rm = TRUE)
  # Acreage per crop phase
  germination_acreage <- sum(field_df$Acreage[field_df$Phase == "Germination"], na.rm = TRUE)
  tillering_acreage <- sum(field_df$Acreage[field_df$Phase == "Tillering"], na.rm = TRUE)
  grand_growth_acreage <- sum(field_df$Acreage[field_df$Phase == "Grand Growth"], na.rm = TRUE)
  maturation_acreage <- sum(field_df$Acreage[field_df$Phase == "Maturation"], na.rm = TRUE)
  unknown_phase_acreage <- sum(field_df$Acreage[field_df$Phase == "Unknown"], na.rm = TRUE)
  # Acreage per status trigger
  harvest_ready_acreage <- sum(field_df$Acreage[field_df$Status_trigger == "harvest_ready"], na.rm = TRUE)
  stress_acreage <- sum(field_df$Acreage[field_df$Status_trigger == "stress_detected_whole_field"], na.rm = TRUE)
  recovery_acreage <- sum(field_df$Acreage[field_df$Status_trigger == "strong_recovery"], na.rm = TRUE)
  growth_on_track_acreage <- sum(field_df$Acreage[field_df$Status_trigger == "growth_on_track"], na.rm = TRUE)
  germination_complete_acreage <- sum(field_df$Acreage[field_df$Status_trigger == "germination_complete"], na.rm = TRUE)
  germination_started_acreage <- sum(field_df$Acreage[field_df$Status_trigger == "germination_started"], na.rm = TRUE)
  no_trigger_acreage <- sum(field_df$Acreage[is.na(field_df$Status_trigger)], na.rm = TRUE)
  # Cloud coverage: field counts and acreage
  clear_fields <- sum(field_df$Cloud_category == "Clear view", na.rm = TRUE)
  partial_fields <- sum(field_df$Cloud_category == "Partial coverage", na.rm = TRUE)
  no_image_fields <- sum(field_df$Cloud_category == "No image available", na.rm = TRUE)
  total_fields <- nrow(field_df)
  clear_acreage <- sum(field_df$Acreage[field_df$Cloud_category == "Clear view"], na.rm = TRUE)
  partial_acreage <- sum(field_df$Acreage[field_df$Cloud_category == "Partial coverage"], na.rm = TRUE)
  no_image_acreage <- sum(field_df$Acreage[field_df$Cloud_category == "No image available"], na.rm = TRUE)

  summary_df <- data.frame(
    Category = c(
      "--- PHASE DISTRIBUTION ---",
      "Germination", "Tillering", "Grand Growth", "Maturation", "Unknown phase",
      "--- STATUS TRIGGERS ---",
      "Harvest ready", "Stress detected", "Strong recovery", "Growth on track",
      "Germination complete", "Germination started", "No trigger",
      "--- CLOUD COVERAGE (FIELDS) ---",
      "Clear view", "Partial coverage", "No image available",
      "--- CLOUD COVERAGE (ACREAGE) ---",
      "Clear view", "Partial coverage", "No image available",
      "--- TOTAL ---",
      "Total Acreage"
    ),
    Acreage = c(
      NA,
      round(germination_acreage, 2), round(tillering_acreage, 2),
      round(grand_growth_acreage, 2), round(maturation_acreage, 2),
      round(unknown_phase_acreage, 2),
      NA,
      round(harvest_ready_acreage, 2), round(stress_acreage, 2),
      round(recovery_acreage, 2), round(growth_on_track_acreage, 2),
      round(germination_complete_acreage, 2), round(germination_started_acreage, 2),
      round(no_trigger_acreage, 2),
      NA,
      paste0(clear_fields, " fields"), paste0(partial_fields, " fields"),
      paste0(no_image_fields, " fields"),
      NA,
      round(clear_acreage, 2), round(partial_acreage, 2), round(no_image_acreage, 2),
      NA,
      round(total_acreage, 2)
    ),
    stringsAsFactors = FALSE
  )
  return(summary_df)
}

#' Export field analysis to Excel, CSV, and RDS formats
#'
#' Writes three sibling files named
#' "<project>_field_analysis_weekWW_YYYY.{xlsx,rds,csv}" into reports_dir.
#' Assumes write_xlsx (writexl), write_csv (readr) and the dplyr verbs are
#' attached by the orchestrator script (see file header NOTE).
#'
#' @param field_df Per-field analysis data frame
#' @param summary_df Summary data frame (may be NULL; then no Summary sheet)
#' @param project_dir Project name used as the filename prefix
#' @param current_week ISO week number for the filename
#' @param year Year for the filename
#' @param reports_dir Output directory (created if missing)
#' @return List of the three written paths: excel, rds, csv
export_field_analysis_excel <- function(field_df, summary_df, project_dir, current_week, year, reports_dir) {
  message("Exporting per-field analysis to Excel, CSV, and RDS...")
  field_df_rounded <- field_df %>% mutate(across(where(is.numeric), ~ round(., 2)))
  # Handle NULL summary_df
  summary_df_rounded <- if (!is.null(summary_df)) {
    summary_df %>% mutate(across(where(is.numeric), ~ round(., 2)))
  } else {
    NULL
  }
  output_dir <- file.path(reports_dir)
  if (!dir.exists(output_dir)) {
    dir.create(output_dir, recursive = TRUE)
  }
  excel_filename <- paste0(project_dir, "_field_analysis_week", sprintf("%02d_%d", current_week, year), ".xlsx")
  excel_path <- file.path(output_dir, excel_filename)
  # Normalized (Windows-style) path; mustWork=FALSE since file doesn't exist yet
  excel_path <- normalizePath(excel_path, winslash = "\\", mustWork = FALSE)
  # Build sheets list dynamically
  sheets <- list(
    "Field Data" = field_df_rounded
  )
  if (!is.null(summary_df_rounded)) {
    sheets[["Summary"]] <- summary_df_rounded
  }
  write_xlsx(sheets, excel_path)
  message(paste("✓ Field analysis Excel exported to:", excel_path))
  # RDS bundle carries both tables plus provenance metadata
  kpi_data <- list(
    field_analysis = field_df_rounded,
    field_analysis_summary = summary_df_rounded,
    metadata = list(
      current_week = current_week,
      year = year,
      project = project_dir,
      created_at = Sys.time()
    )
  )
  rds_filename <- paste0(project_dir, "_field_analysis_week", sprintf("%02d_%d", current_week, year), ".rds")
  rds_path <- file.path(output_dir, rds_filename)
  saveRDS(kpi_data, rds_path)
  message(paste("✓ Field analysis RDS exported to:", rds_path))
  csv_filename <- paste0(project_dir, "_field_analysis_week", sprintf("%02d_%d", current_week, year), ".csv")
  csv_path <- file.path(output_dir, csv_filename)
  write_csv(field_df_rounded, csv_path)
  message(paste("✓ Field analysis CSV exported to:", csv_path))
  return(list(excel = excel_path, rds = rds_path, csv = csv_path))
}

# ============================================================================
# ADDITIONAL CRITICAL FUNCTIONS FROM 80_weekly_stats_utils.R (REQUIRED BY 80_calculate_kpis.R)
# ============================================================================

#' Calculate statistics for all fields from weekly mosaics
#'
#' Per-field mode: each field has its own subdirectory of mosaic_dir
#' containing a week_WW_YYYY.tif mosaic with a "CI" band. Fields that fail
#' (missing band, missing boundary, no values) are skipped with a message.
#'
#' @param field_boundaries_sf sf object with a "field" column matching the
#'   subdirectory names
#' @param week_num ISO week number
#' @param year Year used in the mosaic filename
#' @param mosaic_dir Root directory of per-field mosaic subdirectories
#' @param report_date Currently unused; kept for interface compatibility
#' @return data.frame with one row per processed field (Field_id, Mean_CI,
#'   CV as percent, CI_range, CI_Percentiles, Pct_pixels_CI_gte_2,
#'   Cloud_pct_clear, Cloud_category); stops when nothing processes
calculate_field_statistics <- function(field_boundaries_sf, week_num, year, mosaic_dir, report_date = Sys.Date()) {
  message(paste("Calculating statistics for all fields - Week", week_num, year))
  # Per-field mode: look in per-field subdirectories
  single_file_pattern <- sprintf("week_%02d_%d\\.tif", week_num, year)
  # Find all field subdirectories with mosaics for this week
  field_dirs <- list.dirs(mosaic_dir, full.names = FALSE, recursive = FALSE)
  field_dirs <- field_dirs[field_dirs != ""]
  per_field_files <- list()
  for (field in field_dirs) {
    field_mosaic_dir <- file.path(mosaic_dir, field)
    files <- list.files(field_mosaic_dir, pattern = single_file_pattern, full.names = TRUE)
    if (length(files) > 0) {
      per_field_files[[field]] <- files[1] # Take first match for this field
    }
  }
  if (length(per_field_files) == 0) {
    stop(paste("No per-field mosaic files found for week", week_num, year, "in", mosaic_dir))
  }
  message(paste(" Found", length(per_field_files), "per-field mosaic file(s) for week", week_num))
  results_list <- list()
  # Initialize progress bar
  pb <- progress::progress_bar$new(
    format = " [:bar] :percent | Field :current/:total",
    total = length(per_field_files),
    width = 60
  )
  # Process each field's mosaic
  for (field_idx in seq_along(per_field_files)) {
    pb$tick() # Update progress bar
    field_name <- names(per_field_files)[field_idx]
    field_file <- per_field_files[[field_name]]
    tryCatch({
      current_rast <- terra::rast(field_file)
      ci_band <- current_rast[["CI"]]
      if (is.null(ci_band) || !inherits(ci_band, "SpatRaster")) {
        message(paste(" [SKIP] Field", field_name, "- CI band not found"))
        next
      }
      # Extract CI values for this field
      field_boundary <- field_boundaries_sf[field_boundaries_sf$field == field_name, ]
      if (nrow(field_boundary) == 0) {
        message(paste(" [SKIP] Field", field_name, "- not in field boundaries"))
        next
      }
      # na.rm = FALSE keeps NA cells so cloud coverage can be derived below
      extracted <- terra::extract(ci_band, field_boundary, na.rm = FALSE)
      if (nrow(extracted) == 0 || all(is.na(extracted$CI))) {
        message(paste(" [SKIP] Field", field_name, "- no CI values found"))
        next
      }
      ci_vals <- extracted$CI[!is.na(extracted$CI)]
      if (length(ci_vals) == 0) {
        next
      }
      # Calculate statistics
      mean_ci <- mean(ci_vals, na.rm = TRUE)
      ci_std <- sd(ci_vals, na.rm = TRUE)
      cv <- if (mean_ci > 0) ci_std / mean_ci else NA_real_
      range_min <- min(ci_vals, na.rm = TRUE)
      range_max <- max(ci_vals, na.rm = TRUE)
      range_str <- sprintf("%.1f-%.1f", range_min, range_max)
      ci_percentiles_str <- get_ci_percentiles(ci_vals)
      num_pixels_total <- length(ci_vals)
      num_pixels_gte_2 <- sum(ci_vals >= 2)
      pct_pixels_gte_2 <- if (num_pixels_total > 0) round((num_pixels_gte_2 / num_pixels_total) * 100, 1) else 0
      # Cloud coverage = share of extracted cells that carry data
      num_total <- nrow(extracted)
      num_data <- sum(!is.na(extracted$CI))
      pct_clear <- if (num_total > 0) round((num_data / num_total) * 100, 1) else 0
      cloud_cat <- if (num_data == 0) "No image available" else if (pct_clear >= 95) "Clear view" else "Partial coverage"
      # Add to results
      results_list[[length(results_list) + 1]] <- data.frame(
        Field_id = field_name,
        Mean_CI = round(mean_ci, 2),
        CV = round(cv * 100, 2),
        CI_range = range_str,
        CI_Percentiles = ci_percentiles_str,
        Pct_pixels_CI_gte_2 = pct_pixels_gte_2,
        Cloud_pct_clear = pct_clear,
        Cloud_category = cloud_cat,
        stringsAsFactors = FALSE
      )
    }, error = function(e) {
      message(paste(" [ERROR] Field", field_name, ":", e$message))
    })
  }
  if (length(results_list) == 0) {
    stop(paste("No fields processed successfully for week", week_num))
  }
  stats_df <- dplyr::bind_rows(results_list)
  message(paste(" ✓ Successfully calculated statistics for", nrow(stats_df), "fields"))
  return(stats_df)
}

#' Load or calculate weekly statistics (with RDS caching)
#'
#' Returns the cached RDS from reports_dir/field_stats when present;
#' otherwise computes via calculate_field_statistics() and writes both the
#' RDS cache and a CSV copy.
#'
#' @param week_num ISO week number
#' @param year Year
#' @param project_dir Project name used as the filename prefix
#' @param field_boundaries_sf sf object passed through to the calculator
#' @param mosaic_dir Root directory of per-field mosaics
#' @param reports_dir Reports root; cache lives in its field_stats subdir
#' @param report_date Passed through to calculate_field_statistics
#' @return data.frame of weekly field statistics
load_or_calculate_weekly_stats <- function(week_num, year, project_dir, field_boundaries_sf, mosaic_dir, reports_dir, report_date = Sys.Date()) {
  rds_filename <- sprintf("%s_field_stats_week%02d_%d.rds", project_dir, week_num, year)
  rds_path <- file.path(reports_dir, "field_stats", rds_filename)
  if (file.exists(rds_path)) {
    message(paste("Loading cached statistics from:", basename(rds_path)))
    return(readRDS(rds_path))
  }
  message(paste("Cached RDS not found, calculating statistics from tiles for week", week_num))
  stats_df <- calculate_field_statistics(field_boundaries_sf, week_num, year, mosaic_dir, report_date)
  output_dir <- file.path(reports_dir, "field_stats")
  if (!dir.exists(output_dir)) {
    dir.create(output_dir, recursive = TRUE, showWarnings = FALSE)
  }
  saveRDS(stats_df, rds_path)
  message(paste("Saved weekly statistics RDS:", basename(rds_path)))
  csv_filename <- sprintf("%s_field_stats_week%02d_%d.csv", project_dir, week_num, year)
  csv_path <- file.path(output_dir, csv_filename)
  readr::write_csv(stats_df, csv_path)
  message(paste("Saved weekly statistics CSV:", basename(csv_path)))
  return(stats_df)
}

#' Load historical field data from CSV (4-week lookback)
#'
#' Walks back num_weeks weeks (via calculate_target_week_and_year) and loads
#' each week's field_analysis CSV, all columns as character.
#'
#' NOTE(review): entries are stored at index lookback+1, so a missing
#' intermediate week leaves a NULL hole in the list and inflates
#' length(historical_data) in the summary message; missing_weeks is
#' collected but never reported. Confirm whether consumers tolerate holes.
#'
#' @param project_dir Project name used as the filename prefix
#' @param current_week Current ISO week number
#' @param current_year Current ISO week year
#' @param reports_dir Reports root; CSVs live in its field_analysis subdir
#' @param num_weeks Number of weeks to look back (default 4, incl. current)
#' @param auto_generate Currently unused; kept for interface compatibility
#' @param field_boundaries_sf Currently unused; kept for interface compatibility
#' @param daily_vals_dir Currently unused; kept for interface compatibility
#' @return List of list(week, year, data) entries, or NULL when none loaded
load_historical_field_data <- function(project_dir, current_week, current_year, reports_dir, num_weeks = 4, auto_generate = TRUE, field_boundaries_sf = NULL, daily_vals_dir = NULL) {
  historical_data <- list()
  loaded_weeks <- c()
  missing_weeks <- c()
  for (lookback in 0:(num_weeks - 1)) {
    target <- calculate_target_week_and_year(current_week, current_year, offset_weeks = lookback)
    target_week <- target$week
    target_year <- target$year
    csv_filename <- paste0(project_dir, "_field_analysis_week", sprintf("%02d_%d", target_week, target_year), ".csv")
    csv_path <- file.path(reports_dir, "field_analysis", csv_filename)
    if (file.exists(csv_path)) {
      tryCatch({
        # Read everything as character to avoid type-guessing drift between weeks
        data <- readr::read_csv(csv_path, show_col_types = FALSE, col_types = readr::cols(.default = readr::col_character()))
        historical_data[[lookback + 1]] <- list(
          week = target_week,
          year = target_year,
          data = data
        )
        loaded_weeks <- c(loaded_weeks, paste0("week", sprintf("%02d_%d", target_week, target_year)))
      }, error = function(e) {
        message(paste(" Warning: Could not load week", target_week, "/", target_year, ":", e$message))
        # <<- needed: the handler runs in its own function environment
        missing_weeks <<- c(missing_weeks, paste0("week", sprintf("%02d_%d", target_week, target_year)))
      })
    } else {
      missing_weeks <- c(missing_weeks, paste0("week", sprintf("%02d_%d", target_week, target_year)))
    }
  }
  if (length(historical_data) == 0) {
    message(paste("Error: No historical field data found"))
    return(NULL)
  }
  message(paste("✓ Loaded", length(historical_data), "weeks of historical data:", paste(loaded_weeks, collapse = ", ")))
  return(historical_data)
}

#' Calculate KPI trends (CI change, CV trends, 4-week and 8-week trends)
#'
#' Annotates current_stats with trend columns, looking up the previous
#' week's stats and historical field_stats files.
#' (Definition continues beyond this view of the file.)
calculate_kpi_trends <- function(current_stats, prev_stats = NULL, project_dir = NULL, reports_dir = NULL, current_week = NULL, year = NULL) {
  message("Calculating KPI trends from current and previous week data")
  # Default all trend columns; filled in below when data is available
  current_stats$Weekly_ci_change <- NA_real_
  current_stats$CV_Trend_Short_Term <- NA_real_
  current_stats$Four_week_trend <- NA_real_
  current_stats$CV_Trend_Long_Term <- NA_real_
  current_stats$nmr_of_weeks_analysed <- 1L
  if (is.null(prev_stats) || nrow(prev_stats) == 0) {
    message(" No previous week data available - using defaults")
    return(current_stats)
  }
  message(paste(" prev_stats has", nrow(prev_stats), "rows and", ncol(prev_stats), "columns"))
  # Row index of each Field_id in prev_stats for O(1) lookup
  prev_lookup <- setNames(seq_len(nrow(prev_stats)), prev_stats$Field_id)
  prev_field_analysis <- NULL
  tryCatch({
    analysis_dir <- file.path(reports_dir, "field_analysis")
    if (dir.exists(analysis_dir)) {
      analysis_files <- list.files(analysis_dir, pattern = "_field_analysis_week.*\\.csv$", full.names = TRUE)
      if (length(analysis_files) > 0) {
        # Most recently modified analysis file = previous run's output
        recent_file <- analysis_files[which.max(file.info(analysis_files)$mtime)]
        prev_field_analysis <- readr::read_csv(recent_file, show_col_types = FALSE, col_types = readr::cols(.default = readr::col_character()), col_select = c(Field_id, nmr_of_weeks_analysed, Phase))
      }
    }
  }, error = function(e) {
    message(paste(" Note: Could not load previous field_analysis for nmr_weeks tracking:", e$message))
  })
  if (!is.null(prev_field_analysis) && nrow(prev_field_analysis) > 0) {
    message(paste(" Using previous field_analysis to track nmr_of_weeks_analysed"))
  }
  historical_4weeks <- list()
  historical_8weeks <- list()
  if (!is.null(project_dir) && !is.null(reports_dir) && !is.null(current_week)) {
    message(" Loading historical field_stats for 4-week and 8-week trends...")
    for (lookback in 1:4) {
      target_week <- current_week - lookback
      target_year <- year
      if (target_week < 
1) { target_week <- target_week + 52 target_year <- target_year - 1 } rds_filename <- sprintf("%s_field_stats_week%02d_%d.rds", project_dir, target_week, target_year) rds_path <- file.path(reports_dir, "field_stats", rds_filename) if (file.exists(rds_path)) { tryCatch({ stats_data <- readRDS(rds_path) historical_4weeks[[length(historical_4weeks) + 1]] <- list(week = target_week, stats = stats_data) }, error = function(e) { message(paste(" Warning: Could not load week", target_week, ":", e$message)) }) } } for (lookback in 1:8) { target_week <- current_week - lookback target_year <- year if (target_week < 1) { target_week <- target_week + 52 target_year <- target_year - 1 } rds_filename <- sprintf("%s_field_stats_week%02d_%d.rds", project_dir, target_week, target_year) rds_path <- file.path(reports_dir, "field_stats", rds_filename) if (file.exists(rds_path)) { tryCatch({ stats_data <- readRDS(rds_path) historical_8weeks[[length(historical_8weeks) + 1]] <- list(week = target_week, stats = stats_data) }, error = function(e) { # Silently skip }) } } if (length(historical_4weeks) > 0) { message(paste(" Loaded", length(historical_4weeks), "weeks for 4-week trend")) } if (length(historical_8weeks) > 0) { message(paste(" Loaded", length(historical_8weeks), "weeks for 8-week CV trend")) } } cv_trends_calculated <- 0 four_week_trends_calculated <- 0 cv_long_term_calculated <- 0 for (i in seq_len(nrow(current_stats))) { field_id <- current_stats$Field_id[i] prev_idx <- prev_lookup[field_id] if (!is.na(prev_idx) && prev_idx > 0 && prev_idx <= nrow(prev_stats)) { prev_row <- prev_stats[prev_idx, , drop = FALSE] prev_ci <- prev_row$Mean_CI[1] if (!is.na(prev_ci) && !is.na(current_stats$Mean_CI[i])) { current_stats$Weekly_ci_change[i] <- round(current_stats$Mean_CI[i] - prev_ci, 2) } prev_cv <- prev_row$CV[1] if (!is.na(prev_cv) && !is.na(current_stats$CV[i])) { current_stats$CV_Trend_Short_Term[i] <- calculate_cv_trend(current_stats$CV[i], prev_cv) cv_trends_calculated <- 
cv_trends_calculated + 1 } if (length(historical_4weeks) > 0) { ci_values_4week <- numeric() for (hist_idx in rev(seq_along(historical_4weeks))) { hist_data <- historical_4weeks[[hist_idx]]$stats hist_field <- which(hist_data$Field_id == field_id) if (length(hist_field) > 0 && !is.na(hist_data$Mean_CI[hist_field[1]])) { ci_values_4week <- c(ci_values_4week, hist_data$Mean_CI[hist_field[1]]) } } ci_values_4week <- c(ci_values_4week, current_stats$Mean_CI[i]) if (length(ci_values_4week) >= 2) { current_stats$Four_week_trend[i] <- calculate_four_week_trend(ci_values_4week) four_week_trends_calculated <- four_week_trends_calculated + 1 } } if (length(historical_8weeks) > 0) { cv_values_8week <- numeric() for (hist_idx in rev(seq_along(historical_8weeks))) { hist_data <- historical_8weeks[[hist_idx]]$stats hist_field <- which(hist_data$Field_id == field_id) if (length(hist_field) > 0 && !is.na(hist_data$CV[hist_field[1]])) { cv_values_8week <- c(cv_values_8week, hist_data$CV[hist_field[1]]) } } cv_values_8week <- c(cv_values_8week, current_stats$CV[i]) if (length(cv_values_8week) >= 2) { slope <- calculate_cv_trend_long_term(cv_values_8week) current_stats$CV_Trend_Long_Term[i] <- slope cv_long_term_calculated <- cv_long_term_calculated + 1 } } if (!is.null(prev_field_analysis) && nrow(prev_field_analysis) > 0) { prev_analysis_row <- prev_field_analysis %>% dplyr::filter(Field_id == field_id) if (nrow(prev_analysis_row) > 0) { prev_nmr_weeks_analysis <- prev_analysis_row$nmr_of_weeks_analysed[1] if (!is.na(prev_nmr_weeks_analysis)) { current_stats$nmr_of_weeks_analysed[i] <- prev_nmr_weeks_analysis + 1L } else { current_stats$nmr_of_weeks_analysed[i] <- 1L } } } } } message(paste(" ✓ Calculated CV_Trend_Short_Term:", cv_trends_calculated, "fields")) message(paste(" ✓ Calculated Four_week_trend:", four_week_trends_calculated, "fields")) message(paste(" ✓ Calculated CV_Trend_Long_Term:", cv_long_term_calculated, "fields")) return(current_stats) } # 
# ============================================================================
# INTERNAL HELPER FUNCTIONS (from 80_kpi_utils.R) - DO NOT DELETE
# ============================================================================

# Spatial autocorrelation thresholds for field pattern analysis
MORAN_THRESHOLD_HIGH <- 0.95     # Very strong clustering (problematic patterns)
MORAN_THRESHOLD_MODERATE <- 0.85 # Moderate clustering
MORAN_THRESHOLD_LOW <- 0.7       # Normal field continuity

#' Calculate coefficient of variation for uniformity assessment
#'
#' @param values Numeric vector; NA and non-finite entries are dropped first.
#' @return sd/mean as a double, or NA_real_ when fewer than 2 valid values
#'   remain. (Returns a typed NA so the result column stays numeric.)
calculate_cv <- function(values) {
  values <- values[!is.na(values) & is.finite(values)]
  if (length(values) < 2) return(NA_real_) # FIX: NA_real_ instead of logical NA
  sd(values) / mean(values)
}

#' Calculate percentage of field with positive vs negative change
#'
#' @param current_values,previous_values Numeric vectors of equal length
#'   (element-wise paired observations).
#' @return List with positive_pct, negative_pct, stable_pct (0-100), or
#'   NA_real_ values when lengths differ or fewer than 2 valid changes exist.
calculate_change_percentages <- function(current_values, previous_values) {
  na_result <- list(positive_pct = NA_real_, negative_pct = NA_real_, stable_pct = NA_real_)
  if (length(current_values) != length(previous_values)) {
    return(na_result)
  }
  change_values <- current_values - previous_values
  valid_changes <- change_values[!is.na(change_values) & is.finite(change_values)]
  if (length(valid_changes) < 2) {
    return(na_result)
  }
  n_valid <- length(valid_changes)
  return(list(
    positive_pct = sum(valid_changes > 0) / n_valid * 100,
    negative_pct = sum(valid_changes < 0) / n_valid * 100,
    stable_pct = sum(valid_changes == 0) / n_valid * 100
  ))
}

#' Calculate spatial autocorrelation (Moran's I) for a field
#'
#' Crops/masks the CI raster to the field, converts pixels to points, builds a
#' k-nearest-neighbour weight matrix and runs spdep::moran.test.
#'
#' @param ci_raster terra SpatRaster with CI values
#' @param field_boundary Field geometry accepted by terra::crop/mask
#' @return List with morans_i, p_value, and a categorical interpretation
calculate_spatial_autocorrelation <- function(ci_raster, field_boundary) {
  tryCatch({
    field_raster <- terra::crop(ci_raster, field_boundary)
    field_raster <- terra::mask(field_raster, field_boundary)
    raster_points <- terra::as.points(field_raster, na.rm = TRUE)
    if (length(raster_points) < 10) {
      return(list(morans_i = NA, p_value = NA, interpretation = "insufficient_data"))
    }
    points_sf <- sf::st_as_sf(raster_points)
    coords <- sf::st_coordinates(points_sf)
    # k scales with field size but stays within [4, 8]
    k_neighbors <- min(8, max(4, floor(nrow(coords) / 10)))
    knn_nb <- spdep::knearneigh(coords, k = k_neighbors)
    knn_listw <- spdep::nb2listw(spdep::knn2nb(knn_nb), style = "W", zero.policy = TRUE)
    ci_values <- points_sf[[1]]
    moran_result <- spdep::moran.test(ci_values, knn_listw, zero.policy = TRUE)
    morans_i <- moran_result$estimate[1]
    p_value <- moran_result$p.value
    # FIX: also guard NA p_value; previously `if (NA > 0.05)` threw and the
    # outer tryCatch reported "error" instead of "insufficient_data"
    interpretation <- if (is.na(morans_i) || is.na(p_value)) {
      "insufficient_data"
    } else if (p_value > 0.05) {
      "random"
    } else if (morans_i > MORAN_THRESHOLD_HIGH) {
      "very_strong_clustering"
    } else if (morans_i > MORAN_THRESHOLD_MODERATE) {
      "strong_clustering"
    } else if (morans_i > MORAN_THRESHOLD_LOW) {
      "normal_continuity"
    } else if (morans_i > 0.3) {
      "weak_clustering"
    } else if (morans_i < -0.3) {
      "dispersed"
    } else {
      "low_autocorrelation"
    }
    return(list(morans_i = morans_i, p_value = p_value, interpretation = interpretation))
  }, error = function(e) {
    warning(paste("Error calculating spatial autocorrelation:", e$message))
    return(list(morans_i = NA, p_value = NA, interpretation = "error"))
  })
}

#' Extract CI band from multi-band raster
#'
#' @param ci_raster terra SpatRaster (may carry multiple bands)
#' @param field_vect terra SpatVector of the field geometry
#' @return Vector of extracted CI values: the "CI" column if present, otherwise
#'   the last column (terra::extract puts the ID column first), otherwise the
#'   only column.
extract_ci_values <- function(ci_raster, field_vect) {
  extracted <- terra::extract(ci_raster, field_vect, fun = NULL)
  if ("CI" %in% names(extracted)) {
    return(extracted[, "CI"])
  } else if (ncol(extracted) > 1) {
    return(extracted[, ncol(extracted)])
  } else {
    return(extracted[, 1])
  }
}

#' Calculate current and previous week numbers using ISO 8601
#'
#' @param report_date Date to evaluate (default today)
#' @return List with current_week, previous_week, current_year, previous_year
#'   using ISO week (%V) and ISO week-year (%G).
#'
#' NOTE(review): the wrap assumes 52-week ISO years, matching the file-wide
#' convention in calculate_target_week_and_year(); ISO years with 53 weeks
#' (e.g. 2020) would map week 01 back to 52 instead of 53 — confirm whether
#' any project uses week-53 file names before changing this.
calculate_week_numbers <- function(report_date = Sys.Date()) {
  current_week <- as.numeric(format(report_date, "%V"))
  current_year <- as.numeric(format(report_date, "%G"))
  previous_week <- current_week - 1
  previous_year <- current_year
  if (previous_week < 1) {
    previous_week <- 52
    previous_year <- current_year - 1
  }
  return(list(
    current_week = current_week,
    previous_week = previous_week,
    current_year = current_year,
    previous_year = previous_year
  ))
}

#' Load field CI raster (handles single-file and per-field architectures)
load_field_ci_raster <- function(ci_raster_or_obj, field_name, field_vect = NULL) {
  # Two mosaic architectures are supported:
  #  - per-field: ci_raster_or_obj is a dummy raster tagged with attributes
  #    "is_per_field", "per_field_dir" and "week_file" (see load_weekly_ci_mosaic)
  #  - single-file: ci_raster_or_obj is the CI raster itself
  is_per_field <- !is.null(attr(ci_raster_or_obj, "is_per_field")) && attr(ci_raster_or_obj, "is_per_field")
  if (is_per_field) {
    per_field_dir <- attr(ci_raster_or_obj, "per_field_dir")
    week_file <- attr(ci_raster_or_obj, "week_file")
    field_mosaic_path <- file.path(per_field_dir, field_name, week_file)
    if (!file.exists(field_mosaic_path)) {
      return(NULL)
    }
    tryCatch({
      field_mosaic <- terra::rast(field_mosaic_path)
      # Band 5 holds CI in full mosaics; single-band files fall back to band 1
      if (terra::nlyr(field_mosaic) >= 5) {
        return(field_mosaic[[5]])
      } else {
        return(field_mosaic[[1]])
      }
    }, error = function(e) {
      return(NULL)
    })
  } else {
    # Single-file architecture: crop to the field if a geometry was supplied
    if (!is.null(field_vect)) {
      return(terra::crop(ci_raster_or_obj, field_vect, mask = TRUE))
    }
    return(ci_raster_or_obj)
  }
}

#' Load weekly CI mosaic (single-file or per-field)
#'
#' @param week_num ISO week number (1-53)
#' @param year ISO week year
#' @param mosaic_dir Directory holding either "week_WW_YYYY.tif" files or
#'   per-field subdirectories each containing that file
#' @return The CI SpatRaster (named "CI") for the single-file case; a tagged
#'   1x1 dummy raster for the per-field case (resolved per field by
#'   load_field_ci_raster); NULL when nothing is found for the week.
load_weekly_ci_mosaic <- function(week_num, year, mosaic_dir) {
  week_file <- sprintf("week_%02d_%d.tif", week_num, year)
  week_path <- file.path(mosaic_dir, week_file)

  # Preferred architecture: one multi-band mosaic per week
  if (file.exists(week_path)) {
    result <- tryCatch({
      mosaic_raster <- terra::rast(week_path)
      # FIX: guard the band count (band 5 = CI), consistent with the fallback
      # in load_field_ci_raster; a bare [[5]] errored on mosaics with < 5 bands
      ci_raster <- if (terra::nlyr(mosaic_raster) >= 5) mosaic_raster[[5]] else mosaic_raster[[1]]
      names(ci_raster) <- "CI"
      ci_raster
    }, error = function(e) NULL)
    if (!is.null(result)) {
      return(result)
    }
    # On read failure, fall through to the per-field check (as before)
  }

  # Fallback architecture: per-field subdirectories each holding the week file
  if (dir.exists(mosaic_dir)) {
    field_dirs <- list.dirs(mosaic_dir, full.names = FALSE, recursive = FALSE)
    field_dirs <- field_dirs[field_dirs != ""]
    # Vectorized existence check replaces the manual break loop
    found_any <- any(file.exists(file.path(mosaic_dir, field_dirs, week_file)))
    if (found_any) {
      # Return a tagged dummy raster; load_field_ci_raster resolves actual paths
      dummy_raster <- terra::rast(nrow = 1, ncol = 1, vals = NA)
      attr(dummy_raster, "per_field_dir") <- mosaic_dir
      attr(dummy_raster, "week_file") <- week_file
      attr(dummy_raster, "is_per_field") <- TRUE
      return(dummy_raster)
    }
  }
  return(NULL)
}

#' Prepare predictions with consistent naming
#'
#' @param predictions Numeric vector returned by stats::predict
#' @param newdata Data frame the predictions were made for; must contain
#'   field, sub_field, DOY and season columns
#' @return Data frame with field, sub_field, Age_days, predicted_Tcha (rounded
#'   to whole tonnes), season — left-joined back onto newdata's columns.
prepare_predictions <- function(predictions, newdata) {
  # NOTE: piping the vector into as.data.frame() makes magrittr evaluate
  # as.data.frame(.), so the resulting column is literally named "." —
  # the rename below depends on this; keep the pipe.
  # (FIX: dropped the dead total_CI computation, which was discarded by select())
  return(predictions %>%
    as.data.frame() %>%
    dplyr::rename(predicted_Tcha = ".") %>%
    dplyr::mutate(
      sub_field = newdata$sub_field,
      field = newdata$field,
      Age_days = newdata$DOY,
      predicted_Tcha = round(predicted_Tcha, 0),
      season = newdata$season
    ) %>%
    dplyr::select(field, sub_field, Age_days, predicted_Tcha, season) %>%
    dplyr::left_join(., newdata, by = c("field", "sub_field", "season"))
  )
}

# ============================================================================
# YIELD PREDICTION KPI (SHARED ML-BASED MODEL FOR ALL CLIENT TYPES)
# ============================================================================

#' Helper function for graceful fallback when training data unavailable
#'
#' @param field_boundaries Field boundaries (sf or SpatVector)
#' @return List with summary and field_results (both with NA values); the
#'   summary still reports a real total area so the KPI table stays complete.
create_fallback_result <- function(field_boundaries) {
  # Convert to SpatVector if needed (for terra::project)
  if (!inherits(field_boundaries, "SpatVector")) {
    field_boundaries <- terra::vect(field_boundaries)
  }
  # EPSG:6933 is an equal-area cylindrical projection, so expanse() is valid
  field_boundaries_projected <- terra::project(field_boundaries, "EPSG:6933")
  field_areas <- terra::expanse(field_boundaries_projected) / 10000 # m2 -> hectares
  total_area <- sum(field_areas)

  summary_result <- data.frame(
    field_groups = c("Top 25%", "Average", "Lowest 25%", "Total area forecasted"),
    count = c(0, 0, 0, nrow(field_boundaries)),
    value = c(NA_real_, NA_real_, NA_real_, round(total_area, 1)),
    stringsAsFactors = FALSE
  )
  # Empty but correctly-typed field-level frame
  field_results <- data.frame(
    field = character(0),
    sub_field = character(0),
    Age_days = numeric(0),
    yield_forecast_t_ha = numeric(0),
    season = numeric(0),
    stringsAsFactors = FALSE
  )
  return(list(summary = summary_result, field_results = field_results))
}

#' Calculate yield prediction KPI using Random Forest with Feature Selection
#'
#' Trains a Random Forest model on historical harvest data with cumulative CI,
#' days of year (DOY), and CI-per-day as predictors. Uses CAST::ffs() for
#' Forward Feature Selection.
#' Predicts yields for mature fields (DOY >= 240).
#'
#' @param field_boundaries Field boundaries (sf or SpatVector)
#' @param harvesting_data Data frame with harvest data including tonnage_ha column
#' @param cumulative_CI_vals_dir Directory containing "All_pivots_Cumulative_CI_quadrant_year_v2.rds"
#'
#' @return List with:
#'   - summary: Data frame with field_groups, count, value (quartiles and total area)
#'   - field_results: Data frame with field-level yield forecasts (yield_forecast_t_ha)
#'
#' @details
#' **Training Data Requirements:**
#' - cumulative_CI_vals_dir must contain "All_pivots_Cumulative_CI_quadrant_year_v2.rds"
#' - harvesting_data must have tonnage_ha column with numeric yield values
#' - Training stops gracefully if either is missing (returns NA values, not fake numbers)
#'
#' **Model Specifications:**
#' - Algorithm: Random Forest (caret + CAST)
#' - Feature Selection: Forward Feature Selection (CAST::ffs)
#' - Cross-validation: 5-fold CV
#' - Predictors: cumulative_CI, DOY, CI_per_day
#' - Mature field threshold: DOY >= 240 (8 months)
#' - Output: Field-level yield forecasts grouped by quartile
#'
#' **Error Handling:**
#' - Missing tonnage_ha: Returns graceful fallback with NA (not zero) values
#' - No training data: Logs WARNING, returns empty field_results
#' - RDS file missing: Returns graceful fallback
#' - Prediction errors: Wrapped in tryCatch, returns fallback on failure
calculate_yield_prediction_kpi <- function(field_boundaries, harvesting_data, cumulative_CI_vals_dir) {
  safe_log("Calculating yield prediction KPI using Random Forest with Feature Selection")
  tryCatch({
    # Check if tonnage_ha column is present and has valid data
    if (is.null(harvesting_data) ||
        !("tonnage_ha" %in% names(harvesting_data)) ||
        all(is.na(harvesting_data$tonnage_ha))) {
      safe_log("No harvest data available: lacking tonnage_ha column or all values are NA", "WARNING")
      return(create_fallback_result(field_boundaries))
    }

    # Check if CI quadrant RDS file exists
    ci_quadrant_path <- file.path(cumulative_CI_vals_dir, "All_pivots_Cumulative_CI_quadrant_year_v2.rds")
    if (!file.exists(ci_quadrant_path)) {
      safe_log(paste("CI quadrant file not found at:", ci_quadrant_path), "WARNING")
      return(create_fallback_result(field_boundaries))
    }

    # Load CI quadrant data and fill missing field/sub_field values
    # (fill within each model group, both directions, so every row is labelled)
    CI_quadrant <- readRDS(ci_quadrant_path) %>%
      dplyr::group_by(model) %>%
      tidyr::fill(field, sub_field, .direction = "downup") %>%
      dplyr::ungroup()

    # Rename year column to season for consistency with the CI data
    harvesting_data_renamed <- harvesting_data %>%
      dplyr::rename(season = year)

    # Join CI and yield data; keep only the latest observation (max DOY) per
    # sub_field/season, then derive the CI-per-day predictor
    CI_and_yield <- dplyr::left_join(CI_quadrant, harvesting_data_renamed,
                                     by = c("field", "sub_field", "season")) %>%
      dplyr::group_by(sub_field, season) %>%
      dplyr::slice(which.max(DOY)) %>%
      dplyr::select(field, sub_field, tonnage_ha, cumulative_CI, DOY, season, sub_area) %>%
      dplyr::mutate(CI_per_day = cumulative_CI / DOY)

    # Define predictors and response variables
    predictors <- c("cumulative_CI", "DOY", "CI_per_day")
    response <- "tonnage_ha"

    # Prepare training dataset (fields with harvest data)
    CI_and_yield_test <- CI_and_yield %>%
      as.data.frame() %>%
      dplyr::filter(!is.na(tonnage_ha))

    # Check if we have minimum training data
    if (nrow(CI_and_yield_test) == 0) {
      safe_log("No training data available: no fields with tonnage_ha observations", "WARNING")
      return(create_fallback_result(field_boundaries))
    }

    # Pre-clean training data: remove rows with any NAs in predictors or response
    # This is required because CAST::ffs doesn't support na.rm parameter
    CI_and_yield_test <- CI_and_yield_test %>%
      dplyr::filter(!dplyr::if_any(dplyr::all_of(c(predictors, response)), is.na))
    if (nrow(CI_and_yield_test) == 0) {
      safe_log("No complete training data after removing NAs in predictors/response", "WARNING")
      return(create_fallback_result(field_boundaries))
    }

    # Prepare prediction dataset (fields without harvest data, mature fields only)
    prediction_yields <- CI_and_yield %>%
      as.data.frame() %>%
      dplyr::filter(is.na(tonnage_ha) & DOY >= 240) # Mature fields only

    # Configure model training parameters (5-fold cross-validation)
    ctrl <- caret::trainControl(
      method = "cv",
      savePredictions = TRUE,
      allowParallel = TRUE,
      number = 5,
      verboseIter = FALSE
    )

    # Train the model with forward feature selection
    set.seed(202) # For reproducibility
    safe_log("Training Random Forest model with Forward Feature Selection...")
    model_ffs_rf <- CAST::ffs(
      CI_and_yield_test[, predictors],
      CI_and_yield_test[, response],
      method = "rf",
      trControl = ctrl,
      importance = TRUE,
      withinSE = TRUE,
      tuneLength = 5
    )

    # Predict yields on validation data (same as training data for RMSE calculation)
    pred_ffs_rf <- prepare_predictions(
      stats::predict(model_ffs_rf, newdata = CI_and_yield_test),
      CI_and_yield_test
    )

    # Extract cross-validated RMSE from the model object (more honest than in-sample RMSE)
    # The CAST::ffs model stores CV results in $results data frame
    # We extract the RMSE from the best model (lowest RMSE across folds)
    if (!is.null(model_ffs_rf$results) && "RMSE" %in% names(model_ffs_rf$results)) {
      # Get minimum RMSE from cross-validation results (best model from feature selection)
      rmse_value <- min(model_ffs_rf$results$RMSE, na.rm = TRUE)
      safe_log(paste("Yield prediction RMSE (cross-validated):", round(rmse_value, 2), "t/ha"))
    } else {
      # Fallback: compute in-sample RMSE if CV results unavailable, but label it clearly
      rmse_value <- sqrt(mean((pred_ffs_rf$predicted_Tcha - CI_and_yield_test$tonnage_ha)^2, na.rm = TRUE))
      safe_log(paste("Yield prediction RMSE (in-sample/training):", round(rmse_value, 2), "t/ha"))
    }

    # Predict yields for current season (mature fields >= 240 days)
    if (nrow(prediction_yields) > 0) {
      pred_rf_current_season <- prepare_predictions(
        stats::predict(model_ffs_rf, newdata = prediction_yields),
        prediction_yields
      ) %>%
        dplyr::filter(Age_days >= 240) %>%
        dplyr::select(c("field", "Age_days", "predicted_Tcha", "season"))
    } else {
      pred_rf_current_season <- data.frame()
    }

    # Calculate summary statistics for KPI
    if (nrow(pred_rf_current_season) > 0) {
      safe_log(paste("Number of fields with yield predictions:", nrow(pred_rf_current_season)))
      safe_log(paste("Predicted yield range:",
                     round(min(pred_rf_current_season$predicted_Tcha, na.rm = TRUE), 1), "-",
                     round(max(pred_rf_current_season$predicted_Tcha, na.rm = TRUE), 1), "t/ha"))

      # Calculate quartiles for grouping
      yield_quartiles <- quantile(pred_rf_current_season$predicted_Tcha,
                                  probs = c(0.25, 0.5, 0.75), na.rm = TRUE)

      # Count fields in each quartile
      top_25_count <- sum(pred_rf_current_season$predicted_Tcha >= yield_quartiles[3], na.rm = TRUE)
      average_count <- sum(pred_rf_current_season$predicted_Tcha >= yield_quartiles[1] &
                             pred_rf_current_season$predicted_Tcha < yield_quartiles[3], na.rm = TRUE)
      lowest_25_count <- sum(pred_rf_current_season$predicted_Tcha < yield_quartiles[1], na.rm = TRUE)

      # Calculate total area
      if (!inherits(field_boundaries, "SpatVector")) {
        field_boundaries_vect <- terra::vect(field_boundaries)
      } else {
        field_boundaries_vect <- field_boundaries
      }
      # Handle both sf and SpatVector inputs for area calculation
      # (EPSG:6933 is an equal-area projection, so areas are meaningful)
      if (inherits(field_boundaries, "sf")) {
        field_boundaries_projected <- sf::st_transform(field_boundaries, "EPSG:6933")
        field_areas <- sf::st_area(field_boundaries_projected) / 10000 # m² to hectares
      } else {
        field_boundaries_projected <- terra::project(field_boundaries_vect, "EPSG:6933")
        field_areas <- terra::expanse(field_boundaries_projected) / 10000
      }
      total_area <- sum(as.numeric(field_areas))
      safe_log(paste("Total area:", round(total_area, 1), "hectares"))

      # Build summary result
      result <- data.frame(
        field_groups = c("Top 25%", "Average", "Lowest 25%", "Total area forecasted"),
        count = c(top_25_count, average_count, lowest_25_count, nrow(field_boundaries)),
        value = c(round(yield_quartiles[3], 1),
                  round(yield_quartiles[2], 1),
                  round(yield_quartiles[1], 1),
                  round(total_area, 1)),
        stringsAsFactors = FALSE
      )

      # Prepare field-level results
      field_level_results <- pred_rf_current_season %>%
        dplyr::select(field, Age_days, predicted_Tcha, season) %>%
        dplyr::rename(yield_forecast_t_ha = predicted_Tcha)

      safe_log("✓ Yield prediction complete")
      return(list(summary = result, field_results = field_level_results))
    } else {
      safe_log("No fields meet maturity threshold (DOY >= 240) for prediction", "WARNING")
      return(list(summary = create_fallback_result(field_boundaries)$summary,
                  field_results = data.frame()))
    }
  }, error = function(e) {
    safe_log(paste("Error in yield prediction:", e$message), "ERROR")
    return(create_fallback_result(field_boundaries))
  })
}