From 83951efca21ab88057909c7d763c45e1593f2fb1 Mon Sep 17 00:00:00 2001 From: Timon Date: Tue, 24 Feb 2026 13:48:06 +0100 Subject: [PATCH] =?UTF-8?q?Refactor=20gap=20filling=20score=20calculations?= =?UTF-8?q?=20to=20use=201=CF=83=20method;=20ensure=20all=20fields=20are?= =?UTF-8?q?=20included=20in=20output?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- r_app/80_utils_cane_supply.R | 47 ++++++++++++++------- r_app/80_utils_common.R | 77 ++++++++++++++++++++-------------- r_app/MANUAL_PIPELINE_RUNNER.R | 6 +-- 3 files changed, 81 insertions(+), 49 deletions(-) diff --git a/r_app/80_utils_cane_supply.R b/r_app/80_utils_cane_supply.R index ca896a1..6b50033 100644 --- a/r_app/80_utils_cane_supply.R +++ b/r_app/80_utils_cane_supply.R @@ -80,7 +80,8 @@ calculate_field_acreages <- function(field_boundaries_sf, unit = AREA_UNIT_PREFE result_df <- data.frame( field = field_names, - warning(paste("Could not calculate areas from geometries -", e$message)) stringsAsFactors = FALSE + area = areas, + stringsAsFactors = FALSE ) colnames(result_df) <- c("field", col_name) @@ -213,14 +214,19 @@ calculate_status_alert <- function(imminent_prob, age_week, mean_ci, -#' Build complete per-field KPI dataframe with all 22 columns -#' @param current_stats data.frame with current week statistics from load_or_calculate_weekly_stats +#' Build complete per-field KPI dataframe - ALWAYS ALL FIELDS from GeoJSON +#' +#' CRITICAL: Always outputs all fields from field_boundaries_sf, regardless of data availability. +#' Fields without satellite data this week have all KPI columns set to NA. +#' This is dynamic - output row count equals nrow(field_boundaries_sf), not hardcoded. +#' +#' @param current_stats data.frame with current week statistics (may have fewer rows if data is incomplete) #' @param planting_dates data.frame with field_id and planting_date columns #' @param imminent_prob_data data.frame with Field_id and Imminent_prob_actual columns (or NULL) -#' @param gap_scores_df data.frame with Field_id and gap_score columns (or NULL) -#' @param field_boundaries_sf sf object with field geometries +#' @param gap_scores_df data.frame with Field_id and gap_score columns (guaranteed to have all fields) +#' @param field_boundaries_sf sf object with field geometries (authoritative source of all field IDs) #' @param end_date Date object for current report date -#' @return data.frame with all 22 KPI columns +#' @return data.frame with 22 columns and nrow(field_boundaries_sf) rows (all fields, NAs for missing data) calculate_all_field_kpis <- function(current_stats, planting_dates, imminent_prob_data, @@ -230,6 +236,14 @@ calculate_all_field_kpis <- function(current_stats, message("\nBuilding final field analysis output...") + # CRITICAL: Start with ALL fields from field_boundaries_sf (dynamic, based on GeoJSON) + # This ensures output always includes all fields, even if they lack satellite data + all_fields <- field_boundaries_sf %>% + sf::st_drop_geometry() %>% + mutate(Field_id = field) %>% + select(Field_id) %>% + distinct() + # Pre-calculate areas using unified function acreage_lookup <- calculate_field_acreages(field_boundaries_sf, unit = AREA_UNIT_PREFERENCE) @@ -237,7 +251,10 @@ calculate_all_field_kpis <- function(current_stats, unit_label <- get_area_unit_label(AREA_UNIT_PREFERENCE) area_col_name <- paste0("Area_", unit_label) - field_analysis_df <- current_stats %>% + # Left-join current_stats to all_fields to preserve all field IDs + # Fields without data in current_stats will have NA values + field_analysis_df <- all_fields %>% + left_join(current_stats, by = "Field_id") %>% mutate( # Column 2: Farm_Section (user fills manually) Farm_Section = NA_character_, @@ -258,7 +275,7 @@ calculate_all_field_kpis <- function(current_stats, # Column 9: Age_week (calculated) Age_week = { - sapply(seq_len(nrow(current_stats)), function(idx) { + sapply(seq_len(n()), function(idx) { calculate_age_week(Last_harvest_or_planting_date[idx], end_date) }) }, @@ -274,13 +291,13 @@ calculate_all_field_kpis <- function(current_stats, if (!is.null(imminent_prob_data)) { as.numeric(imminent_prob_data$Imminent_prob_actual[match(Field_id, imminent_prob_data$Field_id)]) } else { - rep(NA_real_, nrow(current_stats)) + rep(NA_real_, n()) } }, # Column 14: Status_Alert (multi-priority logic for harvest/milling workflow) Status_Alert = { - sapply(seq_len(nrow(current_stats)), function(idx) { + sapply(seq_len(n()), function(idx) { calculate_status_alert( imminent_prob = Imminent_prob[idx], age_week = Age_week[idx], @@ -293,17 +310,17 @@ calculate_all_field_kpis <- function(current_stats, }, # Column 19b: CV_Trend_Long_Term_Category (categorical slope) - CV_Trend_Long_Term_Category = sapply(current_stats$CV_Trend_Long_Term, categorize_cv_trend_long_term), + CV_Trend_Long_Term_Category = sapply(CV_Trend_Long_Term, categorize_cv_trend_long_term), # Column 21: Cloud_pct_clear (binned into intervals) Cloud_pct_clear = sapply(Cloud_pct_clear, bin_percentage), - # Column 22: Gap_score (2σ method) + # Column 22: Gap_score (1σ method); gap_scores_df now has all fields Gap_score = { if (!is.null(gap_scores_df) && nrow(gap_scores_df) > 0) { - gap_scores_df$gap_score[match(current_stats$Field_id, gap_scores_df$Field_id)] + gap_scores_df$gap_score[match(Field_id, gap_scores_df$Field_id)] } else { - rep(NA_real_, nrow(current_stats)) + rep(NA_real_, n()) } } ) %>% @@ -316,7 +333,7 @@ calculate_all_field_kpis <- function(current_stats, "Imminent_prob", "Cloud_pct_clear", "Cloud_category", "Gap_score")) ) - message(paste("✓ Built final output with", nrow(field_analysis_df), "fields and 22 columns")) + message(paste("✓ Built final output with", nrow(field_analysis_df), "fields (from GeoJSON) and 22 columns")) return(field_analysis_df) } diff --git a/r_app/80_utils_common.R b/r_app/80_utils_common.R index f74b7d9..0eb9da5 100644 --- a/r_app/80_utils_common.R +++ b/r_app/80_utils_common.R @@ -465,7 +465,7 @@ calculate_regression_slope <- function(values, decimal_places = 2) { }) } -#' Calculate Gap Filling Score KPI (2σ method) +#' Calculate Gap Filling Score KPI (1σ method) #' @param ci_raster Current week CI raster (single band) #' @param field_boundaries Field boundaries (sf or SpatVector) #' @return List with summary data frame and field-level results data frame @@ -490,7 +490,7 @@ calculate_gap_filling_kpi <- function(ci_raster, field_boundaries) { valid_values <- ci_values[!is.na(ci_values) & is.finite(ci_values)] if (length(valid_values) > 1) { - # Gap score using 2σ below median to detect outliers + # Gap score using 1σ below median to detect outliers median_ci <- median(valid_values) sd_ci <- sd(valid_values) outlier_threshold <- median_ci - (1 * sd_ci) @@ -544,8 +544,17 @@ calculate_gap_filling_kpi <- function(ci_raster, field_boundaries) { #' @param field_boundaries_sf sf object with field geometries #' @return data.frame with Field_id and gap_score columns calculate_gap_scores <- function(per_field_files, field_boundaries_sf) { - message("\nCalculating gap filling scores (2σ method)...") - message(paste(" Using per-field mosaics for", length(per_field_files), "fields")) + message("\nCalculating gap filling scores (1σ method)...") + message(paste(" Using per-field mosaics for", nrow(field_boundaries_sf), "fields")) + + # Initialize results with ALL field IDs (1185 for angata), defaulting to NA + # This ensures that even fields without satellite data this week are included in output + all_field_ids <- field_boundaries_sf$field + gap_scores_df <- data.frame( + Field_id = all_field_ids, + gap_score = NA_real_, + stringsAsFactors = FALSE + ) field_boundaries_by_id <- split(field_boundaries_sf, field_boundaries_sf$field) @@ -583,37 +592,43 @@ calculate_gap_scores <- function(per_field_files, field_boundaries_sf) { }) } - # Process fields sequentially with progress bar - message(" Processing gap scores for ", length(per_field_files), " fields...") - pb <- utils::txtProgressBar(min = 0, max = length(per_field_files), style = 3, width = 50) - - results_list <- lapply(seq_along(per_field_files), function(idx) { - result <- process_gap_for_field(per_field_files[[idx]]) - utils::setTxtProgressBar(pb, idx) - result - }) - close(pb) - - gap_scores_df <- dplyr::bind_rows(results_list) - - if (!is.null(gap_scores_df) && nrow(gap_scores_df) > 0) { - gap_scores_df <- gap_scores_df %>% + # Process only fields that have per-field mosaic files + n_files <- length(per_field_files) + if (n_files > 0) { + message(" Processing gap scores for", n_files, "available per-field mosaics...") + pb <- utils::txtProgressBar(min = 0, max = n_files, style = 3, width = 50) + + results_list <- lapply(seq_along(per_field_files), function(idx) { + result <- process_gap_for_field(per_field_files[[idx]]) + utils::setTxtProgressBar(pb, idx) + result + }) + close(pb) + + # Merge calculated results into the complete field list + results_df <- dplyr::bind_rows(results_list) %>% dplyr::group_by(Field_id) %>% dplyr::summarise(gap_score = mean(gap_score, na.rm = TRUE), .groups = "drop") - message(paste(" ✓ Calculated gap scores for", nrow(gap_scores_df), "fields")) - - # Guard against all-NA values which would produce Inf/-Inf warnings - if (any(is.finite(gap_scores_df$gap_score))) { - min_score <- round(min(gap_scores_df$gap_score, na.rm = TRUE), 2) - max_score <- round(max(gap_scores_df$gap_score, na.rm = TRUE), 2) - message(paste(" Gap score range:", min_score, "-", max_score, "%")) - } else { - message(" Gap score range: All values are NA (no valid gap scores)") - } + # Update gap_scores_df with calculated values (overwrite NAs where we have data) + gap_scores_df <- gap_scores_df %>% + left_join(results_df, by = "Field_id") %>% + mutate(gap_score = coalesce(gap_score.y, gap_score.x)) %>% + select(Field_id, gap_score) } else { - message(" WARNING: No gap scores calculated from per-field mosaics") - gap_scores_df <- NULL + message(" No per-field mosaic files available - all gap scores will be NA") + } + + # Always count how many fields got actual calculations (not NA) + n_calculated <- sum(!is.na(gap_scores_df$gap_score)) + message(paste(" ✓ Gap scores for", nrow(gap_scores_df), "fields (", n_calculated, "calculated,", + nrow(gap_scores_df) - n_calculated, "no data)")) + + # Guard against all-NA values which would produce Inf/-Inf warnings + if (any(is.finite(gap_scores_df$gap_score))) { + min_score <- round(min(gap_scores_df$gap_score, na.rm = TRUE), 2) + max_score <- round(max(gap_scores_df$gap_score, na.rm = TRUE), 2) + message(paste(" Gap score range:", min_score, "-", max_score, "%")) } return(gap_scores_df) diff --git a/r_app/MANUAL_PIPELINE_RUNNER.R b/r_app/MANUAL_PIPELINE_RUNNER.R index 0133024..9575bfd 100644 --- a/r_app/MANUAL_PIPELINE_RUNNER.R +++ b/r_app/MANUAL_PIPELINE_RUNNER.R @@ -382,7 +382,7 @@ # & "C:\Program Files\R\R-4.4.3\bin\x64\Rscript.exe" r_app/80_calculate_kpis.R [END_DATE] [PROJECT] [OFFSET] # # Example (2026-02-09, angata, 7-day lookback): -# & "C:\Program Files\R\R-4.4.3\bin\x64\Rscript.exe" r_app/80_calculate_kpis.R 2026-02-09 angata 7 +# # # EXPECTED OUTPUT: # File: {PROJECT}_field_analysis_week{WW}_{YYYY}.xlsx @@ -456,8 +456,8 @@ rmarkdown::render( # rmarkdown::render( rmarkdown::render( "r_app/91_CI_report_with_kpis_cane_supply.Rmd", - params = list(data_dir = "angata", report_date = as.Date("2026-02-04")), - output_file = "SmartCane_Report_cane_supply_angata_2026-02-04.docx", + params = list(data_dir = "angata", report_date = as.Date("2026-02-23")), + output_file = "SmartCane_Report_cane_supply_angata_2026-02-23_en.docx", output_dir = "laravel_app/storage/app/angata/reports" ) #