Refactor gap filling score calculations to use 1σ method; ensure all fields are included in output

This commit is contained in:
Timon 2026-02-24 13:48:06 +01:00
parent e0bfbccf0e
commit 83951efca2
3 changed files with 81 additions and 49 deletions

View file

@ -80,7 +80,8 @@ calculate_field_acreages <- function(field_boundaries_sf, unit = AREA_UNIT_PREFE
result_df <- data.frame(
field = field_names,
warning(paste("Could not calculate areas from geometries -", e$message)) stringsAsFactors = FALSE
area = areas,
stringsAsFactors = FALSE
)
colnames(result_df) <- c("field", col_name)
@ -213,14 +214,19 @@ calculate_status_alert <- function(imminent_prob, age_week, mean_ci,
#' Build complete per-field KPI dataframe with all 22 columns
#' @param current_stats data.frame with current week statistics from load_or_calculate_weekly_stats
#' Build complete per-field KPI dataframe - ALWAYS ALL FIELDS from GeoJSON
#'
#' CRITICAL: Always outputs all fields from field_boundaries_sf, regardless of data availability.
#' Fields without satellite data this week have all KPI columns set to NA.
#' This is dynamic - output row count equals nrow(field_boundaries_sf), not hardcoded.
#'
#' @param current_stats data.frame with current week statistics (may have fewer rows if data is incomplete)
#' @param planting_dates data.frame with field_id and planting_date columns
#' @param imminent_prob_data data.frame with Field_id and Imminent_prob_actual columns (or NULL)
#' @param gap_scores_df data.frame with Field_id and gap_score columns (or NULL)
#' @param field_boundaries_sf sf object with field geometries
#' @param gap_scores_df data.frame with Field_id and gap_score columns (guaranteed to have all fields)
#' @param field_boundaries_sf sf object with field geometries (authoritative source of all field IDs)
#' @param end_date Date object for current report date
#' @return data.frame with all 22 KPI columns
#' @return data.frame with 22 columns and nrow(field_boundaries_sf) rows (all fields, NAs for missing data)
calculate_all_field_kpis <- function(current_stats,
planting_dates,
imminent_prob_data,
@ -230,6 +236,14 @@ calculate_all_field_kpis <- function(current_stats,
message("\nBuilding final field analysis output...")
# CRITICAL: Start with ALL fields from field_boundaries_sf (dynamic, based on GeoJSON)
# This ensures output always includes all fields, even if they lack satellite data
all_fields <- field_boundaries_sf %>%
sf::st_drop_geometry() %>%
mutate(Field_id = field) %>%
select(Field_id) %>%
distinct()
# Pre-calculate areas using unified function
acreage_lookup <- calculate_field_acreages(field_boundaries_sf, unit = AREA_UNIT_PREFERENCE)
@ -237,7 +251,10 @@ calculate_all_field_kpis <- function(current_stats,
unit_label <- get_area_unit_label(AREA_UNIT_PREFERENCE)
area_col_name <- paste0("Area_", unit_label)
field_analysis_df <- current_stats %>%
# Left-join current_stats to all_fields to preserve all field IDs
# Fields without data in current_stats will have NA values
field_analysis_df <- all_fields %>%
left_join(current_stats, by = "Field_id") %>%
mutate(
# Column 2: Farm_Section (user fills manually)
Farm_Section = NA_character_,
@ -258,7 +275,7 @@ calculate_all_field_kpis <- function(current_stats,
# Column 9: Age_week (calculated)
Age_week = {
sapply(seq_len(nrow(current_stats)), function(idx) {
sapply(seq_len(n()), function(idx) {
calculate_age_week(Last_harvest_or_planting_date[idx], end_date)
})
},
@ -274,13 +291,13 @@ calculate_all_field_kpis <- function(current_stats,
if (!is.null(imminent_prob_data)) {
as.numeric(imminent_prob_data$Imminent_prob_actual[match(Field_id, imminent_prob_data$Field_id)])
} else {
rep(NA_real_, nrow(current_stats))
rep(NA_real_, n())
}
},
# Column 14: Status_Alert (multi-priority logic for harvest/milling workflow)
Status_Alert = {
sapply(seq_len(nrow(current_stats)), function(idx) {
sapply(seq_len(n()), function(idx) {
calculate_status_alert(
imminent_prob = Imminent_prob[idx],
age_week = Age_week[idx],
@ -293,17 +310,17 @@ calculate_all_field_kpis <- function(current_stats,
},
# Column 19b: CV_Trend_Long_Term_Category (categorical slope)
CV_Trend_Long_Term_Category = sapply(current_stats$CV_Trend_Long_Term, categorize_cv_trend_long_term),
CV_Trend_Long_Term_Category = sapply(CV_Trend_Long_Term, categorize_cv_trend_long_term),
# Column 21: Cloud_pct_clear (binned into intervals)
Cloud_pct_clear = sapply(Cloud_pct_clear, bin_percentage),
# Column 22: Gap_score (2σ method)
# Column 22: Gap_score (1σ method); gap_scores_df now has all fields
Gap_score = {
if (!is.null(gap_scores_df) && nrow(gap_scores_df) > 0) {
gap_scores_df$gap_score[match(current_stats$Field_id, gap_scores_df$Field_id)]
gap_scores_df$gap_score[match(Field_id, gap_scores_df$Field_id)]
} else {
rep(NA_real_, nrow(current_stats))
rep(NA_real_, n())
}
}
) %>%
@ -316,7 +333,7 @@ calculate_all_field_kpis <- function(current_stats,
"Imminent_prob", "Cloud_pct_clear", "Cloud_category", "Gap_score"))
)
message(paste("✓ Built final output with", nrow(field_analysis_df), "fields and 22 columns"))
message(paste("✓ Built final output with", nrow(field_analysis_df), "fields (from GeoJSON) and 22 columns"))
return(field_analysis_df)
}

View file

@ -465,7 +465,7 @@ calculate_regression_slope <- function(values, decimal_places = 2) {
})
}
#' Calculate Gap Filling Score KPI (2σ method)
#' Calculate Gap Filling Score KPI (1σ method)
#' @param ci_raster Current week CI raster (single band)
#' @param field_boundaries Field boundaries (sf or SpatVector)
#' @return List with summary data frame and field-level results data frame
@ -490,7 +490,7 @@ calculate_gap_filling_kpi <- function(ci_raster, field_boundaries) {
valid_values <- ci_values[!is.na(ci_values) & is.finite(ci_values)]
if (length(valid_values) > 1) {
# Gap score using 2σ below median to detect outliers
# Gap score using 1σ below median to detect outliers
median_ci <- median(valid_values)
sd_ci <- sd(valid_values)
outlier_threshold <- median_ci - (1 * sd_ci)
@ -544,8 +544,17 @@ calculate_gap_filling_kpi <- function(ci_raster, field_boundaries) {
#' @param field_boundaries_sf sf object with field geometries
#' @return data.frame with Field_id and gap_score columns
calculate_gap_scores <- function(per_field_files, field_boundaries_sf) {
message("\nCalculating gap filling scores (2σ method)...")
message(paste(" Using per-field mosaics for", length(per_field_files), "fields"))
message("\nCalculating gap filling scores (1σ method)...")
message(paste(" Using per-field mosaics for", nrow(field_boundaries_sf), "fields"))
# Initialize results with ALL field IDs (1185 for angata), defaulting to NA
# This ensures that even fields without satellite data this week are included in output
all_field_ids <- field_boundaries_sf$field
gap_scores_df <- data.frame(
Field_id = all_field_ids,
gap_score = NA_real_,
stringsAsFactors = FALSE
)
field_boundaries_by_id <- split(field_boundaries_sf, field_boundaries_sf$field)
@ -583,37 +592,43 @@ calculate_gap_scores <- function(per_field_files, field_boundaries_sf) {
})
}
# Process fields sequentially with progress bar
message(" Processing gap scores for ", length(per_field_files), " fields...")
pb <- utils::txtProgressBar(min = 0, max = length(per_field_files), style = 3, width = 50)
# Process only fields that have per-field mosaic files
n_files <- length(per_field_files)
if (n_files > 0) {
message(" Processing gap scores for", n_files, "available per-field mosaics...")
pb <- utils::txtProgressBar(min = 0, max = n_files, style = 3, width = 50)
results_list <- lapply(seq_along(per_field_files), function(idx) {
result <- process_gap_for_field(per_field_files[[idx]])
utils::setTxtProgressBar(pb, idx)
result
})
close(pb)
results_list <- lapply(seq_along(per_field_files), function(idx) {
result <- process_gap_for_field(per_field_files[[idx]])
utils::setTxtProgressBar(pb, idx)
result
})
close(pb)
gap_scores_df <- dplyr::bind_rows(results_list)
if (!is.null(gap_scores_df) && nrow(gap_scores_df) > 0) {
gap_scores_df <- gap_scores_df %>%
# Merge calculated results into the complete field list
results_df <- dplyr::bind_rows(results_list) %>%
dplyr::group_by(Field_id) %>%
dplyr::summarise(gap_score = mean(gap_score, na.rm = TRUE), .groups = "drop")
message(paste(" ✓ Calculated gap scores for", nrow(gap_scores_df), "fields"))
# Guard against all-NA values which would produce Inf/-Inf warnings
if (any(is.finite(gap_scores_df$gap_score))) {
min_score <- round(min(gap_scores_df$gap_score, na.rm = TRUE), 2)
max_score <- round(max(gap_scores_df$gap_score, na.rm = TRUE), 2)
message(paste(" Gap score range:", min_score, "-", max_score, "%"))
} else {
message(" Gap score range: All values are NA (no valid gap scores)")
}
# Update gap_scores_df with calculated values (overwrite NAs where we have data)
gap_scores_df <- gap_scores_df %>%
left_join(results_df, by = "Field_id") %>%
mutate(gap_score = coalesce(gap_score.y, gap_score.x)) %>%
select(Field_id, gap_score)
} else {
message(" WARNING: No gap scores calculated from per-field mosaics")
gap_scores_df <- NULL
message(" No per-field mosaic files available - all gap scores will be NA")
}
# Always count how many fields got actual calculations (not NA)
n_calculated <- sum(!is.na(gap_scores_df$gap_score))
message(paste(" ✓ Gap scores for", nrow(gap_scores_df), "fields (", n_calculated, "calculated,",
nrow(gap_scores_df) - n_calculated, "no data)"))
# Guard against all-NA values which would produce Inf/-Inf warnings
if (any(is.finite(gap_scores_df$gap_score))) {
min_score <- round(min(gap_scores_df$gap_score, na.rm = TRUE), 2)
max_score <- round(max(gap_scores_df$gap_score, na.rm = TRUE), 2)
message(paste(" Gap score range:", min_score, "-", max_score, "%"))
}
return(gap_scores_df)

View file

@ -382,7 +382,7 @@
# & "C:\Program Files\R\R-4.4.3\bin\x64\Rscript.exe" r_app/80_calculate_kpis.R [END_DATE] [PROJECT] [OFFSET]
#
# Example (2026-02-09, angata, 7-day lookback):
# & "C:\Program Files\R\R-4.4.3\bin\x64\Rscript.exe" r_app/80_calculate_kpis.R 2026-02-09 angata 7
#
#
# EXPECTED OUTPUT:
# File: {PROJECT}_field_analysis_week{WW}_{YYYY}.xlsx
@ -456,8 +456,8 @@ rmarkdown::render(
# rmarkdown::render(
rmarkdown::render(
"r_app/91_CI_report_with_kpis_cane_supply.Rmd",
params = list(data_dir = "angata", report_date = as.Date("2026-02-04")),
output_file = "SmartCane_Report_cane_supply_angata_2026-02-04.docx",
params = list(data_dir = "angata", report_date = as.Date("2026-02-23")),
output_file = "SmartCane_Report_cane_supply_angata_2026-02-23_en.docx",
output_dir = "laravel_app/storage/app/angata/reports"
)
#