SmartCane/r_app/80_calculate_kpis.R
DimitraVeropoulou f94a6317bd feat: Integrate 2σ gap filling KPI into weekly field analysis
- Changed gap calculation from Q25 to 2σ below median method (kpi_utils.R)
- Integrated gap filling into script 80 with tile-based processing
- Added Gap_score column to field analysis output (Excel/CSV/RDS)
- Fixed memory issues by processing 25 tiles individually instead of merging
- Fixed Field_id matching to ensure gap scores populate correctly

Gap scores now calculate for all 1185 fields with range 0-11.25%
Works with tile-based mosaics (Angata 5x5 grid) without memory errors
2026-02-03 16:41:04 +01:00

863 lines
33 KiB
R
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# 80_CALCULATE_KPIS.R (CONSOLIDATED KPI CALCULATION)
# ============================================================================
# UNIFIED KPI CALCULATION SCRIPT
#
# This script combines:
# 1. Per-field weekly analysis (from 09c: field-level trends, phases, statuses)
# 2. Farm-level KPI metrics (from old 09: 6 high-level indicators)
#
# FEATURES:
# - Per-field analysis with SC-64 enhancements (4-week trends, CI percentiles, etc.)
# - Farm-level KPI calculation (6 metrics for executive overview)
# - Parallel processing (tile-aware, 1000+ fields supported)
# - Comprehensive Excel + RDS + CSV exports (21 columns per spec)
# - Test mode for development
# CRITICAL INTEGRATIONS:
#
# 1. IMMINENT_PROB FROM HARVEST MODEL (MODEL_307)
# [✓] Load script 31 output: {project}_week_{WW}_{YYYY}.csv
# Columns: field, imminent_prob, detected_prob, week, year
# [✓] LEFT JOIN to field_analysis_df by field
# [✓] Use actual harvest probability data instead of placeholder
#
# 2. AGE FROM HARVEST.XLSX (SCRIPTS 22 & 23)
# [✓] Load harvest.xlsx with planting_date (season_start)
# [✓] Extract planting dates per field
# [✓] Calculate Age_week = difftime(report_date, planting_date, units="weeks")
#
# COMMAND-LINE USAGE:
# Option 1: Rscript 80_calculate_kpis.R 2026-01-14 angata
# Arguments: [end_date] [project_dir]
#
# Option 2: Rscript 80_calculate_kpis.R 2026-01-14 angata 7
# Arguments: [end_date] [project_dir] [offset_days]
#
# & "C:\Program Files\R\R-4.4.3\bin\x64\Rscript.exe" r_app/80_calculate_kpis.R 2026-01-12 angata 7
#
# Usage in run_full_pipeline.R:
# source("r_app/80_calculate_kpis.R")
# main()
# ============================================================================
# NEXT INTEGRATIONS (See Linear issues for detailed requirements)
# ============================================================================
# 1. [✓] Load imminent_prob from script 31 (week_WW_YYYY.csv)
# 2. [✓] Load planting_date from harvest.xlsx for field-specific age calculation
# 3. [ ] Improve Status_trigger logic to use actual imminent_prob values
# ============================================================================
# ============================================================================
# CONFIGURATION
# ============================================================================
FOUR_WEEK_TREND_STRONG_GROWTH_MIN <- 0.5
FOUR_WEEK_TREND_GROWTH_MIN <- 0.1
FOUR_WEEK_TREND_GROWTH_MAX <- 0.5
FOUR_WEEK_TREND_NO_GROWTH_RANGE <- 0.1
FOUR_WEEK_TREND_DECLINE_MAX <- -0.1
FOUR_WEEK_TREND_DECLINE_MIN <- -0.5
FOUR_WEEK_TREND_STRONG_DECLINE_MAX <- -0.5
# CV TREND THRESHOLDS
CV_TREND_THRESHOLD_SIGNIFICANT <- 0.05
# CV_TREND_LONG_TERM (8-WEEK SLOPE) INTERPRETATION THRESHOLDS
# Interpretation: Slope of CV over 8 weeks indicates field uniformity trend
# Negative slope = CV decreasing = field becoming MORE uniform = GOOD
# Positive slope = CV increasing = field becoming MORE patchy = BAD
# Near zero = Homogenous growth (all crops progressing equally)
CV_SLOPE_STRONG_IMPROVEMENT_MIN <- -0.03 # CV decreasing rapidly (>3% drop over 8 weeks)
CV_SLOPE_IMPROVEMENT_MIN <- -0.02 # CV decreasing (2-3% drop over 8 weeks)
CV_SLOPE_IMPROVEMENT_MAX <- -0.01 # Gradual improvement (1-2% drop over 8 weeks)
CV_SLOPE_HOMOGENOUS_MIN <- -0.01 # Essentially stable (small natural variation)
CV_SLOPE_HOMOGENOUS_MAX <- 0.01 # No change in uniformity (within ±1% over 8 weeks)
CV_SLOPE_PATCHINESS_MIN <- 0.01 # Minor divergence (1-2% increase over 8 weeks)
CV_SLOPE_PATCHINESS_MAX <- 0.02 # Growing patchiness (2-3% increase over 8 weeks)
CV_SLOPE_SEVERE_MIN <- 0.02 # Severe fragmentation (>3% increase over 8 weeks)
# PERCENTILE CALCULATIONS
CI_PERCENTILE_LOW <- 0.10
CI_PERCENTILE_HIGH <- 0.90
# GERMINATION THRESHOLD (for germination_progress calculation)
GERMINATION_CI_THRESHOLD <- 2.0
# PLANTING DATE & AGE CONFIGURATION
# Load from harvest.xlsx (scripts 22 & 23) - no fallback to uniform dates
# HISTORICAL DATA LOOKBACK
WEEKS_FOR_FOUR_WEEK_TREND <- 4
WEEKS_FOR_CV_TREND_SHORT <- 2
WEEKS_FOR_CV_TREND_LONG <- 8
# ============================================================================
# 1. Load required libraries
# ============================================================================
suppressPackageStartupMessages({
library(here)
library(sf)
library(terra)
library(dplyr)
library(tidyr)
library(lubridate)
library(readr)
library(readxl)
library(writexl)
library(purrr)
library(furrr)
library(future)
library(caret)
library(CAST)
library(randomForest)
tryCatch({
library(torch)
}, error = function(e) {
message("Note: torch package not available - harvest model inference will be skipped")
})
})
# ============================================================================
# LOAD UTILITY FUNCTIONS FROM SEPARATED MODULES
# ============================================================================
tryCatch({
source(here("r_app", "80_weekly_stats_utils.R"))
}, error = function(e) {
stop("Error loading 80_weekly_stats_utils.R: ", e$message)
})
tryCatch({
source(here("r_app", "80_report_building_utils.R"))
}, error = function(e) {
stop("Error loading 80_report_building_utils.R: ", e$message)
})
tryCatch({
source(here("r_app", "kpi_utils.R"))
}, error = function(e) {
stop("Error loading kpi_utils.R: ", e$message)
})
# ============================================================================
# PHASE AND STATUS TRIGGER DEFINITIONS
# ============================================================================
PHASE_DEFINITIONS <- data.frame(
phase = c("Germination", "Tillering", "Grand Growth", "Maturation"),
age_start = c(0, 4, 17, 39),
age_end = c(6, 16, 39, 200),
stringsAsFactors = FALSE
)
STATUS_TRIGGERS <- data.frame(
trigger = c(
"germination_started",
"germination_complete",
"stress_detected_whole_field",
"strong_recovery",
"growth_on_track",
"maturation_progressing",
"harvest_ready"
),
age_min = c(0, 0, NA, NA, 4, 39, 45),
age_max = c(6, 6, NA, NA, 39, 200, 200),
description = c(
"10% of field CI > 2",
"70% of field CI >= 2",
"CI decline > -1.5 + low CV",
"CI increase > +1.5",
"CI increasing consistently",
"High CI, stable/declining",
"Age 45+ weeks (ready to harvest)"
),
stringsAsFactors = FALSE
)
# ============================================================================
# MAIN
# ============================================================================
# ============================================================================
# MAIN
# ============================================================================
main <- function() {
# Parse command-line arguments
args <- commandArgs(trailingOnly = TRUE)
# end_date (arg 1)
# Priority: 1) Command-line arg, 2) Global end_date variable (for recursive calls), 3) Global end_date_str, 4) Sys.Date()
end_date <- if (length(args) >= 1 && !is.na(args[1])) {
as.Date(args[1])
} else if (exists("end_date", envir = .GlobalEnv)) {
global_date <- get("end_date", envir = .GlobalEnv)
# Check if it's a valid Date with length > 0
if (is.Date(global_date) && length(global_date) > 0 && !is.na(global_date)) {
global_date
} else if (exists("end_date_str", envir = .GlobalEnv)) {
as.Date(get("end_date_str", envir = .GlobalEnv))
} else {
Sys.Date()
}
} else if (exists("end_date_str", envir = .GlobalEnv)) {
as.Date(get("end_date_str", envir = .GlobalEnv))
} else {
Sys.Date()
}
# project_dir (arg 2)
project_dir <- if (length(args) >= 2 && !is.na(args[2])) {
as.character(args[2])
} else if (exists("project_dir", envir = .GlobalEnv)) {
get("project_dir", envir = .GlobalEnv)
} else {
"angata"
}
# offset (arg 3) - for backward compatibility with old 09
offset <- if (length(args) >= 3 && !is.na(args[3])) {
as.numeric(args[3])
} else {
7
}
# Validate end_date is a proper Date object
if (is.null(end_date) || length(end_date) == 0 || !inherits(end_date, "Date")) {
stop("ERROR: end_date is not valid. Got: ", class(end_date), " with length ", length(end_date))
}
assign("project_dir", project_dir, envir = .GlobalEnv)
assign("end_date_str", format(end_date, "%Y-%m-%d"), envir = .GlobalEnv)
message("\n", strrep("=", 70))
message("80_CALCULATE_KPIs.R - CONSOLIDATED KPI CALCULATION")
message(strrep("=", 70))
message("Date:", format(end_date, "%Y-%m-%d"))
message("Project:", project_dir)
message("Mode: Per-field analysis (SC-64) + Farm-level KPIs")
message("")
# Load configuration and utilities
# source(here("r_app", "crop_messaging_utils.R"))
tryCatch({
source(here("r_app", "parameters_project.R"))
}, error = function(e) {
stop("Error loading parameters_project.R: ", e$message)
})
tryCatch({
source(here("r_app", "30_growth_model_utils.R"))
}, error = function(e) {
warning("30_growth_model_utils.R not found - yield prediction KPI will use placeholder data")
})
# ========== PER-FIELD ANALYSIS (SC-64) ==========
message("\n", strrep("-", 70))
message("PHASE 1: PER-FIELD WEEKLY ANALYSIS (SC-64 ENHANCEMENTS)")
message(strrep("-", 70))
current_week <- as.numeric(format(end_date, "%V"))
year <- as.numeric(format(end_date, "%Y"))
previous_week <- current_week - 1
if (previous_week < 1) previous_week <- 52
message(paste("Week:", current_week, "/ Year:", year))
# Find tile files - approach from Script 20
message("Finding tile files...")
tile_pattern <- sprintf("week_%02d_%d_([0-9]{2})\\.tif", current_week, year)
# Detect grid size subdirectory
detected_grid_size <- NA
if (dir.exists(weekly_tile_max)) {
subfolders <- list.dirs(weekly_tile_max, full.names = FALSE, recursive = FALSE)
grid_patterns <- grep("^\\d+x\\d+$", subfolders, value = TRUE)
if (length(grid_patterns) > 0) {
detected_grid_size <- grid_patterns[1]
mosaic_dir <- file.path(weekly_tile_max, detected_grid_size)
message(paste(" Using grid-size subdirectory:", detected_grid_size))
}
}
tile_files <- list.files(mosaic_dir, pattern = tile_pattern, full.names = TRUE)
if (length(tile_files) == 0) {
stop(paste("No tile files found for week", current_week, year, "in", mosaic_dir))
}
message(paste(" Found", length(tile_files), "tiles"))
# Load field boundaries
tryCatch({
boundaries_result <- load_field_boundaries(data_dir)
if (is.list(boundaries_result) && "field_boundaries_sf" %in% names(boundaries_result)) {
field_boundaries_sf <- boundaries_result$field_boundaries_sf
} else {
field_boundaries_sf <- boundaries_result
}
if (nrow(field_boundaries_sf) == 0) {
stop("No fields loaded from boundaries")
}
message(paste(" Loaded", nrow(field_boundaries_sf), "fields"))
}, error = function(e) {
stop("ERROR loading field boundaries: ", e$message)
})
message("Loading historical field data for trend calculations...")
# Load up to 8 weeks (max of 4-week and 8-week trend requirements)
# Function gracefully handles missing weeks and loads whatever exists
num_weeks_to_load <- max(WEEKS_FOR_FOUR_WEEK_TREND, WEEKS_FOR_CV_TREND_LONG) # Always 8
message(paste(" Attempting to load up to", num_weeks_to_load, "weeks of historical data..."))
# Only auto-generate on first call (not in recursive calls from within load_historical_field_data)
allow_auto_gen <- !exists("_INSIDE_AUTO_GENERATE", envir = .GlobalEnv)
historical_data <- load_historical_field_data(project_dir, current_week, reports_dir,
num_weeks = num_weeks_to_load,
auto_generate = allow_auto_gen,
field_boundaries_sf = field_boundaries_sf)
# Load harvest.xlsx for planting dates (season_start)
message("\nLoading harvest data from harvest.xlsx for planting dates...")
harvest_file_path <- file.path(data_dir, "harvest.xlsx")
harvesting_data <- tryCatch({
if (file.exists(harvest_file_path)) {
harvest_raw <- readxl::read_excel(harvest_file_path)
harvest_raw$season_start <- as.Date(harvest_raw$season_start)
harvest_raw$season_end <- as.Date(harvest_raw$season_end)
message(paste(" ✓ Loaded harvest data:", nrow(harvest_raw), "rows"))
harvest_raw
} else {
message(paste(" WARNING: harvest.xlsx not found at", harvest_file_path))
NULL
}
}, error = function(e) {
message(paste(" ERROR loading harvest.xlsx:", e$message))
NULL
})
planting_dates <- extract_planting_dates(harvesting_data, field_boundaries_sf)
# Validate planting_dates
if (is.null(planting_dates) || nrow(planting_dates) == 0) {
message("WARNING: No planting dates available. Using NA for all fields.")
planting_dates <- data.frame(
field_id = field_boundaries_sf$field,
date = rep(as.Date(NA), nrow(field_boundaries_sf)),
stringsAsFactors = FALSE
)
}
# SCRIPT 20 APPROACH: Loop through tiles, extract all fields from each tile
# ============================================================================
# NEW MODULAR APPROACH: Load/Calculate weekly stats, apply trends
# ============================================================================
# Build tile grid (needed by calculate_field_statistics)
message("\nBuilding tile grid for current week...")
tile_grid <- build_tile_grid(mosaic_dir, current_week, year)
message("\nUsing modular RDS-based approach for weekly statistics...")
# Load/calculate CURRENT week stats (from tiles or RDS cache)
message("\n1. Loading/calculating CURRENT week statistics (week", current_week, ")...")
current_stats <- load_or_calculate_weekly_stats(
week_num = current_week,
year = year,
project_dir = project_dir,
field_boundaries_sf = field_boundaries_sf,
mosaic_dir = tile_grid$mosaic_dir,
reports_dir = reports_dir,
report_date = end_date
)
message(paste(" ✓ Loaded/calculated stats for", nrow(current_stats), "fields in current week"))
# Load/calculate PREVIOUS week stats (from RDS cache or tiles)
message("\n2. Loading/calculating PREVIOUS week statistics (week", previous_week, ")...")
# Calculate report date for previous week (7 days before current)
prev_report_date <- end_date - 7
prev_stats <- load_or_calculate_weekly_stats(
week_num = previous_week,
year = year,
project_dir = project_dir,
field_boundaries_sf = field_boundaries_sf,
mosaic_dir = tile_grid$mosaic_dir,
reports_dir = reports_dir,
report_date = prev_report_date
)
message(paste(" ✓ Loaded/calculated stats for", nrow(prev_stats), "fields in previous week"))
message(paste(" Columns in prev_stats:", paste(names(prev_stats), collapse = ", ")))
message(paste(" CV column non-NA values in prev_stats:", sum(!is.na(prev_stats$CV))))
# Apply trend calculations (requires both weeks)
message("\n3. Calculating trend columns...")
current_stats <- calculate_kpi_trends(current_stats, prev_stats,
project_dir = project_dir,
reports_dir = reports_dir,
current_week = current_week,
year = year)
message(paste(" ✓ Added Weekly_ci_change, CV_Trend_Short_Term, Four_week_trend, CV_Trend_Long_Term, nmr_of_weeks_analysed"))
# Load weekly harvest probabilities from script 31 (if available)
message("\n4. Loading harvest probabilities from script 31...")
harvest_prob_file <- file.path(reports_dir, "kpis", "field_stats",
sprintf("%s_harvest_imminent_week_%02d_%d.csv", project_dir, current_week, year))
message(paste(" Looking for:", harvest_prob_file))
imminent_prob_data <- tryCatch({
if (file.exists(harvest_prob_file)) {
prob_df <- readr::read_csv(harvest_prob_file, show_col_types = FALSE)
message(paste(" ✓ Loaded harvest probabilities for", nrow(prob_df), "fields"))
prob_df %>%
select(field, imminent_prob, detected_prob) %>%
rename(Field_id = field, Imminent_prob_actual = imminent_prob, Detected_prob = detected_prob)
} else {
message(paste(" INFO: Harvest probabilities not available (script 31 not run)"))
NULL
}
}, error = function(e) {
message(paste(" WARNING: Could not load harvest probabilities:", e$message))
NULL
})
# ============================================================================
# CALCULATE GAP FILLING KPI (2σ method from kpi_utils.R)
# ============================================================================
message("\nCalculating gap filling scores (2σ method)...")
# Try single merged mosaic first, then fall back to merging tiles
week_mosaic_file <- file.path(mosaic_dir, sprintf("week_%02d_%d.tif", current_week, year))
gap_scores_df <- NULL
if (file.exists(week_mosaic_file)) {
# Single merged mosaic exists - use it directly
tryCatch({
current_week_raster <- terra::rast(week_mosaic_file)
current_ci_band <- current_week_raster[[5]] # CI is the 5th band
names(current_ci_band) <- "CI"
message(paste(" Loaded single mosaic:", week_mosaic_file))
# Calculate gap scores for all fields
gap_result <- calculate_gap_filling_kpi(current_ci_band, field_boundaries_sf)
# Extract field-level results (use field column directly to match current_stats Field_id)
gap_scores_df <- gap_result$field_results %>%
mutate(Field_id = field) %>%
select(Field_id, gap_score)
message(paste(" ✓ Calculated gap scores for", nrow(gap_scores_df), "fields"))
message(paste(" Gap score range:", round(min(gap_scores_df$gap_score, na.rm=TRUE), 2), "-", round(max(gap_scores_df$gap_score, na.rm=TRUE), 2), "%"))
}, error = function(e) {
message(paste(" WARNING: Could not calculate gap scores from single mosaic:", e$message))
message(" Gap scores will be set to NA")
gap_scores_df <- NULL
})
} else {
# Single mosaic doesn't exist - check for tiles and process per-tile
message(" Single mosaic not found. Checking for tiles...")
# List all tiles for this week (e.g., week_04_2026_01.tif through week_04_2026_25.tif)
tile_pattern <- sprintf("week_%02d_%d_\\d{2}\\.tif$", current_week, year)
tile_files <- list.files(mosaic_dir, pattern = tile_pattern, full.names = TRUE)
if (length(tile_files) == 0) {
message(sprintf(" WARNING: No tiles found matching pattern: %s in %s", tile_pattern, mosaic_dir))
message(" Gap scores will be set to NA")
} else {
tryCatch({
message(sprintf(" Found %d tiles. Processing per-tile (memory efficient)...", length(tile_files)))
# Process each tile separately and accumulate results
all_tile_results <- list()
for (i in seq_along(tile_files)) {
tile_file <- tile_files[i]
# Load tile raster
tile_raster <- terra::rast(tile_file)
tile_ci_band <- tile_raster[[5]]
names(tile_ci_band) <- "CI"
# Calculate gap scores for fields in this tile
tile_gap_result <- calculate_gap_filling_kpi(tile_ci_band, field_boundaries_sf)
# Store results (only keep fields with non-NA scores, use field directly to match current_stats)
if (!is.null(tile_gap_result$field_results) && nrow(tile_gap_result$field_results) > 0) {
tile_results_clean <- tile_gap_result$field_results %>%
mutate(Field_id = field) %>%
select(Field_id, gap_score) %>%
filter(!is.na(gap_score))
if (nrow(tile_results_clean) > 0) {
all_tile_results[[i]] <- tile_results_clean
}
}
# Clear memory
rm(tile_raster, tile_ci_band, tile_gap_result)
gc(verbose = FALSE)
}
# Combine all tile results
if (length(all_tile_results) > 0) {
gap_scores_df <- bind_rows(all_tile_results)
# If a field appears in multiple tiles, take the maximum gap score
gap_scores_df <- gap_scores_df %>%
group_by(Field_id) %>%
summarise(gap_score = max(gap_score, na.rm = TRUE), .groups = "drop")
message(paste(" ✓ Calculated gap scores for", nrow(gap_scores_df), "fields across", length(all_tile_results), "tiles"))
message(paste(" Gap score range:", round(min(gap_scores_df$gap_score, na.rm=TRUE), 2), "-", round(max(gap_scores_df$gap_score, na.rm=TRUE), 2), "%"))
} else {
message(" WARNING: No gap scores calculated from any tiles")
gap_scores_df <- NULL
}
}, error = function(e) {
message(paste(" WARNING: Could not process tiles or calculate gap scores:", e$message))
message(" Gap scores will be set to NA")
gap_scores_df <- NULL
})
}
}
# ============================================================================
# Build final output dataframe with all 22 columns (including Gap_score)
# ============================================================================
message("\nBuilding final field analysis output...")
# Pre-calculate acreages with geometry validation
# This avoids geometry errors during field_analysis construction
acreage_lookup <- tryCatch({
lookup_df <- field_boundaries_sf %>%
sf::st_drop_geometry() %>%
as.data.frame() %>%
mutate(
geometry_valid = sapply(seq_len(nrow(field_boundaries_sf)), function(idx) {
tryCatch({
sf::st_is_valid(field_boundaries_sf[idx, ])
}, error = function(e) FALSE)
}),
area_ha = 0
)
# Calculate area for valid geometries
for (idx in which(lookup_df$geometry_valid)) {
tryCatch({
area_m2 <- as.numeric(sf::st_area(field_boundaries_sf[idx, ]))
lookup_df$area_ha[idx] <- area_m2 / 10000
}, error = function(e) {
lookup_df$area_ha[idx] <<- NA_real_
})
}
# Convert hectares to acres
lookup_df %>%
mutate(acreage = area_ha / 0.404686) %>%
select(field, acreage)
}, error = function(e) {
message(paste("Warning: Could not calculate acreages from geometries -", e$message))
data.frame(field = character(0), acreage = numeric(0))
})
field_analysis_df <- current_stats %>%
mutate(
# Column 2: Farm_Section (user fills)
Farm_Section = NA_character_,
# Column 3: Field_name (from GeoJSON - already have Field_id, can look up)
Field_name = Field_id,
# Column 4: Acreage (calculate from geometry)
Acreage = {
acreages_vec <- acreage_lookup$acreage[match(Field_id, acreage_lookup$field)]
if_else(is.na(acreages_vec), 0, acreages_vec)
},
# Columns 5-6: Already in current_stats (Mean_CI, Weekly_ci_change)
# Column 7: Four_week_trend (from current_stats)
# Column 8: Last_harvest_or_planting_date (from harvest.xlsx - season_start)
Last_harvest_or_planting_date = {
planting_dates$planting_date[match(Field_id, planting_dates$field_id)]
},
# Column 9: Age_week (calculated from report date and planting date)
Age_week = {
sapply(seq_len(nrow(current_stats)), function(idx) {
planting_dt <- Last_harvest_or_planting_date[idx]
if (is.na(planting_dt)) {
return(NA_real_)
}
round(as.numeric(difftime(end_date, planting_dt, units = "weeks")), 0)
})
},
# Column 10: Phase (recalculate based on updated Age_week)
Phase = {
sapply(Age_week, function(age) {
if (is.na(age)) return(NA_character_)
if (age >= 0 & age < 4) return("Germination")
if (age >= 4 & age < 17) return("Tillering")
if (age >= 17 & age < 39) return("Grand Growth")
if (age >= 39) return("Maturation")
NA_character_
})
},
# Column 11: nmr_of_weeks_analysed (already in current_stats from calculate_kpi_trends)
# Column 12: Germination_progress (calculated here from CI values)
# Bin Pct_pixels_CI_gte_2 into 10% intervals: 0-10%, 10-20%, ..., 80-90%, 90-95%, 95-100%
Germination_progress = sapply(Pct_pixels_CI_gte_2, function(pct) {
if (is.na(pct)) return(NA_character_)
if (pct >= 95) return("95-100%")
else if (pct >= 90) return("90-95%")
else if (pct >= 80) return("80-90%")
else if (pct >= 70) return("70-80%")
else if (pct >= 60) return("60-70%")
else if (pct >= 50) return("50-60%")
else if (pct >= 40) return("40-50%")
else if (pct >= 30) return("30-40%")
else if (pct >= 20) return("20-30%")
else if (pct >= 10) return("10-20%")
else return("0-10%")
}),
# Column 13: Imminent_prob (from script 31 or NA if not available)
Imminent_prob = {
if (!is.null(imminent_prob_data)) {
imminent_prob_data$Imminent_prob_actual[match(Field_id, imminent_prob_data$Field_id)]
} else {
rep(NA_real_, nrow(current_stats))
}
},
# Column 14: Status_Alert (based on harvest probability + crop health status)
# Priority order: Ready for harvest-check → Strong decline → Harvested/bare → NA
Status_Alert = {
sapply(seq_len(nrow(current_stats)), function(idx) {
imminent_prob <- Imminent_prob[idx]
age_w <- Age_week[idx]
weekly_ci_chg <- Weekly_ci_change[idx]
mean_ci_val <- Mean_CI[idx]
# Priority 1: Ready for harvest-check (imminent + mature cane ≥12 months)
if (!is.na(imminent_prob) && imminent_prob > 0.5 && !is.na(age_w) && age_w >= 52) {
return("Ready for harvest-check")
}
# Priority 2: Strong decline in crop health (drop ≥2 points but still >1.5)
if (!is.na(weekly_ci_chg) && weekly_ci_chg <= -2.0 && !is.na(mean_ci_val) && mean_ci_val > 1.5) {
return("Strong decline in crop health")
}
# Priority 3: Harvested/bare (Mean CI < 1.5)
if (!is.na(mean_ci_val) && mean_ci_val < 1.5) {
return("Harvested/bare")
}
# Fallback: no alert
NA_character_
})
},
# Columns 15-16: CI-based columns already in current_stats (CI_range, CI_Percentiles)
# Column 17: Already in current_stats (CV)
# Column 18: Already in current_stats (CV_Trend_Short_Term)
# Column 19: CV_Trend_Long_Term (from current_stats - raw slope value)
# Column 19b: CV_Trend_Long_Term_Category (categorical interpretation of slope)
# 3 classes: More uniform (slope < -0.01), Stable uniformity (-0.01 to 0.01), Less uniform (slope > 0.01)
CV_Trend_Long_Term_Category = {
sapply(current_stats$CV_Trend_Long_Term, function(slope) {
if (is.na(slope)) {
return(NA_character_)
} else if (slope < -0.01) {
return("More uniform")
} else if (slope > 0.01) {
return("Less uniform")
} else {
return("Stable uniformity")
}
})
},
# Columns 20-21: Already in current_stats (Cloud_pct_clear, Cloud_category)
# Bin Cloud_pct_clear into 10% intervals: 0-10%, 10-20%, ..., 80-90%, 90-95%, 95-100%
Cloud_pct_clear = sapply(Cloud_pct_clear, function(pct) {
if (is.na(pct)) return(NA_character_)
if (pct >= 95) return("95-100%")
else if (pct >= 90) return("90-95%")
else if (pct >= 80) return("80-90%")
else if (pct >= 70) return("70-80%")
else if (pct >= 60) return("60-70%")
else if (pct >= 50) return("50-60%")
else if (pct >= 40) return("40-50%")
else if (pct >= 30) return("30-40%")
else if (pct >= 20) return("20-30%")
else if (pct >= 10) return("10-20%")
else return("0-10%")
}),
# Column 22: Gap_score (2σ below median - from kpi_utils.R)
Gap_score = {
if (!is.null(gap_scores_df) && nrow(gap_scores_df) > 0) {
# Debug: Print first few gap scores
message(sprintf(" Joining %d gap scores to field_analysis (first 3: %s)",
nrow(gap_scores_df),
paste(head(gap_scores_df$gap_score, 3), collapse=", ")))
message(sprintf(" First 3 Field_ids in gap_scores_df: %s",
paste(head(gap_scores_df$Field_id, 3), collapse=", ")))
message(sprintf(" First 3 Field_ids in current_stats: %s",
paste(head(current_stats$Field_id, 3), collapse=", ")))
gap_scores_df$gap_score[match(current_stats$Field_id, gap_scores_df$Field_id)]
} else {
rep(NA_real_, nrow(current_stats))
}
}
) %>%
select(
all_of(c("Field_id", "Farm_Section", "Field_name", "Acreage", "Status_Alert",
"Last_harvest_or_planting_date", "Age_week", "Phase",
"Germination_progress",
"Mean_CI", "Weekly_ci_change", "Four_week_trend", "CI_range", "CI_Percentiles",
"CV", "CV_Trend_Short_Term", "CV_Trend_Long_Term", "CV_Trend_Long_Term_Category",
"Imminent_prob", "Cloud_pct_clear", "Cloud_category", "Gap_score"))
)
message(paste("✓ Built final output with", nrow(field_analysis_df), "fields and 22 columns (including Gap_score)"))
export_paths <- export_field_analysis_excel(
field_analysis_df,
NULL,
project_dir,
current_week,
year,
reports_dir
)
cat("\n--- Per-field Results (first 10) ---\n")
available_cols <- c("Field_id", "Acreage", "Age_week", "Mean_CI", "Four_week_trend", "Status_Alert", "Cloud_category")
available_cols <- available_cols[available_cols %in% names(field_analysis_df)]
if (length(available_cols) > 0) {
print(head(field_analysis_df[, available_cols], 10))
}
# ========== FARM-LEVEL KPI AGGREGATION ==========
# Aggregate the per-field analysis into farm-level summary statistics
cat("\n=== CALCULATING FARM-LEVEL KPI SUMMARY ===\n")
# Filter to only fields that have actual data (non-NA CI and valid acreage)
field_data <- field_analysis_df %>%
filter(!is.na(Mean_CI) & !is.na(Acreage)) %>%
filter(Acreage > 0)
if (nrow(field_data) > 0) {
if (nrow(field_data) > 0) {
# Create summary statistics
farm_summary <- list()
# 1. PHASE DISTRIBUTION
phase_dist <- field_data %>%
group_by(Phase) %>%
summarise(
num_fields = n(),
acreage = sum(Acreage, na.rm = TRUE),
.groups = 'drop'
) %>%
rename(Category = Phase)
farm_summary$phase_distribution <- phase_dist
# 2. STATUS ALERT DISTRIBUTION
status_dist <- field_data %>%
group_by(Status_Alert) %>%
summarise(
num_fields = n(),
acreage = sum(Acreage, na.rm = TRUE),
.groups = 'drop'
) %>%
rename(Category = Status_Alert)
farm_summary$status_distribution <- status_dist
# 3. CLOUD COVERAGE DISTRIBUTION
cloud_dist <- field_data %>%
group_by(Cloud_category) %>%
summarise(
num_fields = n(),
acreage = sum(Acreage, na.rm = TRUE),
.groups = 'drop'
) %>%
rename(Category = Cloud_category)
farm_summary$cloud_distribution <- cloud_dist
# 4. OVERALL STATISTICS
farm_summary$overall_stats <- data.frame(
total_fields = nrow(field_data),
total_acreage = sum(field_data$Acreage, na.rm = TRUE),
mean_ci = round(mean(field_data$Mean_CI, na.rm = TRUE), 2),
median_ci = round(median(field_data$Mean_CI, na.rm = TRUE), 2),
mean_cv = round(mean(field_data$CI_CV, na.rm = TRUE), 4),
week = current_week,
year = year,
date = as.character(end_date)
)
# Print summaries
cat("\n--- PHASE DISTRIBUTION ---\n")
print(phase_dist)
cat("\n--- STATUS TRIGGER DISTRIBUTION ---\n")
print(status_dist)
cat("\n--- CLOUD COVERAGE DISTRIBUTION ---\n")
print(cloud_dist)
cat("\n--- OVERALL FARM STATISTICS ---\n")
print(farm_summary$overall_stats)
farm_kpi_results <- farm_summary
} else {
farm_kpi_results <- NULL
}
} else {
farm_kpi_results <- NULL
}
# ========== FINAL SUMMARY ==========
cat("\n", strrep("=", 70), "\n")
cat("80_CALCULATE_KPIs.R - COMPLETION SUMMARY\n")
cat(strrep("=", 70), "\n")
cat("Per-field analysis fields analyzed:", nrow(field_analysis_df), "\n")
cat("Excel export:", export_paths$excel, "\n")
cat("RDS export:", export_paths$rds, "\n")
cat("CSV export:", export_paths$csv, "\n")
if (!is.null(farm_kpi_results)) {
cat("\nFarm-level KPIs: CALCULATED\n")
} else {
cat("\nFarm-level KPIs: SKIPPED (no valid tile data extracted)\n")
}
cat("\n✓ Consolidated KPI calculation complete!\n")
cat(" - Per-field data exported\n")
cat(" - Farm-level KPIs calculated\n")
cat(" - All outputs in:", reports_dir, "\n\n")
}
if (sys.nframe() == 0) {
main()
}