expanding csv table

This commit is contained in:
Timon 2026-01-15 15:35:16 +01:00
parent fabbf3214d
commit 4e94a9a78b

View file

@ -10,7 +10,7 @@
# - Per-field analysis with SC-64 enhancements (4-week trends, CI percentiles, etc.)
# - Farm-level KPI calculation (6 metrics for executive overview)
# - Parallel processing (tile-aware, 1000+ fields supported)
# - Comprehensive Excel + RDS + CSV exports
# - Comprehensive Excel + RDS + CSV exports (21 columns per spec)
# - Test mode for development
#
# COMMAND-LINE USAGE:
@ -26,6 +26,71 @@
# source("r_app/80_calculate_kpis.R")
# main()
# ============================================================================
# EXCEL OUTPUT SPECIFICATION (21 COLUMNS)
# ============================================================================
# This script exports 21 columns per the field analysis specification:
#
# COMPLETED/IN PROGRESS:
# 1. Field_id ✓ - Unique field identifier
# 2. Farm_Section - Management zone (to be filled by user)
# 3. Field_name ✓ - Client-facing field name (from GeoJSON)
# 4. Acreage ✓ - Field size in acres
# 5. Mean_CI ✓ - Average Chlorophyll Index
# 6. Weekly_ci_change ✓ - Week-over-week CI change
# 7. Four_week_trend - [FUTURE] Trend over 4 weeks (requires historical mosaics)
# 8. Last_harvest_or_planting_date - [DUMMY for now] Will be from harvest Excel + LSTM (script 31)
# 9. Age_week ✓ - Weeks since planting
# 10. Phase (age based) ✓ - Growth phase (Germination, Tillering, Grand Growth, Maturation)
# 11. nmr_weeks_in_this_phase - [TODO] Weeks spent in current phase (track phase transitions)
# 12. Germination_progress - [TODO] % pixels with CI >= threshold (default 2, for age < 4 months)
# 13. Imminent_prob - [DUMMY for now] Harvest probability (will be from script 31 output)
# 14. Status_trigger ✓ - Alerts (harvest_ready, stress, etc.)
# 15. CI_range (min-max) - [TODO] Min and max CI values in field
# 16. CI_Percentiles ✓ - 10th-90th percentile of CI (p10-p90 format)
# 17. CV ✓ - Coefficient of variation (field uniformity)
# 18. CV_Trend_Short_Term - [TODO] 2-week CV trend (current week CV - last week CV)
# 19. CV_Trend_Long_Term - [FUTURE] 8-week CV slope (requires linear regression, historical mosaics)
# 20. Cloud_pct_clear ✓ - % field visible (pixel coverage)
# 21. Cloud_category ✓ - Cloud classification (Clear view / Partial coverage / No image available)
#
# IMPLEMENTATION PLAN (ordered by difficulty):
# ============================================================================
# PHASE 1 - EASY (Current data only):
# [✓] Remove Mean_CI_prev column
# [✓] Add Field_name column (from field_boundaries_sf$field)
# [✓] Add Farm_Section column (empty, user will fill)
# [✓] Add Last_harvest_or_planting_date (use UNIFORM_PLANTING_DATE as dummy)
# [✓] Add CI_range (min/max from pixel extraction)
# [✓] Add Cloud_pct_clear (% from pixel coverage)
# [✓] Column order: Reorder to match spec (1-21)
#
# PHASE 1 LIMITATION (Known Issue - To Fix in PHASE 2):
# - Fields spanning multiple tiles currently show statistics from first intersecting tile only
# - This results in p10 ≈ p90 (few pixels per tile) instead of field-wide percentiles
# - FIX: After extracting all tiles, group by field_id and aggregate pixel values across all tiles
# before calculating percentiles. This will give true field-wide statistics.
#
# PHASE 2 - MEDIUM (Requires computation):
# [ ] Add nmr_weeks_in_this_phase (track phase transitions with previous week CSV)
# [ ] Add Germination_progress (% pixels CI >= GERMINATION_CI_THRESHOLD, configurable)
# [ ] Add Imminent_prob column (dummy NA, will merge from script 31 harvest_imminent_weekly.csv)
# [ ] Add CV_Trend_Short_Term (requires loading last week's CV values)
#
# PHASE 3 - COMPLEX (Requires historical data):
# [ ] Add Four_week_trend (CI value difference week vs 4 weeks ago, requires loading prev mosaics)
# [ ] Add CV_Trend_Long_Term (8-week slope: linear regression on 8 weeks of CV, suggests lm())
# [ ] Load previous week's CSV to cross-check phase transitions and trends
#
# NOTES:
# - Script 31 (harvest_imminent_weekly.py) outputs: field, imminent_prob, detected_prob, week, year
# - Will need to LEFT JOIN on (field, week, year) to populate Imminent_prob
# - Phase transition logic: Compare this week's Phase vs last week's Phase from CSV
# - For 8-week CV slope: Linear regression slope = (CV_week8 - CV_week1) / 7 weeks (approximately)
# or use lm(CV ~ week) on 8-week sequence for proper slope calculation
# - Germination_progress only calculated if Age_week < 17 (before end of Tillering phase)
# - Cloud_pct_clear calculated as: (pixel_count / expected_pixels) * 100
# ============================================================================
# *** CONFIGURATION SECTION - MANUALLY DEFINED THRESHOLDS ***
# ============================================================================
@ -34,6 +99,10 @@
TEST_MODE <- TRUE
TEST_MODE_NUM_WEEKS <- 2
# GERMINATION PROGRESS THRESHOLD
# Percentage of pixels that must reach this CI value to count as "germinated"
GERMINATION_CI_THRESHOLD <- 2.0 # Pixels with CI >= 2 count as germinated
# FOUR-WEEK TREND THRESHOLDS
FOUR_WEEK_TREND_STRONG_GROWTH_MIN <- 0.5
FOUR_WEEK_TREND_GROWTH_MIN <- 0.1
@ -522,54 +591,41 @@ analyze_single_field <- function(field_idx, field_boundaries_sf, tile_grid, week
))
}
# Extract CI values: EXACTLY LIKE SCRIPT 20
# Crop to field bounding box first, then extract with sf directly (not terra::vect conversion)
# SINGLE EXTRACTION: Get all pixel values for this field, then calculate all stats from it
field_bbox <- sf::st_bbox(field_sf)
ci_cropped <- terra::crop(current_ci, terra::ext(field_bbox), snap = "out")
extracted_vals <- terra::extract(ci_cropped, field_sf, fun = "mean", na.rm = TRUE)
# extracted_vals is a data.frame with ID column (field index) + mean value
mean_ci_current <- as.numeric(extracted_vals[1, 2])
# Extract all pixels in one call (no fun= parameter means we get raw pixel values)
all_extracted <- terra::extract(ci_cropped, field_sf)[, 2]
current_ci_vals <- all_extracted[!is.na(all_extracted)]
if (is.na(mean_ci_current)) {
if (length(current_ci_vals) == 0) {
return(data.frame(
Field_id = field_id,
error = "No CI values extracted from tiles"
))
}
# For per-tile extraction, we only have mean from the aggregation function
# To get variance/CV, we need to extract all pixels without the fun parameter
# But for farm-level purposes, the mean CI is sufficient
all_extracted <- terra::extract(ci_cropped, field_sf)[, 2]
current_ci_vals <- all_extracted[!is.na(all_extracted)]
# Calculate all statistics from the single extraction
mean_ci_current <- mean(current_ci_vals, na.rm = TRUE)
ci_std <- sd(current_ci_vals, na.rm = TRUE)
cv_current <- if (mean_ci_current > 0) ci_std / mean_ci_current else NA_real_
range_min <- min(current_ci_vals, na.rm = TRUE)
range_max <- max(current_ci_vals, na.rm = TRUE)
range_str <- sprintf("%.1f-%.1f", range_min, range_max)
ci_percentiles_str <- get_ci_percentiles(current_ci_vals)
# Cloud coverage from extraction metadata
num_total <- length(all_extracted)
num_data <- sum(!is.na(all_extracted))
num_data <- length(current_ci_vals)
pct_clear <- if (num_total > 0) round((num_data / num_total) * 100, 1) else 0
cloud_cat <- if (num_data == 0) "No image available"
else if (pct_clear >= 99.5) "Clear view"
else "Partial coverage"
cloud_pct <- 100 - pct_clear
cloud_interval <- round_cloud_to_intervals(pct_clear)
if (length(current_ci_vals) == 0) {
return(data.frame(
Field_id = field_id,
error = "No CI values extracted"
))
}
mean_ci_current <- mean(current_ci_vals, na.rm = TRUE)
ci_std <- sd(current_ci_vals, na.rm = TRUE)
cv_current <- ci_std / mean_ci_current
range_min <- min(current_ci_vals, na.rm = TRUE)
range_max <- max(current_ci_vals, na.rm = TRUE)
range_str <- sprintf("%.1f-%.1f", range_min, range_max)
ci_percentiles_str <- get_ci_percentiles(current_ci_vals)
# Weekly change (extract previous week same way - single extraction)
weekly_ci_change <- NA
previous_ci_vals <- NULL
@ -578,8 +634,8 @@ analyze_single_field <- function(field_idx, field_boundaries_sf, tile_grid, week
if (!is.null(previous_ci)) {
prev_bbox <- sf::st_bbox(field_sf)
prev_ci_cropped <- terra::crop(previous_ci, terra::ext(prev_bbox), snap = "out")
prev_extracted <- terra::extract(prev_ci_cropped, field_sf)[, 2]
previous_ci_vals <- prev_extracted[!is.na(prev_extracted)]
prev_extracted_all <- terra::extract(prev_ci_cropped, field_sf)[, 2]
previous_ci_vals <- prev_extracted_all[!is.na(prev_extracted_all)]
if (length(previous_ci_vals) > 0) {
mean_ci_previous <- mean(previous_ci_vals, na.rm = TRUE)
weekly_ci_change <- mean_ci_current - mean_ci_previous
@ -743,11 +799,11 @@ generate_field_analysis_summary <- function(field_df) {
total_acreage <- sum(field_df$Acreage, na.rm = TRUE)
germination_acreage <- sum(field_df$Acreage[field_df$`Phase (age based)` == "Germination"], na.rm = TRUE)
tillering_acreage <- sum(field_df$Acreage[field_df$`Phase (age based)` == "Tillering"], na.rm = TRUE)
grand_growth_acreage <- sum(field_df$Acreage[field_df$`Phase (age based)` == "Grand Growth"], na.rm = TRUE)
maturation_acreage <- sum(field_df$Acreage[field_df$`Phase (age based)` == "Maturation"], na.rm = TRUE)
unknown_phase_acreage <- sum(field_df$Acreage[field_df$`Phase (age based)` == "Unknown"], na.rm = TRUE)
germination_acreage <- sum(field_df$Acreage[field_df$Phase == "Germination"], na.rm = TRUE)
tillering_acreage <- sum(field_df$Acreage[field_df$Phase == "Tillering"], na.rm = TRUE)
grand_growth_acreage <- sum(field_df$Acreage[field_df$Phase == "Grand Growth"], na.rm = TRUE)
maturation_acreage <- sum(field_df$Acreage[field_df$Phase == "Maturation"], na.rm = TRUE)
unknown_phase_acreage <- sum(field_df$Acreage[field_df$Phase == "Unknown"], na.rm = TRUE)
harvest_ready_acreage <- sum(field_df$Acreage[field_df$Status_trigger == "harvest_ready"], na.rm = TRUE)
stress_acreage <- sum(field_df$Acreage[field_df$Status_trigger == "stress_detected_whole_field"], na.rm = TRUE)
@ -1070,55 +1126,58 @@ extract_field_statistics_from_ci <- function(ci_band, field_boundaries_sf) {
#'
#' @param ci_band Single CI band from terra raster
#' @param field_boundaries_sf SF object with field geometries
#' @return Data frame with columns: field_idx, mean_ci, cv, p10, p90, pixel_count
#' @return Data frame with columns: field_idx, mean_ci, cv, p10, p90, min_ci, max_ci, pixel_count_valid, pixel_count_total
# Extract all pixels for all fields at once (more efficient than individual calls)
all_pixels <- terra::extract(ci_band, field_boundaries_sf)
# SINGLE EXTRACTION: Get all pixels for all fields at once (no aggregation function)
# Result: data.frame with ID column (field indices) and value column (pixel values)
extract_result <- terra::extract(ci_band, field_boundaries_sf)
# Calculate statistics for each field
stats_list <- list()
for (field_idx in seq_len(nrow(field_boundaries_sf))) {
# Extract pixel values for this field (skip ID column 1)
pixels <- all_pixels[field_idx, -1, drop = TRUE]
pixels <- as.numeric(pixels)
pixels <- pixels[!is.na(pixels)]
# Get all pixels for this field from the single extraction
# extract_result has columns [ID, value] where ID is field index (1-based)
field_pixels <- extract_result[extract_result$ID == field_idx, 2]
pixels <- as.numeric(field_pixels[!is.na(field_pixels)]) # Remove NAs
# Only calculate stats if we have pixels
if (length(pixels) > 0) {
mean_val <- mean(pixels, na.rm = TRUE)
# Only calculate CV if mean > 0 (avoid division by zero)
if (mean_val > 0) {
cv_val <- sd(pixels, na.rm = TRUE) / mean_val
} else {
cv_val <- NA
}
p10_val <- quantile(pixels, probs = CI_PERCENTILE_LOW, na.rm = TRUE)[[1]]
p90_val <- quantile(pixels, probs = CI_PERCENTILE_HIGH, na.rm = TRUE)[[1]]
stats_list[[field_idx]] <- data.frame(
field_idx = field_idx,
mean_ci = mean_val,
cv = cv_val,
p10 = p10_val,
p90 = p90_val,
pixel_count = length(pixels),
stringsAsFactors = FALSE
)
} else {
# No pixels for this field (doesn't intersect tile)
if (length(pixels) == 0) {
# No data for this field
stats_list[[field_idx]] <- data.frame(
field_idx = field_idx,
mean_ci = NA_real_,
cv = NA_real_,
p10 = NA_real_,
p90 = NA_real_,
pixel_count = 0,
min_ci = NA_real_,
max_ci = NA_real_,
pixel_count_valid = 0,
pixel_count_total = 0,
stringsAsFactors = FALSE
)
next
}
# Calculate all statistics from pixels array
mean_val <- mean(pixels, na.rm = TRUE)
cv_val <- if (mean_val > 0) sd(pixels, na.rm = TRUE) / mean_val else NA_real_
p10_val <- quantile(pixels, probs = CI_PERCENTILE_LOW, na.rm = TRUE)[[1]]
p90_val <- quantile(pixels, probs = CI_PERCENTILE_HIGH, na.rm = TRUE)[[1]]
min_val <- min(pixels, na.rm = TRUE)
max_val <- max(pixels, na.rm = TRUE)
stats_list[[field_idx]] <- data.frame(
field_idx = field_idx,
mean_ci = mean_val,
cv = cv_val,
p10 = p10_val,
p90 = p90_val,
min_ci = min_val,
max_ci = max_val,
pixel_count_valid = length(pixels),
pixel_count_total = nrow(extract_result[extract_result$ID == field_idx, ]),
stringsAsFactors = FALSE
)
}
return(dplyr::bind_rows(stats_list))
@ -1312,6 +1371,13 @@ main <- function() {
message(paste(" [DEBUG] Extracted", nrow(current_stats), "fields, ", num_with_data, "with non-NA data"))
if (num_with_data > 0) {
message(paste(" [DEBUG] Sample mean CIs:", paste(head(current_stats$mean_ci[!is.na(current_stats$mean_ci)], 3), collapse=", ")))
# Check percentiles
sample_field <- which(!is.na(current_stats$mean_ci))[2]
message(paste(" [DEBUG] Field", sample_field, "- p10:", current_stats$p10[sample_field],
"p90:", current_stats$p90[sample_field],
"min:", current_stats$min_ci[sample_field],
"max:", current_stats$max_ci[sample_field],
"valid_pixels:", current_stats$pixel_count_valid[sample_field]))
}
}
@ -1338,10 +1404,9 @@ main <- function() {
}
mean_ci_current <- current_stats$mean_ci[field_idx]
pixel_count <- current_stats$pixel_count[field_idx]
# SKIP fields with no data in this tile (they don't intersect this tile)
if (is.na(pixel_count) || pixel_count == 0) {
if (is.na(current_stats$pixel_count_valid[field_idx]) || current_stats$pixel_count_valid[field_idx] == 0) {
next
}
ci_cv_current <- current_stats$cv[field_idx]
@ -1370,9 +1435,9 @@ main <- function() {
# Use the percentiles and mean to create a synthetic distribution for status_trigger
# For now, use mean CI repeated by pixel count for testing
# TODO: Consider extracting pixels directly if needed for more complex triggers
pixel_count <- current_stats$pixel_count[field_idx]
ci_vals_current <- if (pixel_count > 0) {
rep(mean_ci_current, pixel_count) # Simplified: use mean value repeated
pixel_count_valid <- current_stats$pixel_count_valid[field_idx]
ci_vals_current <- if (!is.na(pixel_count_valid) && pixel_count_valid > 0) {
rep(mean_ci_current, pixel_count_valid) # Simplified: use mean value repeated
} else {
numeric(0)
}
@ -1393,36 +1458,68 @@ main <- function() {
phase <- get_phase_by_age(age_weeks)
status_trigger <- get_status_trigger(ci_vals_current, ci_change, age_weeks)
# Cloud coverage categorization based on CI value
# No data = No image available
# CI 0.01 to 95 = Partial coverage
# CI >= 95 = Clear view
if (is.na(mean_ci_current) || mean_ci_current == 0) {
cloud_category <- "No image available"
# Set all CI metrics to NA since no valid data
ci_change <- NA
ci_cv_current <- NA
ci_percentile_low <- NA
ci_percentile_high <- NA
} else if (mean_ci_current >= 95) {
cloud_category <- "Clear view"
} else {
cloud_category <- "Partial coverage"
# Calculate Cloud_pct_clear: percentage of field with valid data
# Binned to 10% intervals (0, 10, 20, ..., 90, 100)
cloud_pct_clear <- {
valid_count <- current_stats$pixel_count_valid[field_idx]
total_count <- current_stats$pixel_count_total[field_idx]
if (!is.na(valid_count) && !is.na(total_count) && total_count > 0) {
pct <- (valid_count / total_count) * 100
round(pct / 10) * 10
} else {
NA_real_
}
}
# Build result row
# Cloud categorization based on pixel coverage (Cloud_pct_clear)
cloud_category <- if (is.na(cloud_pct_clear)) {
"No image available"
} else if (cloud_pct_clear >= 90) {
"Clear view"
} else if (cloud_pct_clear > 0) {
"Partial coverage"
} else {
"No image available"
}
# Get min/max CI values
ci_min <- current_stats$min_ci[field_idx]
ci_max <- current_stats$max_ci[field_idx]
ci_range <- if (!is.na(ci_min) && !is.na(ci_max)) {
sprintf("%.1f-%.1f", ci_min, ci_max)
} else {
NA_character_
}
# Get field_name from field_boundaries_sf
field_name <- field_sf$field
# Build result row (21 columns per specification, in order)
result_row <- data.frame(
Field_id = field_id,
Farm_Section = NA_character_,
Field_name = field_name,
Acreage = field_area_acres,
Mean_CI = mean_ci_current,
Mean_CI_prev = mean_ci_previous,
CI_change = ci_change,
CI_CV = ci_cv_current,
CI_percentile_low = ci_percentile_low,
CI_percentile_high = ci_percentile_high,
Weekly_ci_change = ci_change,
Four_week_trend = NA_character_,
Last_harvest_or_planting_date = UNIFORM_PLANTING_DATE,
Age_weeks = age_weeks,
Phase = phase,
nmr_weeks_in_this_phase = NA_real_,
Germination_progress = NA_real_,
Imminent_prob = NA_real_,
Status_trigger = status_trigger,
CI_range = ci_range,
CI_Percentiles = if (!is.na(ci_percentile_low) && !is.na(ci_percentile_high)) {
sprintf("%.1f-%.1f", ci_percentile_low, ci_percentile_high)
} else {
NA_character_
},
CV = ci_cv_current,
CV_Trend_Short_Term = NA_real_,
CV_Trend_Long_Term = NA_real_,
Cloud_pct_clear = cloud_pct_clear,
Cloud_category = cloud_category,
stringsAsFactors = FALSE
)