From 4e94a9a78bba1f136a07f2a99b34c9fcabbdcd60 Mon Sep 17 00:00:00 2001 From: Timon Date: Thu, 15 Jan 2026 15:35:16 +0100 Subject: [PATCH] expanding csv table --- r_app/80_calculate_kpis.R | 289 +++++++++++++++++++++++++------------- 1 file changed, 193 insertions(+), 96 deletions(-) diff --git a/r_app/80_calculate_kpis.R b/r_app/80_calculate_kpis.R index 59a9195..816ad83 100644 --- a/r_app/80_calculate_kpis.R +++ b/r_app/80_calculate_kpis.R @@ -10,7 +10,7 @@ # - Per-field analysis with SC-64 enhancements (4-week trends, CI percentiles, etc.) # - Farm-level KPI calculation (6 metrics for executive overview) # - Parallel processing (tile-aware, 1000+ fields supported) -# - Comprehensive Excel + RDS + CSV exports +# - Comprehensive Excel + RDS + CSV exports (21 columns per spec) # - Test mode for development # # COMMAND-LINE USAGE: @@ -26,6 +26,71 @@ # source("r_app/80_calculate_kpis.R") # main() +# ============================================================================ +# EXCEL OUTPUT SPECIFICATION (21 COLUMNS) +# ============================================================================ +# This script exports 21 columns per the field analysis specification: +# +# COMPLETED/IN PROGRESS: +# 1. Field_id ✓ - Unique field identifier +# 2. Farm_Section - Management zone (to be filled by user) +# 3. Field_name ✓ - Client-facing field name (from GeoJSON) +# 4. Acreage ✓ - Field size in acres +# 5. Mean_CI ✓ - Average Chlorophyll Index +# 6. Weekly_ci_change ✓ - Week-over-week CI change +# 7. Four_week_trend - [FUTURE] Trend over 4 weeks (requires historical mosaics) +# 8. Last_harvest_or_planting_date - [DUMMY for now] Will be from harvest Excel + LSTM (script 31) +# 9. Age_week ✓ - Weeks since planting +# 10. Phase (age based) ✓ - Growth phase (Germination, Tillering, Grand Growth, Maturation) +# 11. nmr_weeks_in_this_phase - [TODO] Weeks spent in current phase (track phase transitions) +# 12. Germination_progress - [TODO] % pixels with CI >= threshold (default 2, for age < 4 months) +# 13. Imminent_prob - [DUMMY for now] Harvest probability (will be from script 31 output) +# 14. Status_trigger ✓ - Alerts (harvest_ready, stress, etc.) +# 15. CI_range (min-max) - [TODO] Min and max CI values in field +# 16. CI_Percentiles ✓ - 10th-90th percentile of CI (p10-p90 format) +# 17. CV ✓ - Coefficient of variation (field uniformity) +# 18. CV_Trend_Short_Term - [TODO] 2-week CV trend (current week CV - last week CV) +# 19. CV_Trend_Long_Term - [FUTURE] 8-week CV slope (requires linear regression, historical mosaics) +# 20. Cloud_pct_clear ✓ - % field visible (pixel coverage) +# 21. Cloud_category ✓ - Cloud classification (Clear view / Partial coverage / No image available) +# +# IMPLEMENTATION PLAN (ordered by difficulty): +# ============================================================================ +# PHASE 1 - EASY (Current data only): +# [✓] Remove Mean_CI_prev column +# [✓] Add Field_name column (from field_boundaries_sf$field) +# [✓] Add Farm_Section column (empty, user will fill) +# [✓] Add Last_harvest_or_planting_date (use UNIFORM_PLANTING_DATE as dummy) +# [✓] Add CI_range (min/max from pixel extraction) +# [✓] Add Cloud_pct_clear (% from pixel coverage) +# [✓] Column order: Reorder to match spec (1-21) +# +# PHASE 1 LIMITATION (Known Issue - To Fix in PHASE 2): +# - Fields spanning multiple tiles currently show statistics from first intersecting tile only +# - This results in p10 ≈ p90 (few pixels per tile) instead of field-wide percentiles +# - FIX: After extracting all tiles, group by field_id and aggregate pixel values across all tiles +# before calculating percentiles. This will give true field-wide statistics. +# +# PHASE 2 - MEDIUM (Requires computation): +# [ ] Add nmr_weeks_in_this_phase (track phase transitions with previous week CSV) +# [ ] Add Germination_progress (% pixels CI >= GERMINATION_CI_THRESHOLD, configurable) +# [ ] Add Imminent_prob column (dummy NA, will merge from script 31 harvest_imminent_weekly.csv) +# [ ] Add CV_Trend_Short_Term (requires loading last week's CV values) +# +# PHASE 3 - COMPLEX (Requires historical data): +# [ ] Add Four_week_trend (CI value difference week vs 4 weeks ago, requires loading prev mosaics) +# [ ] Add CV_Trend_Long_Term (8-week slope: linear regression on 8 weeks of CV, suggests lm()) +# [ ] Load previous week's CSV to cross-check phase transitions and trends +# +# NOTES: +# - Script 31 (harvest_imminent_weekly.py) outputs: field, imminent_prob, detected_prob, week, year +# - Will need to LEFT JOIN on (field, week, year) to populate Imminent_prob +# - Phase transition logic: Compare this week's Phase vs last week's Phase from CSV +# - For 8-week CV slope: Linear regression slope = (CV_week8 - CV_week1) / 7 weeks (approximately) +# or use lm(CV ~ week) on 8-week sequence for proper slope calculation +# - Germination_progress only calculated if Age_week < 17 (before end of Tillering phase) +# - Cloud_pct_clear calculated as: (pixel_count / expected_pixels) * 100 + # ============================================================================ # *** CONFIGURATION SECTION - MANUALLY DEFINED THRESHOLDS *** # ============================================================================ @@ -34,6 +99,10 @@ TEST_MODE <- TRUE TEST_MODE_NUM_WEEKS <- 2 +# GERMINATION PROGRESS THRESHOLD +# Percentage of pixels that must reach this CI value to count as "germinated" +GERMINATION_CI_THRESHOLD <- 2.0 # Pixels with CI >= 2 count as germinated + # FOUR-WEEK TREND THRESHOLDS FOUR_WEEK_TREND_STRONG_GROWTH_MIN <- 0.5 FOUR_WEEK_TREND_GROWTH_MIN <- 0.1 @@ -522,54 +591,41 @@ analyze_single_field <- function(field_idx, field_boundaries_sf, tile_grid, week )) } - # Extract CI values: EXACTLY LIKE SCRIPT 20 - # Crop to field bounding box first, then extract with sf directly (not terra::vect conversion) + # SINGLE EXTRACTION: Get all pixel values for this field, then calculate all stats from it field_bbox <- sf::st_bbox(field_sf) ci_cropped <- terra::crop(current_ci, terra::ext(field_bbox), snap = "out") - extracted_vals <- terra::extract(ci_cropped, field_sf, fun = "mean", na.rm = TRUE) - # extracted_vals is a data.frame with ID column (field index) + mean value - mean_ci_current <- as.numeric(extracted_vals[1, 2]) + # Extract all pixels in one call (no fun= parameter means we get raw pixel values) + all_extracted <- terra::extract(ci_cropped, field_sf)[, 2] + current_ci_vals <- all_extracted[!is.na(all_extracted)] - if (is.na(mean_ci_current)) { + if (length(current_ci_vals) == 0) { return(data.frame( Field_id = field_id, error = "No CI values extracted from tiles" )) } - # For per-tile extraction, we only have mean from the aggregation function - # To get variance/CV, we need to extract all pixels without the fun parameter - # But for farm-level purposes, the mean CI is sufficient - all_extracted <- terra::extract(ci_cropped, field_sf)[, 2] - current_ci_vals <- all_extracted[!is.na(all_extracted)] + # Calculate all statistics from the single extraction + mean_ci_current <- mean(current_ci_vals, na.rm = TRUE) + ci_std <- sd(current_ci_vals, na.rm = TRUE) + cv_current <- if (mean_ci_current > 0) ci_std / mean_ci_current else NA_real_ + range_min <- min(current_ci_vals, na.rm = TRUE) + range_max <- max(current_ci_vals, na.rm = TRUE) + range_str <- sprintf("%.1f-%.1f", range_min, range_max) + ci_percentiles_str <- get_ci_percentiles(current_ci_vals) + # Cloud coverage from extraction metadata num_total <- length(all_extracted) - num_data <- sum(!is.na(all_extracted)) + num_data <- length(current_ci_vals) pct_clear <- if (num_total > 0) round((num_data / num_total) * 100, 1) else 0 - cloud_cat <- if (num_data == 0) "No image available" else if (pct_clear >= 99.5) "Clear view" else "Partial coverage" cloud_pct <- 100 - pct_clear cloud_interval <- round_cloud_to_intervals(pct_clear) - if (length(current_ci_vals) == 0) { - return(data.frame( - Field_id = field_id, - error = "No CI values extracted" - )) - } - - mean_ci_current <- mean(current_ci_vals, na.rm = TRUE) - ci_std <- sd(current_ci_vals, na.rm = TRUE) - cv_current <- ci_std / mean_ci_current - range_min <- min(current_ci_vals, na.rm = TRUE) - range_max <- max(current_ci_vals, na.rm = TRUE) - range_str <- sprintf("%.1f-%.1f", range_min, range_max) - - ci_percentiles_str <- get_ci_percentiles(current_ci_vals) - + # Weekly change (extract previous week same way - single extraction) weekly_ci_change <- NA previous_ci_vals <- NULL @@ -578,8 +634,8 @@ analyze_single_field <- function(field_idx, field_boundaries_sf, tile_grid, week if (!is.null(previous_ci)) { prev_bbox <- sf::st_bbox(field_sf) prev_ci_cropped <- terra::crop(previous_ci, terra::ext(prev_bbox), snap = "out") - prev_extracted <- terra::extract(prev_ci_cropped, field_sf)[, 2] - previous_ci_vals <- prev_extracted[!is.na(prev_extracted)] + prev_extracted_all <- terra::extract(prev_ci_cropped, field_sf)[, 2] + previous_ci_vals <- prev_extracted_all[!is.na(prev_extracted_all)] if (length(previous_ci_vals) > 0) { mean_ci_previous <- mean(previous_ci_vals, na.rm = TRUE) weekly_ci_change <- mean_ci_current - mean_ci_previous @@ -743,11 +799,11 @@ generate_field_analysis_summary <- function(field_df) { total_acreage <- sum(field_df$Acreage, na.rm = TRUE) - germination_acreage <- sum(field_df$Acreage[field_df$`Phase (age based)` == "Germination"], na.rm = TRUE) - tillering_acreage <- sum(field_df$Acreage[field_df$`Phase (age based)` == "Tillering"], na.rm = TRUE) - grand_growth_acreage <- sum(field_df$Acreage[field_df$`Phase (age based)` == "Grand Growth"], na.rm = TRUE) - maturation_acreage <- sum(field_df$Acreage[field_df$`Phase (age based)` == "Maturation"], na.rm = TRUE) - unknown_phase_acreage <- sum(field_df$Acreage[field_df$`Phase (age based)` == "Unknown"], na.rm = TRUE) + germination_acreage <- sum(field_df$Acreage[field_df$Phase == "Germination"], na.rm = TRUE) + tillering_acreage <- sum(field_df$Acreage[field_df$Phase == "Tillering"], na.rm = TRUE) + grand_growth_acreage <- sum(field_df$Acreage[field_df$Phase == "Grand Growth"], na.rm = TRUE) + maturation_acreage <- sum(field_df$Acreage[field_df$Phase == "Maturation"], na.rm = TRUE) + unknown_phase_acreage <- sum(field_df$Acreage[field_df$Phase == "Unknown"], na.rm = TRUE) harvest_ready_acreage <- sum(field_df$Acreage[field_df$Status_trigger == "harvest_ready"], na.rm = TRUE) stress_acreage <- sum(field_df$Acreage[field_df$Status_trigger == "stress_detected_whole_field"], na.rm = TRUE) @@ -1070,55 +1126,58 @@ extract_field_statistics_from_ci <- function(ci_band, field_boundaries_sf) { #' #' @param ci_band Single CI band from terra raster #' @param field_boundaries_sf SF object with field geometries - #' @return Data frame with columns: field_idx, mean_ci, cv, p10, p90, pixel_count + #' @return Data frame with columns: field_idx, mean_ci, cv, p10, p90, min_ci, max_ci, pixel_count_valid, pixel_count_total - # Extract all pixels for all fields at once (more efficient than individual calls) - all_pixels <- terra::extract(ci_band, field_boundaries_sf) + # SINGLE EXTRACTION: Get all pixels for all fields at once (no aggregation function) + # Result: data.frame with ID column (field indices) and value column (pixel values) + extract_result <- terra::extract(ci_band, field_boundaries_sf) # Calculate statistics for each field stats_list <- list() for (field_idx in seq_len(nrow(field_boundaries_sf))) { - # Extract pixel values for this field (skip ID column 1) - pixels <- all_pixels[field_idx, -1, drop = TRUE] - pixels <- as.numeric(pixels) - pixels <- pixels[!is.na(pixels)] + # Get all pixels for this field from the single extraction + # extract_result has columns [ID, value] where ID is field index (1-based) + field_pixels <- extract_result[extract_result$ID == field_idx, 2] + pixels <- as.numeric(field_pixels[!is.na(field_pixels)]) # Remove NAs - # Only calculate stats if we have pixels - if (length(pixels) > 0) { - mean_val <- mean(pixels, na.rm = TRUE) - - # Only calculate CV if mean > 0 (avoid division by zero) - if (mean_val > 0) { - cv_val <- sd(pixels, na.rm = TRUE) / mean_val - } else { - cv_val <- NA - } - - p10_val <- quantile(pixels, probs = CI_PERCENTILE_LOW, na.rm = TRUE)[[1]] - p90_val <- quantile(pixels, probs = CI_PERCENTILE_HIGH, na.rm = TRUE)[[1]] - - stats_list[[field_idx]] <- data.frame( - field_idx = field_idx, - mean_ci = mean_val, - cv = cv_val, - p10 = p10_val, - p90 = p90_val, - pixel_count = length(pixels), - stringsAsFactors = FALSE - ) - } else { - # No pixels for this field (doesn't intersect tile) + if (length(pixels) == 0) { + # No data for this field stats_list[[field_idx]] <- data.frame( field_idx = field_idx, mean_ci = NA_real_, cv = NA_real_, p10 = NA_real_, p90 = NA_real_, - pixel_count = 0, + min_ci = NA_real_, + max_ci = NA_real_, + pixel_count_valid = 0, + pixel_count_total = 0, stringsAsFactors = FALSE ) + next } + + # Calculate all statistics from pixels array + mean_val <- mean(pixels, na.rm = TRUE) + cv_val <- if (mean_val > 0) sd(pixels, na.rm = TRUE) / mean_val else NA_real_ + p10_val <- quantile(pixels, probs = CI_PERCENTILE_LOW, na.rm = TRUE)[[1]] + p90_val <- quantile(pixels, probs = CI_PERCENTILE_HIGH, na.rm = TRUE)[[1]] + min_val <- min(pixels, na.rm = TRUE) + max_val <- max(pixels, na.rm = TRUE) + + stats_list[[field_idx]] <- data.frame( + field_idx = field_idx, + mean_ci = mean_val, + cv = cv_val, + p10 = p10_val, + p90 = p90_val, + min_ci = min_val, + max_ci = max_val, + pixel_count_valid = length(pixels), + pixel_count_total = nrow(extract_result[extract_result$ID == field_idx, ]), + stringsAsFactors = FALSE + ) } return(dplyr::bind_rows(stats_list)) @@ -1312,6 +1371,13 @@ main <- function() { message(paste(" [DEBUG] Extracted", nrow(current_stats), "fields, ", num_with_data, "with non-NA data")) if (num_with_data > 0) { message(paste(" [DEBUG] Sample mean CIs:", paste(head(current_stats$mean_ci[!is.na(current_stats$mean_ci)], 3), collapse=", "))) + # Check percentiles + sample_field <- which(!is.na(current_stats$mean_ci))[2] + message(paste(" [DEBUG] Field", sample_field, "- p10:", current_stats$p10[sample_field], + "p90:", current_stats$p90[sample_field], + "min:", current_stats$min_ci[sample_field], + "max:", current_stats$max_ci[sample_field], + "valid_pixels:", current_stats$pixel_count_valid[sample_field])) } } @@ -1338,10 +1404,9 @@ main <- function() { } mean_ci_current <- current_stats$mean_ci[field_idx] - pixel_count <- current_stats$pixel_count[field_idx] # SKIP fields with no data in this tile (they don't intersect this tile) - if (is.na(pixel_count) || pixel_count == 0) { + if (is.na(current_stats$pixel_count_valid[field_idx]) || current_stats$pixel_count_valid[field_idx] == 0) { next } ci_cv_current <- current_stats$cv[field_idx] @@ -1370,9 +1435,9 @@ main <- function() { # Use the percentiles and mean to create a synthetic distribution for status_trigger # For now, use mean CI repeated by pixel count for testing # TODO: Consider extracting pixels directly if needed for more complex triggers - pixel_count <- current_stats$pixel_count[field_idx] - ci_vals_current <- if (pixel_count > 0) { - rep(mean_ci_current, pixel_count) # Simplified: use mean value repeated + pixel_count_valid <- current_stats$pixel_count_valid[field_idx] + ci_vals_current <- if (!is.na(pixel_count_valid) && pixel_count_valid > 0) { + rep(mean_ci_current, pixel_count_valid) # Simplified: use mean value repeated } else { numeric(0) } @@ -1393,36 +1458,68 @@ main <- function() { phase <- get_phase_by_age(age_weeks) status_trigger <- get_status_trigger(ci_vals_current, ci_change, age_weeks) - # Cloud coverage categorization based on CI value - # No data = No image available - # CI 0.01 to 95 = Partial coverage - # CI >= 95 = Clear view - if (is.na(mean_ci_current) || mean_ci_current == 0) { - cloud_category <- "No image available" - # Set all CI metrics to NA since no valid data - ci_change <- NA - ci_cv_current <- NA - ci_percentile_low <- NA - ci_percentile_high <- NA - } else if (mean_ci_current >= 95) { - cloud_category <- "Clear view" - } else { - cloud_category <- "Partial coverage" + # Calculate Cloud_pct_clear: percentage of field with valid data + # Binned to 10% intervals (0, 10, 20, ..., 90, 100) + cloud_pct_clear <- { + valid_count <- current_stats$pixel_count_valid[field_idx] + total_count <- current_stats$pixel_count_total[field_idx] + if (!is.na(valid_count) && !is.na(total_count) && total_count > 0) { + pct <- (valid_count / total_count) * 100 + round(pct / 10) * 10 + } else { + NA_real_ + } } - # Build result row + # Cloud categorization based on pixel coverage (Cloud_pct_clear) + cloud_category <- if (is.na(cloud_pct_clear)) { + "No image available" + } else if (cloud_pct_clear >= 90) { + "Clear view" + } else if (cloud_pct_clear > 0) { + "Partial coverage" + } else { + "No image available" + } + + # Get min/max CI values + ci_min <- current_stats$min_ci[field_idx] + ci_max <- current_stats$max_ci[field_idx] + ci_range <- if (!is.na(ci_min) && !is.na(ci_max)) { + sprintf("%.1f-%.1f", ci_min, ci_max) + } else { + NA_character_ + } + + # Get field_name from field_boundaries_sf + field_name <- field_sf$field + + # Build result row (21 columns per specification, in order) result_row <- data.frame( Field_id = field_id, + Farm_Section = NA_character_, + Field_name = field_name, Acreage = field_area_acres, Mean_CI = mean_ci_current, - Mean_CI_prev = mean_ci_previous, - CI_change = ci_change, - CI_CV = ci_cv_current, - CI_percentile_low = ci_percentile_low, - CI_percentile_high = ci_percentile_high, + Weekly_ci_change = ci_change, + Four_week_trend = NA_character_, + Last_harvest_or_planting_date = UNIFORM_PLANTING_DATE, Age_weeks = age_weeks, Phase = phase, + nmr_weeks_in_this_phase = NA_real_, + Germination_progress = NA_real_, + Imminent_prob = NA_real_, Status_trigger = status_trigger, + CI_range = ci_range, + CI_Percentiles = if (!is.na(ci_percentile_low) && !is.na(ci_percentile_high)) { + sprintf("%.1f-%.1f", ci_percentile_low, ci_percentile_high) + } else { + NA_character_ + }, + CV = ci_cv_current, + CV_Trend_Short_Term = NA_real_, + CV_Trend_Long_Term = NA_real_, + Cloud_pct_clear = cloud_pct_clear, Cloud_category = cloud_category, stringsAsFactors = FALSE )