added some more stats and functions calculate and export csv and rds file for future ref

This commit is contained in:
Timon 2026-01-16 08:04:45 +01:00
parent 4e94a9a78b
commit 6e88acef25

View file

@ -65,11 +65,6 @@
# [✓] Add Cloud_pct_clear (% from pixel coverage)
# [✓] Column order: Reorder to match spec (1-21)
#
# PHASE 1 LIMITATION (Known Issue - To Fix in PHASE 2):
# - Fields spanning multiple tiles currently show statistics from first intersecting tile only
# - This results in p10 ≈ p90 (few pixels per tile) instead of field-wide percentiles
# - FIX: After extracting all tiles, group by field_id and aggregate pixel values across all tiles
# before calculating percentiles. This will give true field-wide statistics.
#
# PHASE 2 - MEDIUM (Requires computation):
# [ ] Add nmr_weeks_in_this_phase (track phase transitions with previous week CSV)
@ -539,6 +534,245 @@ extract_planting_dates <- function(harvesting_data, field_boundaries_sf = NULL)
})
}
# ============================================================================
# MODULAR STATISTICS CALCULATION (Reusable for any week)
# ============================================================================
calculate_field_statistics <- function(field_boundaries_sf, week_num, year,
mosaic_dir, report_date = Sys.Date()) {
message(paste("Calculating statistics for all fields - Week", week_num, year))
# Build tile file list
tile_pattern <- sprintf("week_%02d_%d_([0-9]{2})\\.tif", week_num, year)
tile_files <- list.files(mosaic_dir, pattern = tile_pattern, full.names = TRUE)
if (length(tile_files) == 0) {
stop(paste("No tile files found for week", week_num, year, "in", mosaic_dir))
}
message(paste(" Found", length(tile_files), "tiles for week", week_num))
results_list <- list()
fields_processed <- 0
# SCRIPT 20 APPROACH: Loop through tiles, extract all fields from each tile
for (tile_idx in seq_along(tile_files)) {
tile_file <- tile_files[tile_idx]
tryCatch({
# Load tile
current_rast <- terra::rast(tile_file)
ci_band <- current_rast[["CI"]]
if (is.null(ci_band) || !inherits(ci_band, "SpatRaster")) {
message(paste(" [SKIP] Tile", basename(tile_file), "- CI band not found"))
return(NULL)
}
# Extract all fields from this tile in ONE call
# terra::extract returns a dataframe with columns: ID, CI
# where each row is one pixel, and ID indicates which polygon it came from
extracted <- terra::extract(ci_band, field_boundaries_sf, na.rm = FALSE)
# Group by field ID and calculate statistics for each field
# extracted$ID contains the field polygon index (1 to nrow(field_boundaries_sf))
unique_field_ids <- unique(extracted$ID[!is.na(extracted$ID)])
for (field_poly_idx in unique_field_ids) {
# Get all CI values for this field from this tile
field_id <- field_boundaries_sf$field[field_poly_idx]
ci_vals <- extracted$CI[extracted$ID == field_poly_idx]
ci_vals <- ci_vals[!is.na(ci_vals)]
# Skip if no data for this field in this tile
if (length(ci_vals) == 0) {
next
}
# Calculate statistics
mean_ci <- mean(ci_vals, na.rm = TRUE)
ci_std <- sd(ci_vals, na.rm = TRUE)
cv <- if (mean_ci > 0) ci_std / mean_ci else NA_real_
range_min <- min(ci_vals, na.rm = TRUE)
range_max <- max(ci_vals, na.rm = TRUE)
range_str <- sprintf("%.1f-%.1f", range_min, range_max)
ci_percentiles_str <- get_ci_percentiles(ci_vals)
# Cloud coverage: count total pixels vs non-NA pixels for this field
field_rows <- extracted[extracted$ID == field_poly_idx, ]
num_total <- nrow(field_rows)
num_data <- sum(!is.na(field_rows$CI))
pct_clear <- if (num_total > 0) round((num_data / num_total) * 100, 1) else 0
cloud_cat <- if (num_data == 0) "No image available"
else if (pct_clear >= 99.5) "Clear view"
else "Partial coverage"
# Age and Phase
age_weeks <- if (USE_UNIFORM_AGE) {
as.numeric(difftime(report_date, UNIFORM_PLANTING_DATE, units = "weeks"))
} else {
NA_real_
}
phase <- get_phase_by_age(age_weeks)
# Germination progress (only for young plants, age < 17 weeks)
germination_progress <- NA_character_
if (!is.na(age_weeks) && age_weeks >= 0 && age_weeks < 17) {
pct_ci_ge_threshold <- sum(ci_vals >= GERMINATION_CI_THRESHOLD) / length(ci_vals) * 100
germination_progress <- sprintf("%.1f%%", pct_ci_ge_threshold)
}
# Store result (check if field already exists from another tile)
existing_idx <- which(sapply(results_list, function(x) x$Field_id) == field_id)
if (length(existing_idx) > 0) {
# Field already in results from previous tile - keep first occurrence or average
# For now, keep the first one (earlier tiles)
next
}
# Store new field result
results_list[[length(results_list) + 1]] <- data.frame(
Field_id = field_id,
Mean_CI = round(mean_ci, 2),
CV = round(cv, 4),
CI_range = range_str,
CI_Percentiles = ci_percentiles_str,
Cloud_pct_clear = pct_clear,
Cloud_category = cloud_cat,
Age_week = round(age_weeks, 1),
Phase = phase,
Germination_progress = germination_progress,
stringsAsFactors = FALSE
)
fields_processed <- fields_processed + 1
}
message(paste(" Tile", tile_idx, "of", length(tile_files), "processed"))
}, error = function(e) {
message(paste(" [ERROR] Tile", basename(tile_file), ":", e$message))
})
}
if (length(results_list) == 0) {
stop(paste("No fields processed successfully for week", week_num))
}
stats_df <- dplyr::bind_rows(results_list)
message(paste(" ✓ Successfully calculated statistics for", nrow(stats_df), "fields"))
return(stats_df)
}
# ============================================================================
# CALCULATE KPI TRENDS (Requires previous week RDS)
# ============================================================================
calculate_kpi_trends <- function(current_stats, prev_stats = NULL) {
message("Calculating KPI trends from current and previous week data")
# Initialize new columns with defaults
current_stats$Weekly_ci_change <- NA_real_
current_stats$CV_Trend_Short_Term <- NA_real_
current_stats$nmr_weeks_in_this_phase <- 1L
# If no previous week data, return with defaults
if (is.null(prev_stats) || nrow(prev_stats) == 0) {
message(" No previous week data available - using defaults")
return(current_stats)
}
# Build lookup indices for previous week (by Field_id)
prev_lookup <- setNames(seq_len(nrow(prev_stats)), prev_stats$Field_id)
# For each field in current week, lookup previous values
for (i in seq_len(nrow(current_stats))) {
field_id <- current_stats$Field_id[i]
prev_idx <- prev_lookup[field_id]
if (!is.na(prev_idx) && prev_idx > 0 && prev_idx <= nrow(prev_stats)) {
# Field exists in previous week
prev_row <- prev_stats[prev_idx, ]
# Weekly CI change (current Mean_CI - previous Mean_CI)
if (!is.na(prev_row$Mean_CI) && !is.na(current_stats$Mean_CI[i])) {
current_stats$Weekly_ci_change[i] <-
round(current_stats$Mean_CI[i] - prev_row$Mean_CI, 2)
}
# CV short-term trend (current CV - previous CV)
if (!is.na(prev_row$CV) && !is.na(current_stats$CV[i])) {
current_stats$CV_Trend_Short_Term[i] <-
round(current_stats$CV[i] - prev_row$CV, 4)
}
# Weeks in current phase (track phase transitions)
if (!is.na(current_stats$Phase[i]) && !is.na(prev_row$Phase)) {
if (current_stats$Phase[i] == prev_row$Phase) {
# Same phase - increment counter
prev_weeks <- if (!is.na(prev_row$nmr_weeks_in_this_phase)) {
prev_row$nmr_weeks_in_this_phase
} else {
1
}
current_stats$nmr_weeks_in_this_phase[i] <- prev_weeks + 1L
} else {
# Phase changed - reset to 1
current_stats$nmr_weeks_in_this_phase[i] <- 1L
}
}
}
}
message(" Calculated trends for all fields")
return(current_stats)
}
# ============================================================================
# LOAD OR CALCULATE WEEKLY STATISTICS (RDS Caching)
# ============================================================================
load_or_calculate_weekly_stats <- function(week_num, year, project_dir, field_boundaries_sf,
mosaic_dir, reports_dir, report_date = Sys.Date()) {
# Build RDS file path
rds_filename <- sprintf("%s_field_stats_week%02d.rds", project_dir, week_num)
rds_path <- file.path(reports_dir, "kpis", "field_stats", rds_filename)
# Try to load existing RDS (fast cache)
if (file.exists(rds_path)) {
message(paste("Loading cached statistics from:", basename(rds_path)))
return(readRDS(rds_path))
}
# RDS not found - calculate from tiles
message(paste("Cached RDS not found, calculating statistics from tiles for week", week_num))
stats_df <- calculate_field_statistics(field_boundaries_sf, week_num, year,
mosaic_dir, report_date)
# Create output directory if needed
output_dir <- file.path(reports_dir, "kpis", "field_stats")
if (!dir.exists(output_dir)) {
dir.create(output_dir, recursive = TRUE, showWarnings = FALSE)
}
# Export RDS (for fast lookup next week)
saveRDS(stats_df, rds_path)
message(paste("Saved weekly statistics RDS:", basename(rds_path)))
# Export CSV (for user review)
csv_filename <- sprintf("%s_field_stats_week%02d.csv", project_dir, week_num)
csv_path <- file.path(output_dir, csv_filename)
readr::write_csv(stats_df, csv_path)
message(paste("Saved weekly statistics CSV:", basename(csv_path)))
return(stats_df)
}
# ============================================================================
# PARALLEL FIELD ANALYSIS FUNCTION
# ============================================================================
@ -1316,248 +1550,116 @@ main <- function() {
}
# SCRIPT 20 APPROACH: Loop through tiles, extract all fields from each tile
message("\nProcessing tiles and extracting field statistics...")
all_tile_results <- list()
# ============================================================================
# NEW MODULAR APPROACH: Load/Calculate weekly stats, apply trends
# ============================================================================
for (i in seq_along(tile_files)) {
tile_file <- tile_files[i]
message(paste(" Processing tile", i, "of", length(tile_files), ":", basename(tile_file)))
# Build tile grid (needed by calculate_field_statistics)
message("\nBuilding tile grid for current week...")
tile_grid <- build_tile_grid(mosaic_dir, current_week, year)
tryCatch({
# Load current tile and previous week tile
current_rast <- terra::rast(tile_file)
message("\nUsing modular RDS-based approach for weekly statistics...")
# DEBUG: Check tile structure on first tile
if (i == 1) {
message(paste(" [DEBUG] Tile CRS:", terra::crs(current_rast)))
message(paste(" [DEBUG] Tile extent:", paste(terra::ext(current_rast))))
message(paste(" [DEBUG] Field boundaries CRS:", sf::st_crs(field_boundaries_sf)))
field_bbox <- sf::st_bbox(field_boundaries_sf)
message(paste(" [DEBUG] Field bbox:", paste(round(field_bbox, 2))))
message(paste(" [DEBUG] Band names:", paste(names(current_rast), collapse=", ")))
}
# Extract CI band by name
ci_band <- current_rast[["CI"]]
# Check if CI band exists - use proper logical checks
if (is.null(ci_band) || !inherits(ci_band, "SpatRaster")) {
message(paste(" ERROR: CI band not found. Available bands:", paste(names(current_rast), collapse=", ")))
next
}
# Check if CI band has any valid data
if (tryCatch(all(is.na(values(ci_band))), error = function(e) TRUE)) {
message(paste(" ERROR: CI band has no valid data"))
next
}
# Load previous week tile if available
previous_tile_file <- sub(sprintf("week_%02d", current_week),
sprintf("week_%02d", previous_week),
tile_file)
previous_ci <- NULL
if (file.exists(previous_tile_file)) {
previous_rast <- terra::rast(previous_tile_file)
previous_ci <- previous_rast[["CI"]]
}
# OPTION 1 + 2: Extract all CI statistics from one pixel extraction (single call)
current_stats <- extract_field_statistics_from_ci(ci_band, field_boundaries_sf)
# DEBUG: Check extraction result on first tile
if (i == 1) {
num_with_data <- sum(!is.na(current_stats$mean_ci))
message(paste(" [DEBUG] Extracted", nrow(current_stats), "fields, ", num_with_data, "with non-NA data"))
if (num_with_data > 0) {
message(paste(" [DEBUG] Sample mean CIs:", paste(head(current_stats$mean_ci[!is.na(current_stats$mean_ci)], 3), collapse=", ")))
# Check percentiles
sample_field <- which(!is.na(current_stats$mean_ci))[2]
message(paste(" [DEBUG] Field", sample_field, "- p10:", current_stats$p10[sample_field],
"p90:", current_stats$p90[sample_field],
"min:", current_stats$min_ci[sample_field],
"max:", current_stats$max_ci[sample_field],
"valid_pixels:", current_stats$pixel_count_valid[sample_field]))
}
}
# Extract previous week CI statistics if available
previous_stats <- NULL
if (!is.null(previous_ci)) {
previous_stats <- extract_field_statistics_from_ci(previous_ci, field_boundaries_sf)
}
# Process each field that was extracted
field_results_this_tile <- list()
fields_added <- 0
for (field_idx in seq_len(nrow(field_boundaries_sf))) {
tryCatch({
field_id <- field_boundaries_sf$field[field_idx]
field_sf <- field_boundaries_sf[field_idx, ]
# Get statistics from helper function results
# current_stats should have same number of rows as field_boundaries_sf
if (field_idx > nrow(current_stats)) {
message(paste(" [ERROR] field_idx", field_idx, "> nrow(current_stats)", nrow(current_stats)))
next
}
mean_ci_current <- current_stats$mean_ci[field_idx]
# SKIP fields with no data in this tile (they don't intersect this tile)
if (is.na(current_stats$pixel_count_valid[field_idx]) || current_stats$pixel_count_valid[field_idx] == 0) {
next
}
ci_cv_current <- current_stats$cv[field_idx]
ci_percentile_low <- current_stats$p10[field_idx]
ci_percentile_high <- current_stats$p90[field_idx]
# If field doesn't intersect this tile, mean_ci_current will be NA
if (is.na(mean_ci_current)) {
next # Skip this field - doesn't intersect this tile
}
field_area_ha <- as.numeric(sf::st_area(field_sf)) / 10000
field_area_acres <- field_area_ha / 0.404686
# Extract previous week CI if available
mean_ci_previous <- NA
ci_change <- NA
if (!is.null(previous_stats)) {
mean_ci_previous <- previous_stats$mean_ci[field_idx]
if (!is.na(mean_ci_previous)) {
ci_change <- mean_ci_current - mean_ci_previous
}
}
# Reconstruct pixel values for status trigger (we need the actual pixel array)
# Use the percentiles and mean to create a synthetic distribution for status_trigger
# For now, use mean CI repeated by pixel count for testing
# TODO: Consider extracting pixels directly if needed for more complex triggers
pixel_count_valid <- current_stats$pixel_count_valid[field_idx]
ci_vals_current <- if (!is.na(pixel_count_valid) && pixel_count_valid > 0) {
rep(mean_ci_current, pixel_count_valid) # Simplified: use mean value repeated
} else {
numeric(0)
}
# Calculate age
age_weeks <- if (!is.null(planting_dates) && nrow(planting_dates) > 0 && field_idx <= nrow(planting_dates)) {
planting_date <- planting_dates$date[field_idx]
if (!is.na(planting_date)) {
as.numeric(difftime(end_date, planting_date, units = "weeks"))
} else {
0
}
} else {
0
}
# Get phase and status
phase <- get_phase_by_age(age_weeks)
status_trigger <- get_status_trigger(ci_vals_current, ci_change, age_weeks)
# Calculate Cloud_pct_clear: percentage of field with valid data
# Binned to 10% intervals (0, 10, 20, ..., 90, 100)
cloud_pct_clear <- {
valid_count <- current_stats$pixel_count_valid[field_idx]
total_count <- current_stats$pixel_count_total[field_idx]
if (!is.na(valid_count) && !is.na(total_count) && total_count > 0) {
pct <- (valid_count / total_count) * 100
round(pct / 10) * 10
} else {
NA_real_
}
}
# Cloud categorization based on pixel coverage (Cloud_pct_clear)
cloud_category <- if (is.na(cloud_pct_clear)) {
"No image available"
} else if (cloud_pct_clear >= 90) {
"Clear view"
} else if (cloud_pct_clear > 0) {
"Partial coverage"
} else {
"No image available"
}
# Get min/max CI values
ci_min <- current_stats$min_ci[field_idx]
ci_max <- current_stats$max_ci[field_idx]
ci_range <- if (!is.na(ci_min) && !is.na(ci_max)) {
sprintf("%.1f-%.1f", ci_min, ci_max)
} else {
NA_character_
}
# Get field_name from field_boundaries_sf
field_name <- field_sf$field
# Build result row (21 columns per specification, in order)
result_row <- data.frame(
Field_id = field_id,
Farm_Section = NA_character_,
Field_name = field_name,
Acreage = field_area_acres,
Mean_CI = mean_ci_current,
Weekly_ci_change = ci_change,
Four_week_trend = NA_character_,
Last_harvest_or_planting_date = UNIFORM_PLANTING_DATE,
Age_weeks = age_weeks,
Phase = phase,
nmr_weeks_in_this_phase = NA_real_,
Germination_progress = NA_real_,
Imminent_prob = NA_real_,
Status_trigger = status_trigger,
CI_range = ci_range,
CI_Percentiles = if (!is.na(ci_percentile_low) && !is.na(ci_percentile_high)) {
sprintf("%.1f-%.1f", ci_percentile_low, ci_percentile_high)
} else {
NA_character_
},
CV = ci_cv_current,
CV_Trend_Short_Term = NA_real_,
CV_Trend_Long_Term = NA_real_,
Cloud_pct_clear = cloud_pct_clear,
Cloud_category = cloud_category,
stringsAsFactors = FALSE
# Load/calculate CURRENT week stats (from tiles or RDS cache)
message("\n1. Loading/calculating CURRENT week statistics (week", current_week, ")...")
current_stats <- load_or_calculate_weekly_stats(
week_num = current_week,
year = year,
project_dir = project_dir,
field_boundaries_sf = field_boundaries_sf,
mosaic_dir = tile_grid$mosaic_dir,
reports_dir = reports_dir,
report_date = end_date
)
field_results_this_tile[[as.character(field_id)]] <- result_row
fields_added <- fields_added + 1
message(paste(" ✓ Loaded/calculated stats for", nrow(current_stats), "fields in current week"))
}, error = function(e) {
# Show error for debugging
message(paste(" [FIELD ERROR] Field", field_idx, ":", e$message))
# Load/calculate PREVIOUS week stats (from RDS cache or tiles)
message("\n2. Loading/calculating PREVIOUS week statistics (week", previous_week, ")...")
prev_stats <- load_or_calculate_weekly_stats(
week_num = previous_week,
year = year,
project_dir = project_dir,
field_boundaries_sf = field_boundaries_sf,
mosaic_dir = tile_grid$mosaic_dir,
reports_dir = reports_dir,
report_date = end_date - 7 # Approximately 1 week before
)
message(paste(" ✓ Loaded/calculated stats for", nrow(prev_stats), "fields in previous week"))
# Apply trend calculations (requires both weeks)
message("\n3. Calculating trend columns...")
current_stats <- calculate_kpi_trends(current_stats, prev_stats)
message(paste(" ✓ Added Weekly_ci_change, CV_Trend_Short_Term, nmr_weeks_in_this_phase"))
# ============================================================================
# Build final output dataframe with all 21 columns
# ============================================================================
message("\nBuilding final field analysis output...")
field_analysis_df <- current_stats %>%
mutate(
# Column 2: Farm_Section (user fills)
Farm_Section = NA_character_,
# Column 3: Field_name (from GeoJSON - already have Field_id, can look up)
Field_name = Field_id,
# Column 4: Acreage (calculate from geometry)
Acreage = {
acreages <- sapply(seq_len(nrow(field_boundaries_sf)), function(idx) {
field_sf <- field_boundaries_sf[idx, ]
field_area_ha <- as.numeric(sf::st_area(field_sf)) / 10000
field_area_ha / 0.404686
})
}
acreages[match(Field_id, field_boundaries_sf$field)]
},
# Columns 5-6: Already in current_stats (Mean_CI, Weekly_ci_change)
# Column 7: Four_week_trend (Phase 3 future)
Four_week_trend = NA_character_,
# Column 8: Last_harvest_or_planting_date (dummy for now)
Last_harvest_or_planting_date = UNIFORM_PLANTING_DATE,
# Columns 9-10: Already in current_stats (Age_week, Phase)
# Column 11: nmr_weeks_in_this_phase (already calculated)
# Column 12: Germination_progress (already calculated)
# Column 13: Imminent_prob (placeholder)
Imminent_prob = "placeholder data",
# Column 14: Status_trigger (need to add)
Status_trigger = {
triggers <- sapply(seq_len(nrow(current_stats)), function(idx) {
field_id <- current_stats$Field_id[idx]
field_idx <- which(field_boundaries_sf$field == field_id)[1]
if (is.na(field_idx)) return(NA_character_)
if (length(field_results_this_tile) > 0) {
all_tile_results[[basename(tile_file)]] <- dplyr::bind_rows(field_results_this_tile)
message(paste(" Extracted", length(field_results_this_tile), "fields from tile (processed", fields_added, "fields total)"))
} else {
message(paste(" WARNING: No fields extracted from this tile (processed", fields_added, "fields, all either NA or errored)"))
}
# Reconstruct CI values from Mean_CI for status trigger logic
# For now, use simplified approach
age_w <- current_stats$Age_week[idx]
ci_change <- current_stats$Weekly_ci_change[idx]
}, error = function(e) {
message(paste(" Error processing tile", basename(tile_file), ":", e$message))
# Using mean CI as proxy (could be improved with pixel distribution)
ci_vals <- rep(current_stats$Mean_CI[idx], 100)
get_status_trigger(ci_vals, ci_change, age_w)
})
}
triggers
},
# Columns 15-16: Already in current_stats (CI_range, CI_Percentiles)
# Column 17: Already in current_stats (CV)
# Column 18: Already in current_stats (CV_Trend_Short_Term)
# Column 19: CV_Trend_Long_Term (Phase 3 future)
CV_Trend_Long_Term = NA_real_,
# Columns 20-21: Already in current_stats (Cloud_pct_clear, Cloud_category)
.keep = "all" # Keep all existing columns
) %>%
select(
Field_id, Farm_Section, Field_name, Acreage, Mean_CI, Weekly_ci_change,
Four_week_trend, Last_harvest_or_planting_date, Age_week, Phase,
nmr_weeks_in_this_phase, Germination_progress, Imminent_prob, Status_trigger,
CI_range, CI_Percentiles, CV, CV_Trend_Short_Term, CV_Trend_Long_Term,
Cloud_pct_clear, Cloud_category
)
# Combine all tile results, keeping unique fields (may appear in multiple tiles)
if (length(all_tile_results) == 0) {
stop("No fields extracted from any tiles!")
}
field_analysis_df <- dplyr::bind_rows(all_tile_results) %>%
distinct(Field_id, .keep_all = TRUE)
if (nrow(field_analysis_df) == 0) {
stop("No fields analyzed successfully!")
}
message(paste("✓ Analyzed", nrow(field_analysis_df), "fields"))
message(paste("✓ Built final output with", nrow(field_analysis_df), "fields and 21 columns"))
summary_statistics_df <- generate_field_analysis_summary(field_analysis_df)