added last columns of trends, only harvest imminent and age left to do

This commit is contained in:
Timon 2026-01-16 09:10:39 +01:00
parent 8d84c8cab5
commit 3e4430b3be

View file

@ -26,6 +26,55 @@
# source("r_app/80_calculate_kpis.R")
# main()
# ============================================================================
# PENDING WORK - PHASE 4 (Next Sprint)
# ============================================================================
#
# CRITICAL INTEGRATIONS:
#
# 1. IMMINENT_PROB FROM HARVEST MODEL (MODEL_307)
# [ ] Load script 31 output: {project}_imminent_harvest_week{WW}.csv
# Columns: field, imminent_prob, detected_prob, week, year
# [ ] LEFT JOIN to field_analysis_df by (field, week, year)
# [ ] Replace hardcoded "placeholder data" in Status_trigger calculation
# [ ] Update column to show actual harvest probability (0-1 or 0-100%)
#
# 2. AGE FROM HARVEST.XLSX (SCRIPTS 22 & 31)
# [ ] Scripts 22 & 31 populate harvest.xlsx with planting_date per field
# [ ] Load harvest.xlsx instead of using UNIFORM_PLANTING_DATE
# [ ] Calculate Age_week = difftime(report_date, planting_date, units="weeks")
# [ ] Removes TEST MODE hardcoding and enables field-specific aging
#
# UTILITY FILE REFACTORING (Improve code maintainability):
#
# 3. CREATE r_app/80_weekly_stats_utils.R
# [ ] Extract functions from lines 250-795 (calculation layer):
# - calculate_field_statistics()
# - calculate_kpi_trends()
# - load_or_calculate_weekly_stats()
# - Helper: load_tiles_for_field(), get_tile_ids_for_field()
# - Helper: extract_field_statistics_from_ci()
# [ ] Clean separation: DATA CALCULATION ONLY (no Excel export)
# [ ] Reusable by run_full_pipeline.R and other scripts
#
# 4. CREATE r_app/80_report_building_utils.R
# [ ] Extract functions from lines 1350-2100+ (output/reporting layer):
# - generate_field_analysis_summary()
# - export_field_analysis_excel()
# - calculate_and_export_farm_kpis()
# - Helper: categorize_*, get_*, round_* functions
# - Helper: get_phase_by_age(), get_status_trigger()
# [ ] Clean separation: OUTPUT/FORMATTING ONLY (consumes calculated stats)
# [ ] Reusable for alternative export formats (PDF, HTML, dashboard)
#
# TESTING PLAN:
# [ ] Verify 8-week historical data loads (currently TEST_MODE = 2 weeks only)
# [ ] Confirm Four_week_trend calculates from 1-4 weeks (graceful degradation)
# [ ] Confirm CV_Trend_Long_Term uses full 8-week regression (when available)
# [ ] Load script 31 output and validate imminent_prob population
#
# ============================================================================
# ============================================================================
# EXCEL OUTPUT SPECIFICATION (21 COLUMNS)
# ============================================================================
@ -114,6 +163,20 @@ FOUR_WEEK_TREND_STRONG_DECLINE_MAX <- -0.5
# CV TREND THRESHOLDS
CV_TREND_THRESHOLD_SIGNIFICANT <- 0.05
# CV_TREND_LONG_TERM (8-WEEK SLOPE) INTERPRETATION THRESHOLDS
# Interpretation: Slope of CV over 8 weeks indicates field uniformity trend
# Negative slope = CV decreasing = field becoming MORE uniform = GOOD
# Positive slope = CV increasing = field becoming MORE patchy = BAD
# Near zero = Homogenous growth (all crops progressing equally)
CV_SLOPE_STRONG_IMPROVEMENT_MIN <- -0.05 # CV decreasing rapidly
CV_SLOPE_IMPROVEMENT_MIN <- -0.02 # Gradual synchronization
CV_SLOPE_IMPROVEMENT_MAX <- -0.005 # Becoming uniform
CV_SLOPE_HOMOGENOUS_MIN <- -0.005 # Stable, uniform growth
CV_SLOPE_HOMOGENOUS_MAX <- 0.005 # No change in uniformity
CV_SLOPE_PATCHINESS_MIN <- 0.005 # Minor divergence
CV_SLOPE_PATCHINESS_MAX <- 0.02 # Growing patchiness
CV_SLOPE_SEVERE_MIN <- 0.02 # Field fragmentation beginning
# CLOUD COVER ROUNDING INTERVALS
CLOUD_INTERVALS <- c(0, 50, 60, 70, 80, 90, 100)
@ -404,6 +467,100 @@ calculate_cv_trend <- function(cv_current, cv_previous) {
return(round(cv_current - cv_previous, 4))
}
calculate_four_week_trend <- function(mean_ci_values) {
#' Calculate four-week CI trend from available weeks
#'
#' Uses whatever weeks are available (1-4 weeks) to estimate trend
#' Returns difference between current (most recent) and oldest available week
#'
#' @param mean_ci_values vector of Mean_CI values in chronological order (oldest to newest)
#' @return numeric: CI difference (current - oldest), rounded to 2 decimals
if (is.null(mean_ci_values) || length(mean_ci_values) == 0) {
return(NA_real_)
}
# Remove NAs
ci_clean <- mean_ci_values[!is.na(mean_ci_values)]
if (length(ci_clean) < 2) {
# Need at least 2 weeks to calculate trend
return(NA_real_)
}
# Calculate difference: current - oldest
trend <- ci_clean[length(ci_clean)] - ci_clean[1]
return(round(trend, 2))
}
categorize_cv_slope <- function(slope) {
#' Categorize CV slope (8-week regression) into field uniformity interpretation
#'
#' Slope interpretation:
#' Negative slope = CV decreasing = field becoming MORE uniform = GOOD
#' Positive slope = CV increasing = field becoming MORE patchy = BAD
#' Near zero = Homogenous growth (all crops progressing equally)
#'
#' Categories:
#' - "Excellent uniformity": Slope <= -0.02 (CV decreasing, field synchronizing)
#' - "Homogenous growth": -0.02 < slope < 0.005 (stable, uniform growth)
#' - "Minor patchiness": 0.005 <= slope <= 0.02 (CV slowly increasing)
#' - "Severe fragmentation": slope > 0.02 (rapid CV increase, parts diverging)
#'
#' @param slope numeric: CV trend slope per week
#' @return character: Category string
if (is.na(slope)) {
return(NA_character_)
}
if (slope <= CV_SLOPE_IMPROVEMENT_MIN) {
return("Excellent uniformity")
} else if (slope < CV_SLOPE_HOMOGENOUS_MIN) {
return("Homogenous growth")
} else if (slope <= CV_SLOPE_HOMOGENOUS_MAX) {
return("Homogenous growth")
} else if (slope <= CV_SLOPE_PATCHINESS_MAX) {
return("Minor patchiness")
} else {
return("Severe fragmentation")
}
}
calculate_cv_trend_long_term <- function(cv_values) {
#' Calculate 8-week CV trend via linear regression slope
#'
#' Fits linear regression to CV over available weeks (1-8)
#' Returns slope = rate of change per week
#'
#' @param cv_values vector of CV values in chronological order (oldest to newest)
#' @return numeric: Regression slope (CV change per week), rounded to 4 decimals
if (is.null(cv_values) || length(cv_values) == 0) {
return(NA_real_)
}
# Remove NAs
cv_clean <- cv_values[!is.na(cv_values)]
if (length(cv_clean) < 2) {
# Need at least 2 weeks to fit a line
return(NA_real_)
}
# Create week sequence matching data length
weeks <- seq_along(cv_clean)
# Fit linear model
tryCatch({
lm_fit <- lm(cv_clean ~ weeks)
slope <- coef(lm_fit)["weeks"]
return(round(as.numeric(slope), 4))
}, error = function(e) {
return(NA_real_)
})
}
# ============================================================================
# HELPER FUNCTIONS
# ============================================================================
@ -498,7 +655,7 @@ load_historical_field_data <- function(project_dir, current_week, reports_dir, n
}
USE_UNIFORM_AGE <- TRUE
UNIFORM_PLANTING_DATE <- as.Date("2025-01-01")
UNIFORM_PLANTING_DATE <- as.Date("2026-01-01")
extract_planting_dates <- function(harvesting_data, field_boundaries_sf = NULL) {
if (USE_UNIFORM_AGE) {
@ -688,13 +845,17 @@ calculate_field_statistics <- function(field_boundaries_sf, week_num, year,
# CALCULATE KPI TRENDS (Requires previous week RDS)
# ============================================================================
calculate_kpi_trends <- function(current_stats, prev_stats = NULL) {
calculate_kpi_trends <- function(current_stats, prev_stats = NULL,
project_dir = NULL, reports_dir = NULL,
current_week = NULL, year = NULL) {
message("Calculating KPI trends from current and previous week data")
# Initialize new columns with defaults
current_stats$Weekly_ci_change <- NA_real_
current_stats$CV_Trend_Short_Term <- NA_real_
current_stats$Four_week_trend <- NA_real_
current_stats$CV_Trend_Long_Term <- NA_real_
current_stats$nmr_weeks_in_this_phase <- 1L
# If no previous week data, return with defaults
@ -704,36 +865,18 @@ calculate_kpi_trends <- function(current_stats, prev_stats = NULL) {
}
message(paste(" prev_stats has", nrow(prev_stats), "rows and", ncol(prev_stats), "columns"))
message(paste(" prev_stats columns:", paste(names(prev_stats), collapse = ", ")))
# Build lookup indices for previous week (by Field_id)
prev_lookup <- setNames(seq_len(nrow(prev_stats)), prev_stats$Field_id)
# Try to load previous week's field_analysis to get nmr_weeks_in_this_phase history
prev_field_analysis <- NULL
prev_analysis_csv <- file.path(
reports_dir, "kpis", "field_analysis",
sprintf("%s_field_analysis_week%02d.csv",
paste(strsplit(current_stats$Field_id[1], "")[[1]][1], collapse=""), # Get project from field
as.numeric(format(Sys.Date() - 7, "%V"))) # Approximate previous week
)
# Better way: construct the previous week number properly
current_week_num <- as.numeric(format(Sys.Date(), "%V"))
prev_week_num <- current_week_num - 1
if (prev_week_num < 1) prev_week_num <- 52
# This is a bit tricky - we need the project_dir from the main scope
# For now, assume we can infer it or pass it through
# Let's use a simpler approach: look for any field_analysis_week* file that's recent
tryCatch({
analysis_dir <- file.path(reports_dir, "kpis", "field_analysis")
if (dir.exists(analysis_dir)) {
# Find the most recent field_analysis CSV (should be previous week)
analysis_files <- list.files(analysis_dir, pattern = "_field_analysis_week.*\\.csv$", full.names = TRUE)
if (length(analysis_files) > 0) {
# Sort by modification time and get the most recent
recent_file <- analysis_files[which.max(file.info(analysis_files)$mtime)]
prev_field_analysis <- readr::read_csv(recent_file, show_col_types = FALSE,
col_select = c(Field_id, nmr_weeks_in_this_phase, Phase))
@ -747,49 +890,139 @@ calculate_kpi_trends <- function(current_stats, prev_stats = NULL) {
message(paste(" Using previous field_analysis to track nmr_weeks_in_this_phase"))
}
# For each field in current week, lookup previous values
# ============================================================
# PHASE 3: Load 4-8 weeks of historical data for trend calculations
# ============================================================
historical_4weeks <- list()
historical_8weeks <- list()
if (!is.null(project_dir) && !is.null(reports_dir) && !is.null(current_week)) {
message(" Loading historical field_stats for 4-week and 8-week trends...")
# Load up to 4 weeks back for four_week_trend
for (lookback in 1:4) {
target_week <- current_week - lookback
if (target_week < 1) target_week <- target_week + 52
rds_filename <- sprintf("%s_field_stats_week%02d.rds", project_dir, target_week)
rds_path <- file.path(reports_dir, "kpis", "field_stats", rds_filename)
if (file.exists(rds_path)) {
tryCatch({
stats_data <- readRDS(rds_path)
historical_4weeks[[length(historical_4weeks) + 1]] <- list(
week = target_week,
stats = stats_data
)
}, error = function(e) {
message(paste(" Warning: Could not load week", target_week, ":", e$message))
})
}
}
# Load up to 8 weeks back for cv_trend_long_term
for (lookback in 1:8) {
target_week <- current_week - lookback
if (target_week < 1) target_week <- target_week + 52
rds_filename <- sprintf("%s_field_stats_week%02d.rds", project_dir, target_week)
rds_path <- file.path(reports_dir, "kpis", "field_stats", rds_filename)
if (file.exists(rds_path)) {
tryCatch({
stats_data <- readRDS(rds_path)
historical_8weeks[[length(historical_8weeks) + 1]] <- list(
week = target_week,
stats = stats_data
)
}, error = function(e) {
# Silently skip - we'll work with whatever weeks exist
})
}
}
if (length(historical_4weeks) > 0) {
message(paste(" Loaded", length(historical_4weeks), "weeks for 4-week trend"))
}
if (length(historical_8weeks) > 0) {
message(paste(" Loaded", length(historical_8weeks), "weeks for 8-week CV trend"))
}
}
# For each field in current week, lookup previous values and calculate trends
cv_trends_calculated <- 0
four_week_trends_calculated <- 0
cv_long_term_calculated <- 0
for (i in seq_len(nrow(current_stats))) {
field_id <- current_stats$Field_id[i]
prev_idx <- prev_lookup[field_id]
if (!is.na(prev_idx) && prev_idx > 0 && prev_idx <= nrow(prev_stats)) {
# Field exists in previous week - extract row carefully
prev_row <- prev_stats[prev_idx, , drop = FALSE] # Keep as dataframe
prev_row <- prev_stats[prev_idx, , drop = FALSE]
if (nrow(prev_row) == 0) {
# Field not found - skip
next
}
# Access values from single-row dataframe
prev_mean_ci <- prev_row$Mean_CI[1]
prev_cv <- prev_row$CV[1]
prev_phase <- prev_row$Phase[1]
# Weekly CI change (current Mean_CI - previous Mean_CI)
if (!is.na(prev_mean_ci) && !is.na(current_stats$Mean_CI[i])) {
# WEEKLY CI CHANGE
prev_ci <- prev_row$Mean_CI[1]
if (!is.na(prev_ci) && !is.na(current_stats$Mean_CI[i])) {
current_stats$Weekly_ci_change[i] <-
round(current_stats$Mean_CI[i] - prev_mean_ci, 2)
}
# CV short-term trend (current CV - previous CV)
# DEBUG: Check first few fields
if (i <= 3) {
message(paste(" Field", field_id, "- CV_prev:", prev_cv, "CV_curr:", current_stats$CV[i]))
round(current_stats$Mean_CI[i] - prev_ci, 2)
}
# CV TREND SHORT TERM (2-week comparison)
prev_cv <- prev_row$CV[1]
if (!is.na(prev_cv) && !is.na(current_stats$CV[i])) {
trend_val <- round(current_stats$CV[i] - prev_cv, 4)
current_stats$CV_Trend_Short_Term[i] <- trend_val
current_stats$CV_Trend_Short_Term[i] <-
calculate_cv_trend(current_stats$CV[i], prev_cv)
cv_trends_calculated <- cv_trends_calculated + 1
}
# FOUR-WEEK TREND (if available)
if (length(historical_4weeks) > 0) {
ci_values_4week <- numeric()
if (i <= 3) {
message(paste(" -> CV_Trend =", trend_val))
# Add oldest available weeks (reverse order to get oldest first)
for (hist_idx in rev(seq_along(historical_4weeks))) {
hist_data <- historical_4weeks[[hist_idx]]$stats
hist_field <- which(hist_data$Field_id == field_id)
if (length(hist_field) > 0 && !is.na(hist_data$Mean_CI[hist_field[1]])) {
ci_values_4week <- c(ci_values_4week, hist_data$Mean_CI[hist_field[1]])
}
}
# Add current week CI
ci_values_4week <- c(ci_values_4week, current_stats$Mean_CI[i])
if (length(ci_values_4week) >= 2) {
current_stats$Four_week_trend[i] <- calculate_four_week_trend(ci_values_4week)
four_week_trends_calculated <- four_week_trends_calculated + 1
}
}
# Weeks in current phase (track phase transitions)
# CV TREND LONG TERM (8-week slope)
if (length(historical_8weeks) > 0) {
cv_values_8week <- numeric()
# Add oldest available weeks (reverse order to get oldest first)
for (hist_idx in rev(seq_along(historical_8weeks))) {
hist_data <- historical_8weeks[[hist_idx]]$stats
hist_field <- which(hist_data$Field_id == field_id)
if (length(hist_field) > 0 && !is.na(hist_data$CV[hist_field[1]])) {
cv_values_8week <- c(cv_values_8week, hist_data$CV[hist_field[1]])
}
}
# Add current week CV
cv_values_8week <- c(cv_values_8week, current_stats$CV[i])
if (length(cv_values_8week) >= 2) {
slope <- calculate_cv_trend_long_term(cv_values_8week)
current_stats$CV_Trend_Long_Term[i] <- slope
cv_long_term_calculated <- cv_long_term_calculated + 1
}
}
# WEEKS IN CURRENT PHASE (track phase transitions)
# Use previous field_analysis if available for proper counter progression
if (!is.null(prev_field_analysis) && nrow(prev_field_analysis) > 0) {
# Look up this field in previous analysis
@ -810,9 +1043,9 @@ calculate_kpi_trends <- function(current_stats, prev_stats = NULL) {
current_stats$nmr_weeks_in_this_phase[i] <- 1L
}
}
} else if (!is.na(current_stats$Phase[i]) && !is.na(prev_phase)) {
} else if (!is.na(current_stats$Phase[i]) && !is.na(prev_row$Phase[1])) {
# Field not in previous analysis, fall back to prev_stats phase comparison
if (current_stats$Phase[i] == prev_phase) {
if (current_stats$Phase[i] == prev_row$Phase[1]) {
current_stats$nmr_weeks_in_this_phase[i] <- 2L
} else {
current_stats$nmr_weeks_in_this_phase[i] <- 1L
@ -820,9 +1053,9 @@ calculate_kpi_trends <- function(current_stats, prev_stats = NULL) {
}
} else {
# No previous field_analysis available - use phase from prev_stats
if (!is.na(current_stats$Phase[i]) && !is.na(prev_phase)) {
if (current_stats$Phase[i] == prev_phase) {
# Same phase - increment counter (start with 2)
if (!is.na(current_stats$Phase[i]) && !is.na(prev_row$Phase[1])) {
if (current_stats$Phase[i] == prev_row$Phase[1]) {
# Same phase - increment counter (start with 2 since prev week was in this phase)
current_stats$nmr_weeks_in_this_phase[i] <- 2L
} else {
# Phase changed - reset to 1
@ -833,8 +1066,9 @@ calculate_kpi_trends <- function(current_stats, prev_stats = NULL) {
}
}
message(paste(" ✓ Calculated", cv_trends_calculated, "CV_Trend_Short_Term values out of", nrow(current_stats), "fields"))
message(paste(" CV_Trend_Short_Term non-NA values:", sum(!is.na(current_stats$CV_Trend_Short_Term))))
message(paste(" ✓ Calculated CV_Trend_Short_Term:", cv_trends_calculated, "fields"))
message(paste(" ✓ Calculated Four_week_trend:", four_week_trends_calculated, "fields"))
message(paste(" ✓ Calculated CV_Trend_Long_Term:", cv_long_term_calculated, "fields"))
return(current_stats)
}
@ -1702,9 +1936,13 @@ main <- function() {
# Apply trend calculations (requires both weeks)
message("\n3. Calculating trend columns...")
current_stats <- calculate_kpi_trends(current_stats, prev_stats)
current_stats <- calculate_kpi_trends(current_stats, prev_stats,
project_dir = project_dir,
reports_dir = reports_dir,
current_week = current_week,
year = year)
message(paste(" ✓ Added Weekly_ci_change, CV_Trend_Short_Term, nmr_weeks_in_this_phase"))
message(paste(" ✓ Added Weekly_ci_change, CV_Trend_Short_Term, Four_week_trend, CV_Trend_Long_Term, nmr_weeks_in_this_phase"))
# ============================================================================
# Build final output dataframe with all 21 columns
@ -1758,8 +1996,7 @@ main <- function() {
if_else(is.na(acreages_vec), 0, acreages_vec)
},
# Columns 5-6: Already in current_stats (Mean_CI, Weekly_ci_change)
# Column 7: Four_week_trend (Phase 3 future)
Four_week_trend = NA_character_,
# Column 7: Four_week_trend (from current_stats)
# Column 8: Last_harvest_or_planting_date (dummy for now)
Last_harvest_or_planting_date = UNIFORM_PLANTING_DATE,
# Columns 9-10: Already in current_stats (Age_week, Phase)
@ -1788,8 +2025,11 @@ main <- function() {
# Columns 15-16: Already in current_stats (CI_range, CI_Percentiles)
# Column 17: Already in current_stats (CV)
# Column 18: Already in current_stats (CV_Trend_Short_Term)
# Column 19: CV_Trend_Long_Term (Phase 3 future)
CV_Trend_Long_Term = NA_real_,
# Column 19: CV_Trend_Long_Term (from current_stats - raw slope value)
# Column 19b: CV_Trend_Long_Term_Category (categorical interpretation of slope)
CV_Trend_Long_Term_Category = {
sapply(current_stats$CV_Trend_Long_Term, categorize_cv_slope)
},
# Columns 20-21: Already in current_stats (Cloud_pct_clear, Cloud_category)
.keep = "all" # Keep all existing columns
) %>%
@ -1797,7 +2037,7 @@ main <- function() {
Field_id, Farm_Section, Field_name, Acreage, Mean_CI, Weekly_ci_change,
Four_week_trend, Last_harvest_or_planting_date, Age_week, Phase,
nmr_weeks_in_this_phase, Germination_progress, Imminent_prob, Status_trigger,
CI_range, CI_Percentiles, CV, CV_Trend_Short_Term, CV_Trend_Long_Term,
CI_range, CI_Percentiles, CV, CV_Trend_Short_Term, CV_Trend_Long_Term, CV_Trend_Long_Term_Category,
Cloud_pct_clear, Cloud_category
)