From e16677eb78ed65b4eabb459ce2c8c67d1fcbb6c6 Mon Sep 17 00:00:00 2001 From: Timon Date: Wed, 4 Feb 2026 12:24:02 +0100 Subject: [PATCH] aura until word creation works. word creation itself needs more work. --- .github/copilot-instructions.md | 2 +- r_app/00_common_utils.R | 788 +++++++--------- r_app/10_create_per_field_tiffs.R | 109 ++- r_app/20_ci_extraction_per_field.R | 81 +- r_app/20_ci_extraction_utils.R | 76 +- r_app/21_convert_ci_rds_to_csv.R | 2 +- r_app/30_growth_model_utils.R | 12 +- r_app/30_interpolate_growth_model.R | 64 +- r_app/40_mosaic_creation_per_field.R | 2 +- r_app/80_calculate_kpis.R | 105 ++- r_app/80_utils_agronomic_support.R | 49 +- r_app/{ => old_scripts}/kpi_utils.R | 0 r_app/parameters_project.R | 1256 ++++++++++++-------------- r_app/parameters_project_OLD.R | 1240 +++++++++++++++++++++++++ r_app/run_full_pipeline.R | 240 +++-- 15 files changed, 2590 insertions(+), 1436 deletions(-) rename r_app/{ => old_scripts}/kpi_utils.R (100%) create mode 100644 r_app/parameters_project_OLD.R diff --git a/.github/copilot-instructions.md b/.github/copilot-instructions.md index 8e702b7..f61a2ec 100644 --- a/.github/copilot-instructions.md +++ b/.github/copilot-instructions.md @@ -319,4 +319,4 @@ After each major stage, verify: --- -_For detailed system architecture, see `r_app/system_architecture/system_architecture.md`. For related Linear issues (code quality, architecture docs), see SC-59, SC-60, SC-61._ +_For detailed system architecture, see `r_app/system_architecture/system_architecture.md`. 
diff --git a/r_app/00_common_utils.R b/r_app/00_common_utils.R index 3e18784..49a7b58 100644 --- a/r_app/00_common_utils.R +++ b/r_app/00_common_utils.R @@ -1,500 +1,356 @@ # ============================================================================== -# 00_COMMON_UTILS.R +# # 00_COMMON_UTILS.R # ============================================================================== -# GENERIC UTILITY FUNCTIONS FOR SMARTCANE PIPELINE -# -# PURPOSE: -# Centralized location for foundational utilities used across multiple scripts. -# These functions have NO project knowledge, NO client-type dependencies, -# NO domain-specific logic. -# -# USAGE: -# All scripts (10, 20, 21, 30, 40, 80, 90, 91) should source this file: -# -# source(here::here("r_app", "parameters_project.R")) # Config first -# source(here::here("r_app", "00_common_utils.R")) # Then common utilities -# -# FUNCTIONS: -# 1. safe_log() — Generic logging with [LEVEL] prefix -# 2. smartcane_debug() — Conditional debug logging -# 3. smartcane_warn() — Convenience wrapper for WARN-level messages -# 4. date_list() — Generate date sequences for processing windows -# 5. get_iso_week() — Extract ISO week number from date -# 6. get_iso_year() — Extract ISO year from date -# 7. get_iso_week_year() — Extract both ISO week and year as list -# 8. format_week_label() — Format date as week/year label (e.g., "week01_2025") -# 9. load_field_boundaries() — Load field geometries from GeoJSON -# 10. load_harvesting_data() — Load harvest schedule from Excel -# +# # GENERIC UTILITY FUNCTIONS FOR SMARTCANE PIPELINE +# # +# # PURPOSE: +# # Centralized location for foundational utilities used across multiple scripts. +# # These functions have NO project knowledge, NO client-type dependencies, +# # NO domain-specific logic. 
+# # +# # USAGE: +# # All scripts (10, 20, 21, 30, 40, 80, 90, 91) should source this file: +# # +# # source(here::here("r_app", "parameters_project.R")) # Config first +# # source(here::here("r_app", "00_common_utils.R")) # Then common utilities +# # +# # FUNCTIONS: +# # 1. safe_log() — Generic logging with [LEVEL] prefix +# # 2. smartcane_debug() — Conditional debug logging +# # 3. smartcane_warn() — Convenience wrapper for WARN-level messages +# # 4. date_list() — Generate date sequences for processing windows +# # 5. load_field_boundaries() — Load field geometries from GeoJSON +# # 6. repair_geojson_geometries() — Fix invalid geometries in GeoJSON objects +# # +# # DATE FUNCTIONS (now in parameters_project.R): +# # - get_iso_week() — Extract ISO week number from date +# # - get_iso_year() — Extract ISO year from date +# # - get_iso_week_year() — Extract both ISO week and year as list +# # - format_week_label() — Format date as week/year label +# # - load_harvesting_data() — Load harvest schedule from Excel +# # # ============================================================================== -#' Safe Logging Function -#' -#' Generic logging with [LEVEL] prefix. Works standalone without any framework. -#' Consistent with SmartCane logging standard. -#' -#' @param message The message to log -#' @param level The log level (default: "INFO"). Options: "INFO", "WARNING", "ERROR", "DEBUG" -#' @return NULL (invisible, used for side effects) -#' -#' @examples -#' safe_log("Processing started", "INFO") -#' safe_log("Check input file", "WARNING") -#' safe_log("Failed to load data", "ERROR") -#' -safe_log <- function(message, level = "INFO") { - prefix <- sprintf("[%s]", level) - cat(sprintf("%s %s\n", prefix, message)) -} +# # Source parameters first to get shared date utility functions +# source("parameters_project.R") -#' SmartCane Debug Logging (Conditional) -#' -#' Logs DEBUG-level messages only if verbose=TRUE or SMARTCANE_DEBUG env var is set. 
-#' Useful for development/troubleshooting without cluttering normal output. -#' -#' @param message The message to log -#' @param verbose Whether to output regardless of SMARTCANE_DEBUG (default: FALSE) -#' @return NULL (invisible, used for side effects) -#' -#' @examples -#' smartcane_debug("Processing field 1", verbose = FALSE) # Only if SMARTCANE_DEBUG=TRUE -#' smartcane_debug("Detailed state info", verbose = TRUE) # Always outputs -#' -smartcane_debug <- function(message, verbose = FALSE) { - if (!verbose && Sys.getenv("SMARTCANE_DEBUG") != "TRUE") { - return(invisible(NULL)) - } - safe_log(message, level = "DEBUG") -} +# #' Safe Logging Function +# #' +# #' Generic logging with [LEVEL] prefix. Works standalone without any framework. +# #' Consistent with SmartCane logging standard. +# #' +# #' @param message The message to log +# #' @param level The log level (default: "INFO"). Options: "INFO", "WARNING", "ERROR", "DEBUG" +# #' @return NULL (invisible, used for side effects) +# #' +# #' @examples +# #' safe_log("Processing started", "INFO") +# #' safe_log("Check input file", "WARNING") +# #' safe_log("Failed to load data", "ERROR") +# #' +# safe_log <- function(message, level = "INFO") { +# prefix <- sprintf("[%s]", level) +# cat(sprintf("%s %s\n", prefix, message)) +# } -#' SmartCane Warning Logging -#' -#' Logs WARN-level messages. Convenience wrapper around safe_log(). -#' -#' @param message The message to log -#' @return NULL (invisible, used for side effects) -#' -#' @examples -#' smartcane_warn("Check data format before proceeding") -#' -smartcane_warn <- function(message) { - safe_log(message, level = "WARN") -} +# #' SmartCane Debug Logging (Conditional) +# #' +# #' Logs DEBUG-level messages only if verbose=TRUE or SMARTCANE_DEBUG env var is set. +# #' Useful for development/troubleshooting without cluttering normal output. 
+# #' +# #' @param message The message to log +# #' @param verbose Whether to output regardless of SMARTCANE_DEBUG (default: FALSE) +# #' @return NULL (invisible, used for side effects) +# #' +# #' @examples +# #' smartcane_debug("Processing field 1", verbose = FALSE) # Only if SMARTCANE_DEBUG=TRUE +# #' smartcane_debug("Detailed state info", verbose = TRUE) # Always outputs +# #' +# smartcane_debug <- function(message, verbose = FALSE) { +# if (!verbose && Sys.getenv("SMARTCANE_DEBUG") != "TRUE") { +# return(invisible(NULL)) +# } +# safe_log(message, level = "DEBUG") +# } -#' Extract ISO Week Number from Date -#' -#' Extracts ISO week number (1-53) from a date using %V format. -#' ISO weeks follow the international standard: Week 1 starts on Monday. -#' -#' @param date A Date object or string convertible to Date -#' @return Numeric: ISO week number (1-53) -#' -#' @examples -#' get_iso_week(as.Date("2025-01-15")) # Returns: 3 -#' -get_iso_week <- function(date) { - as.numeric(format(date, "%V")) -} +# #' SmartCane Warning Logging +# #' +# #' Logs WARN-level messages. Convenience wrapper around safe_log(). +# #' +# #' @param message The message to log +# #' @return NULL (invisible, used for side effects) +# #' +# #' @examples +# #' smartcane_warn("Check data format before proceeding") +# #' +# smartcane_warn <- function(message) { +# safe_log(message, level = "WARN") +# } -#' Extract ISO Year from Date -#' -#' Extracts ISO year from a date using %G format. -#' ISO year can differ from calendar year around year boundaries. -#' -#' @param date A Date object or string convertible to Date -#' @return Numeric: ISO year -#' -#' @examples -#' get_iso_year(as.Date("2025-01-01")) # Returns: 2025 -#' -get_iso_year <- function(date) { - as.numeric(format(date, "%G")) -} +# #' Load Field Boundaries from GeoJSON +# #' +# #' Loads field polygon geometries from GeoJSON file (pivot.geojson or pivot_2.geojson). +# #' Handles CRS validation and column standardization. 
+# #' +# #' @param data_dir Directory containing GeoJSON file +# #' @return List with elements: +# #' - field_boundaries_sf: sf (Simple Features) object +# #' - field_boundaries: terra SpatVect object (if conversion successful, else sf fallback) +# #' +# #' @details +# #' Automatically selects pivot_2.geojson for ESA project during CI extraction, +# #' otherwise uses pivot.geojson. Handles both multi-polygon and simple polygon geometries. +# #' +# #' @examples +# #' boundaries <- load_field_boundaries("laravel_app/storage/app/angata") +# #' head(boundaries$field_boundaries_sf) +# #' +# load_field_boundaries <- function(data_dir) { +# # Choose field boundaries file based on project and script type +# # ESA project uses pivot_2.geojson ONLY for scripts 02-03 (CI extraction & growth model) +# # All other scripts (including 04-mosaic, 09-KPIs, 10-reports) use pivot.geojson +# use_pivot_2 <- exists("project_dir") && project_dir == "esa" && +# exists("ci_extraction_script") # ci_extraction_script flag set by scripts 02-03 -#' Extract ISO Week and Year as List -#' -#' Combines get_iso_week() and get_iso_year() for convenience. -#' -#' @param date A Date object or string convertible to Date -#' @return List with elements: week (1-53), year -#' -#' @examples -#' wwy <- get_iso_week_year(as.Date("2025-01-15")) -#' # Returns: list(week = 3, year = 2025) -#' -get_iso_week_year <- function(date) { - list( - week = as.numeric(format(date, "%V")), - year = as.numeric(format(date, "%G")) - ) -} +# if (use_pivot_2) { +# field_boundaries_path <- file.path(data_dir, "pivot_2.geojson") +# } else { +# field_boundaries_path <- file.path(data_dir, "pivot.geojson") +# } -#' Format Date as Week/Year Label -#' -#' Converts a date into a readable week label format. -#' Useful for filenames, directory names, and output identification. 
-#' -#' @param date A Date object or string convertible to Date -#' @param separator Separator between week number and year (default: "_") -#' @return String in format "week##_YYYY" (e.g., "week03_2025") -#' -#' @examples -#' format_week_label(as.Date("2025-01-15")) # "week03_2025" -#' format_week_label(as.Date("2025-01-15"), "-") # "week03-2025" -#' -format_week_label <- function(date, separator = "_") { - wwy <- get_iso_week_year(date) - sprintf("week%02d%s%d", wwy$week, separator, wwy$year) -} - -#' Load Field Boundaries from GeoJSON -#' -#' Loads field polygon geometries from GeoJSON file (pivot.geojson or pivot_2.geojson). -#' Handles CRS validation and column standardization. -#' -#' @param data_dir Directory containing GeoJSON file -#' @return List with elements: -#' - field_boundaries_sf: sf (Simple Features) object -#' - field_boundaries: terra SpatVect object (if conversion successful, else sf fallback) -#' -#' @details -#' Automatically selects pivot_2.geojson for ESA project during CI extraction, -#' otherwise uses pivot.geojson. Handles both multi-polygon and simple polygon geometries. 
-#' -#' @examples -#' boundaries <- load_field_boundaries("laravel_app/storage/app/angata") -#' head(boundaries$field_boundaries_sf) -#' -load_field_boundaries <- function(data_dir) { - # Choose field boundaries file based on project and script type - # ESA project uses pivot_2.geojson ONLY for scripts 02-03 (CI extraction & growth model) - # All other scripts (including 04-mosaic, 09-KPIs, 10-reports) use pivot.geojson - use_pivot_2 <- exists("project_dir") && project_dir == "esa" && - exists("ci_extraction_script") # ci_extraction_script flag set by scripts 02-03 - - if (use_pivot_2) { - field_boundaries_path <- here(data_dir, "pivot_2.geojson") - } else { - field_boundaries_path <- here(data_dir, "Data", "pivot.geojson") - } - - if (!file.exists(field_boundaries_path)) { - stop(paste("Field boundaries file not found at path:", field_boundaries_path)) - } +# if (!file.exists(field_boundaries_path)) { +# stop(paste("Field boundaries file not found at path:", field_boundaries_path)) +# } - tryCatch({ - # Read GeoJSON with explicit CRS handling - field_boundaries_sf <- st_read(field_boundaries_path, quiet = TRUE) +# tryCatch({ +# # Read GeoJSON with explicit CRS handling +# field_boundaries_sf <- st_read(field_boundaries_path, quiet = TRUE) + +# # Remove OBJECTID column immediately if it exists +# if ("OBJECTID" %in% names(field_boundaries_sf)) { +# field_boundaries_sf <- field_boundaries_sf %>% select(-OBJECTID) +# } + +# # **CRITICAL**: Repair invalid geometries (degenerate vertices, self-intersections, etc.) +# # This must happen BEFORE any spatial operations (CRS transform, intersect, crop, etc.) 
+# # to prevent S2 geometry validation errors during downstream processing +# field_boundaries_sf <- repair_geojson_geometries(field_boundaries_sf) + +# # Validate and fix CRS if needed +# tryCatch({ +# # Simply assign WGS84 if not already set (safe approach) +# if (is.na(sf::st_crs(field_boundaries_sf)$epsg)) { +# st_crs(field_boundaries_sf) <- 4326 +# warning("CRS was missing, assigned WGS84 (EPSG:4326)") +# } +# }, error = function(e) { +# tryCatch({ +# st_crs(field_boundaries_sf) <<- 4326 +# }, error = function(e2) { +# warning(paste("Could not set CRS:", e2$message)) +# }) +# }) + +# # Handle column names - accommodate optional sub_area column +# if ("sub_area" %in% names(field_boundaries_sf)) { +# field_boundaries_sf <- field_boundaries_sf %>% +# dplyr::select(field, sub_field, sub_area) %>% +# sf::st_set_geometry("geometry") +# } else { +# field_boundaries_sf <- field_boundaries_sf %>% +# dplyr::select(field, sub_field) %>% +# sf::st_set_geometry("geometry") +# } + +# # Convert to terra vector if possible, otherwise use sf +# field_boundaries <- tryCatch({ +# field_boundaries_terra <- terra::vect(field_boundaries_sf) +# crs_value <- tryCatch(terra::crs(field_boundaries_terra), error = function(e) NULL) +# crs_str <- if (!is.null(crs_value)) as.character(crs_value) else "" - # Remove OBJECTID column immediately if it exists - if ("OBJECTID" %in% names(field_boundaries_sf)) { - field_boundaries_sf <- field_boundaries_sf %>% select(-OBJECTID) - } +# if (is.null(crs_value) || length(crs_value) == 0 || nchar(crs_str) == 0) { +# terra::crs(field_boundaries_terra) <- "EPSG:4326" +# warning("Terra object CRS was empty, assigned WGS84 (EPSG:4326)") +# } +# field_boundaries_terra - # **CRITICAL**: Repair invalid geometries (degenerate vertices, self-intersections, etc.) - # This must happen BEFORE any spatial operations (CRS transform, intersect, crop, etc.) 
- # to prevent S2 geometry validation errors during downstream processing - field_boundaries_sf <- repair_geojson_geometries(field_boundaries_sf) - - # Validate and fix CRS if needed - tryCatch({ - # Simply assign WGS84 if not already set (safe approach) - if (is.na(sf::st_crs(field_boundaries_sf)$epsg)) { - st_crs(field_boundaries_sf) <- 4326 - warning("CRS was missing, assigned WGS84 (EPSG:4326)") - } - }, error = function(e) { - tryCatch({ - st_crs(field_boundaries_sf) <<- 4326 - }, error = function(e2) { - warning(paste("Could not set CRS:", e2$message)) - }) - }) - - # Handle column names - accommodate optional sub_area column - if ("sub_area" %in% names(field_boundaries_sf)) { - field_boundaries_sf <- field_boundaries_sf %>% - dplyr::select(field, sub_field, sub_area) %>% - sf::st_set_geometry("geometry") - } else { - field_boundaries_sf <- field_boundaries_sf %>% - dplyr::select(field, sub_field) %>% - sf::st_set_geometry("geometry") - } - - # Convert to terra vector if possible, otherwise use sf - field_boundaries <- tryCatch({ - field_boundaries_terra <- terra::vect(field_boundaries_sf) - crs_value <- tryCatch(terra::crs(field_boundaries_terra), error = function(e) NULL) - crs_str <- if (!is.null(crs_value)) as.character(crs_value) else "" - - if (is.null(crs_value) || length(crs_value) == 0 || nchar(crs_str) == 0) { - terra::crs(field_boundaries_terra) <- "EPSG:4326" - warning("Terra object CRS was empty, assigned WGS84 (EPSG:4326)") - } - field_boundaries_terra - - }, error = function(e) { - warning(paste("Terra conversion failed, using sf object instead:", e$message)) - field_boundaries_sf - }) - - return(list( - field_boundaries_sf = field_boundaries_sf, - field_boundaries = field_boundaries - )) - }, error = function(e) { - cat("[DEBUG] Error in load_field_boundaries:\n") - cat(" Message:", e$message, "\n") - cat(" Call:", deparse(e$call), "\n") - stop(paste("Error loading field boundaries:", e$message)) - }) -} +# }, error = function(e) { +# 
warning(paste("Terra conversion failed, using sf object instead:", e$message)) +# field_boundaries_sf +# }) + +# return(list( +# field_boundaries_sf = field_boundaries_sf, +# field_boundaries = field_boundaries +# )) +# }, error = function(e) { +# cat("[DEBUG] Error in load_field_boundaries:\n") +# cat(" Message:", e$message, "\n") +# cat(" Call:", deparse(e$call), "\n") +# stop(paste("Error loading field boundaries:", e$message)) +# }) +# } -#' Load Harvesting Data from Excel -#' -#' Loads crop harvest schedule from harvest.xlsx file. -#' Handles flexible date formats (numeric, YYYY-MM-DD, DD/MM/YYYY, etc.). -#' -#' @param data_dir Directory containing harvest.xlsx file -#' @return Data frame with columns: field, sub_field, year, season_start, season_end, -#' age (weeks), sub_area, tonnage_ha. Returns NULL if file not found. -#' -#' @examples -#' harvest <- load_harvesting_data("laravel_app/storage/app/angata") -#' head(harvest) -#' -load_harvesting_data <- function(data_dir) { - harvest_file <- here(data_dir, "harvest.xlsx") - - if (!file.exists(harvest_file)) { - warning(paste("Harvest data file not found at path:", harvest_file)) - return(NULL) - } - - # Helper function to parse dates with multiple format detection - parse_flexible_date <- function(x) { - if (is.na(x) || is.null(x)) return(NA_real_) - if (inherits(x, "Date")) return(x) - if (inherits(x, "POSIXct")) return(as.Date(x)) - - # If it's numeric (Excel date serial), convert directly - if (is.numeric(x)) { - return(as.Date(x, origin = "1899-12-30")) - } - - # Try character conversion with multiple formats - x_char <- as.character(x) - formats <- c("%Y-%m-%d", "%d/%m/%Y", "%m/%d/%Y", "%Y-%m-%d %H:%M:%S") - - for (fmt in formats) { - result <- suppressWarnings(as.Date(x_char, format = fmt)) - if (!is.na(result)) return(result) - } - - return(NA) - } - - tryCatch({ - harvesting_data <- read_excel(harvest_file) %>% - dplyr::select( - c( - "field", - "sub_field", - "year", - "season_start", - "season_end", 
- "age", - "sub_area", - "tonnage_ha" - ) - ) %>% - mutate( - field = as.character(field), - sub_field = as.character(sub_field), - year = as.numeric(year), - season_start = sapply(season_start, parse_flexible_date), - season_end = sapply(season_end, parse_flexible_date), - season_start = as.Date(season_start, origin = "1970-01-01"), - season_end = as.Date(season_end, origin = "1970-01-01"), - age = as.numeric(age), - sub_area = as.character(sub_area), - tonnage_ha = as.numeric(tonnage_ha) - ) %>% - mutate( - season_end = case_when( - season_end > Sys.Date() ~ Sys.Date(), - is.na(season_end) ~ Sys.Date(), - TRUE ~ season_end - ), - age = round(as.numeric(season_end - season_start) / 7, 0) - ) - - return(harvesting_data) - }, error = function(e) { - warning(paste("Error loading harvesting data:", e$message)) - return(NULL) - }) -} -#' Generate a Sequence of Dates for Processing -#' -#' Creates a date range from start_date to end_date and extracts week/year info. -#' Used by Scripts 20, 30, 40 to determine data processing windows. -#' -#' @param end_date The end date for the sequence (Date object or "YYYY-MM-DD" string) -#' @param offset Number of days to look back from end_date (e.g., 7 for one week) -#' @return A list containing: -#' - week: ISO week number of start_date -#' - year: ISO year of start_date -#' - days_filter: Vector of dates in "YYYY-MM-DD" format -#' - start_date: Start date as Date object -#' - end_date: End date as Date object -#' -#' @details -#' IMPORTANT: Uses `lubridate::week()` and `lubridate::year()` which return -#' ISO week numbers (week 1 starts on Monday). For ISO week-based calculations, -#' use `lubridate::isoweek()` and `lubridate::isoyear()` instead. 
-#' -#' @examples -#' dates <- date_list(as.Date("2025-01-15"), offset = 7) -#' # Returns: week=2, year=2025, days_filter = c("2025-01-09", ..., "2025-01-15") -#' -#' dates <- date_list("2025-12-31", offset = 14) -#' # Handles string input and returns 14 days of data -#' -date_list <- function(end_date, offset) { - # Input validation - if (!lubridate::is.Date(end_date)) { - end_date <- as.Date(end_date) - if (is.na(end_date)) { - stop("Invalid end_date provided. Expected a Date object or a string convertible to Date.") - } - } + +# #' Generate a Sequence of Dates for Processing +# #' +# #' Creates a date range from start_date to end_date and extracts week/year info. +# #' Used by Scripts 20, 30, 40 to determine data processing windows. +# #' +# #' @param end_date The end date for the sequence (Date object or "YYYY-MM-DD" string) +# #' @param offset Number of days to look back from end_date (e.g., 7 for one week) +# #' @return A list containing: +# #' - week: ISO week number of start_date +# #' - year: ISO year of start_date +# #' - days_filter: Vector of dates in "YYYY-MM-DD" format +# #' - start_date: Start date as Date object +# #' - end_date: End date as Date object +# #' +# #' @details +# #' IMPORTANT: Uses `lubridate::week()` and `lubridate::year()` which return +# #' ISO week numbers (week 1 starts on Monday). For ISO week-based calculations, +# #' use `lubridate::isoweek()` and `lubridate::isoyear()` instead. +# #' +# #' @examples +# #' dates <- date_list(as.Date("2025-01-15"), offset = 7) +# #' # Returns: week=2, year=2025, days_filter = c("2025-01-09", ..., "2025-01-15") +# #' +# #' dates <- date_list("2025-12-31", offset = 14) +# #' # Handles string input and returns 14 days of data +# #' +# date_list <- function(end_date, offset) { +# # Input validation +# if (!lubridate::is.Date(end_date)) { +# end_date <- as.Date(end_date) +# if (is.na(end_date)) { +# stop("Invalid end_date provided. 
Expected a Date object or a string convertible to Date.") +# } +# } - offset <- as.numeric(offset) - if (is.na(offset) || offset < 1) { - stop("Invalid offset provided. Expected a positive number.") - } +# offset <- as.numeric(offset) +# if (is.na(offset) || offset < 1) { +# stop("Invalid offset provided. Expected a positive number.") +# } - # Calculate date range - offset <- offset - 1 # Adjust offset to include end_date - start_date <- end_date - lubridate::days(offset) +# # Calculate date range +# offset <- offset - 1 # Adjust offset to include end_date +# start_date <- end_date - lubridate::days(offset) - # Extract ISO week and year information (from END date for reporting period) - week <- lubridate::isoweek(end_date) - year <- lubridate::isoyear(end_date) +# # Extract ISO week and year information (from END date for reporting period) +# week <- lubridate::isoweek(end_date) +# year <- lubridate::isoyear(end_date) - # Generate sequence of dates - days_filter <- seq(from = start_date, to = end_date, by = "day") - days_filter <- format(days_filter, "%Y-%m-%d") # Format for consistent filtering +# # Generate sequence of dates +# days_filter <- seq(from = start_date, to = end_date, by = "day") +# days_filter <- format(days_filter, "%Y-%m-%d") # Format for consistent filtering - # Log the date range - safe_log(paste("Date range generated from", start_date, "to", end_date)) +# # Log the date range +# safe_log(paste("Date range generated from", start_date, "to", end_date)) - return(list( - "week" = week, - "year" = year, - "days_filter" = days_filter, - "start_date" = start_date, - "end_date" = end_date - )) -} +# return(list( +# "week" = week, +# "year" = year, +# "days_filter" = days_filter, +# "start_date" = start_date, +# "end_date" = end_date +# )) +# } # ============================================================================== -#' Repair Invalid GeoJSON Geometries -#' -#' Fixes common geometry issues in GeoJSON/sf objects: -#' - Degenerate vertices 
(duplicate points) -#' - Self-intersecting polygons -#' - Invalid ring orientation -#' - Empty or NULL geometries -#' -#' Uses sf::st_make_valid() with buffer trick as fallback. -#' -#' @param sf_object sf object (GeoDataFrame) with potentially invalid geometries -#' @return sf object with repaired geometries -#' -#' @details -#' **Why this matters:** -#' Pivot GeoJSON files sometimes contain degenerate vertices or self-intersecting -#' rings from manual editing or GIS data sources. These cause errors when using -#' S2 geometry (strict validation) during cropping operations. -#' -#' **Repair strategy (priority order):** -#' 1. Try st_make_valid() - GEOS-based repair (most reliable) -#' 2. Fallback: st_union() + buffer(0) - Forces polygon validity -#' 3. Last resort: Silently keep original if repair fails -#' -#' @examples -#' \dontrun{ -#' fields <- st_read("pivot.geojson") -#' fields_fixed <- repair_geojson_geometries(fields) -#' cat(paste("Fixed geometries: before=", -#' nrow(fields[!st_is_valid(fields), ]), -#' ", after=", -#' nrow(fields_fixed[!st_is_valid(fields_fixed), ]))) -#' } -#' -repair_geojson_geometries <- function(sf_object) { - if (!inherits(sf_object, "sf")) { - stop("Input must be an sf (Simple Features) object") - } +# #' Repair Invalid GeoJSON Geometries +# #' +# #' Fixes common geometry issues in GeoJSON/sf objects: +# #' - Degenerate vertices (duplicate points) +# #' - Self-intersecting polygons +# #' - Invalid ring orientation +# #' - Empty or NULL geometries +# #' +# #' Uses sf::st_make_valid() with buffer trick as fallback. +# #' +# #' @param sf_object sf object (GeoDataFrame) with potentially invalid geometries +# #' @return sf object with repaired geometries +# #' +# #' @details +# #' **Why this matters:** +# #' Pivot GeoJSON files sometimes contain degenerate vertices or self-intersecting +# #' rings from manual editing or GIS data sources. These cause errors when using +# #' S2 geometry (strict validation) during cropping operations. 
+# #' +# #' **Repair strategy (priority order):** +# #' 1. Try st_make_valid() - GEOS-based repair (most reliable) +# #' 2. Fallback: st_union() + buffer(0) - Forces polygon validity +# #' 3. Last resort: Silently keep original if repair fails +# #' +# #' @examples +# #' \dontrun{ +# #' fields <- st_read("pivot.geojson") +# #' fields_fixed <- repair_geojson_geometries(fields) +# #' cat(paste("Fixed geometries: before=", +# #' nrow(fields[!st_is_valid(fields), ]), +# #' ", after=", +# #' nrow(fields_fixed[!st_is_valid(fields_fixed), ]))) +# #' } +# #' +# repair_geojson_geometries <- function(sf_object) { +# if (!inherits(sf_object, "sf")) { +# stop("Input must be an sf (Simple Features) object") +# } - # Count invalid geometries BEFORE repair - invalid_before <- sum(!sf::st_is_valid(sf_object), na.rm = TRUE) +# # Count invalid geometries BEFORE repair +# invalid_before <- sum(!sf::st_is_valid(sf_object), na.rm = TRUE) - if (invalid_before == 0) { - safe_log("All geometries already valid - no repair needed", "INFO") - return(sf_object) - } +# if (invalid_before == 0) { +# safe_log("All geometries already valid - no repair needed", "INFO") +# return(sf_object) +# } - safe_log(paste("Found", invalid_before, "invalid geometries - attempting repair"), "WARNING") +# safe_log(paste("Found", invalid_before, "invalid geometries - attempting repair"), "WARNING") - # STRATEGY: Apply st_make_valid() to entire sf object (most reliable for GEOS) - # This handles degenerate vertices, self-intersections, invalid rings while preserving all features - repaired <- tryCatch({ - # st_make_valid() on entire sf object preserves all features and attributes - repaired_geom <- sf::st_make_valid(sf_object) +# # STRATEGY: Apply st_make_valid() to entire sf object (most reliable for GEOS) +# # This handles degenerate vertices, self-intersections, invalid rings while preserving all features +# repaired <- tryCatch({ +# # st_make_valid() on entire sf object preserves all features and attributes 
+# repaired_geom <- sf::st_make_valid(sf_object) + +# # Verify we still have the same number of rows +# if (nrow(repaired_geom) != nrow(sf_object)) { +# warning("st_make_valid() changed number of features - attempting row-wise repair") - # Verify we still have the same number of rows - if (nrow(repaired_geom) != nrow(sf_object)) { - warning("st_make_valid() changed number of features - attempting row-wise repair") - - # Fallback: Repair row-by-row to maintain original structure - repaired_geom <- sf_object - for (i in seq_len(nrow(sf_object))) { - tryCatch({ - if (!sf::st_is_valid(sf_object[i, ])) { - repaired_geom[i, ] <- sf::st_make_valid(sf_object[i, ]) - } - }, error = function(e) { - safe_log(paste("Could not repair row", i, "-", e$message), "WARNING") - }) - } - } - - safe_log("✓ st_make_valid() successfully repaired geometries", "INFO") - repaired_geom - }, error = function(e) { - safe_log(paste("st_make_valid() failed:", e$message), "WARNING") - NULL - }) +# # Fallback: Repair row-by-row to maintain original structure +# repaired_geom <- sf_object +# for (i in seq_len(nrow(sf_object))) { +# tryCatch({ +# if (!sf::st_is_valid(sf_object[i, ])) { +# repaired_geom[i, ] <- sf::st_make_valid(sf_object[i, ]) +# } +# }, error = function(e) { +# safe_log(paste("Could not repair row", i, "-", e$message), "WARNING") +# }) +# } +# } + +# safe_log("✓ st_make_valid() successfully repaired geometries", "INFO") +# repaired_geom +# }, error = function(e) { +# safe_log(paste("st_make_valid() failed:", e$message), "WARNING") +# NULL +# }) - # If repair failed, keep original - if (is.null(repaired)) { - safe_log(paste("Could not repair", invalid_before, "invalid geometries - keeping original"), - "WARNING") - return(sf_object) - } +# # If repair failed, keep original +# if (is.null(repaired)) { +# safe_log(paste("Could not repair", invalid_before, "invalid geometries - keeping original"), +# "WARNING") +# return(sf_object) +# } - # Count invalid geometries AFTER repair - 
invalid_after <- sum(!sf::st_is_valid(repaired), na.rm = TRUE) - safe_log(paste("Repair complete: before =", invalid_before, ", after =", invalid_after), "INFO") +# # Count invalid geometries AFTER repair +# invalid_after <- sum(!sf::st_is_valid(repaired), na.rm = TRUE) +# safe_log(paste("Repair complete: before =", invalid_before, ", after =", invalid_after), "INFO") - return(repaired) -} +# return(repaired) +# } # ============================================================================== -# END 00_COMMON_UTILS.R +# # END 00_COMMON_UTILS.R # ============================================================================== diff --git a/r_app/10_create_per_field_tiffs.R b/r_app/10_create_per_field_tiffs.R index 50789b3..9c32cf1 100644 --- a/r_app/10_create_per_field_tiffs.R +++ b/r_app/10_create_per_field_tiffs.R @@ -51,50 +51,79 @@ # ============================================================================ # Spatial data handling +suppressPackageStartupMessages({ + library(terra) # For raster operations (reading/writing GeoTIFFs, cropping to field boundaries) library(sf) # For spatial operations (reading field boundaries GeoJSON, masking) - +library(here) # For relative path resolution +}) # ============================================================================== -# LOAD CENTRALIZED PARAMETERS & PATHS +# MAIN PROCESSING FUNCTION # ============================================================================== -source(here::here("r_app", "parameters_project.R")) -source(here::here("r_app", "00_common_utils.R")) -source(here::here("r_app", "10_create_per_field_tiffs_utils.R")) -# Get project parameter from command line -args <- commandArgs(trailingOnly = TRUE) -if (length(args) == 0) { - PROJECT <- "angata" -} else { - PROJECT <- args[1] +main <- function() { + # STEP 1: Set working directory to project root (smartcane/) + # This ensures all relative paths resolve correctly + if (basename(getwd()) == "r_app") { + setwd("..") + } + + # STEP 2: SOURCE ALL 
UTILITY SCRIPTS (before any operations) + # Load parameters_project.R (provides safe_log, setup_project_directories, etc.) + tryCatch({ + source("r_app/parameters_project.R") + }, error = function(e) { + cat(sprintf("Error loading parameters_project.R: %s\n", e$message)) + stop(e) + }) + + # Load Script 10-specific utilities + tryCatch({ + source("r_app/10_create_per_field_tiffs_utils.R") + }, error = function(e) { + cat(sprintf("Error loading 10_create_per_field_tiffs_utils.R: %s\n", e$message)) + stop(e) + }) + + # STEP 3: Parse command-line arguments + args <- commandArgs(trailingOnly = TRUE) + project_dir <- if (length(args) == 0) "angata" else args[1] + + # STEP 4: Now all utilities are loaded, proceed with script logic + # Load centralized path structure (creates all directories automatically) + paths <- setup_project_directories(project_dir) + + safe_log(paste("Project:", project_dir)) + safe_log(paste("Base path:", paths$laravel_storage_dir)) + safe_log(paste("Data dir:", paths$data_dir)) + + # Load field boundaries using data_dir (not field_boundaries_path) + # load_field_boundaries() expects a directory and builds the file path internally + fields_data <- load_field_boundaries(paths$data_dir) + fields <- fields_data$field_boundaries_sf + + # Define input and output directories (from centralized paths) + merged_tif_dir <- paths$merged_tif_folder + field_tiles_dir <- paths$field_tiles_dir + field_tiles_ci_dir <- paths$field_tiles_ci_dir + + # PHASE 1: Process new downloads (always runs) + # Pass field_tiles_ci_dir so it can skip dates already migrated + process_result <- process_new_merged_tif(merged_tif_dir, field_tiles_dir, fields, field_tiles_ci_dir) + + safe_log("\n========================================", "INFO") + safe_log("FINAL SUMMARY", "INFO") + safe_log("========================================", "INFO") + safe_log(paste("Processing: created =", process_result$total_created, + ", skipped =", process_result$total_skipped, + ", errors =", 
process_result$total_errors), "INFO") + safe_log("Script 10 complete", "INFO") + safe_log("========================================\n", "INFO") + + quit(status = 0) } -# Load centralized path structure (creates all directories automatically) -paths <- setup_project_directories(PROJECT) - -safe_log(paste("Project:", PROJECT)) -safe_log(paste("Base path:", paths$laravel_storage_dir)) -safe_log(paste("Data dir:", paths$data_dir)) - -# Load field boundaries using data_dir (not field_boundaries_path) -# load_field_boundaries() expects a directory and builds the file path internally -fields_data <- load_field_boundaries(paths$data_dir) -fields <- fields_data$field_boundaries_sf - -# Define input and output directories (from centralized paths) -merged_tif_dir <- paths$merged_tif_folder -field_tiles_dir <- paths$field_tiles_dir -field_tiles_ci_dir <- paths$field_tiles_ci_dir - -# PHASE 1: Process new downloads (always runs) -# Pass field_tiles_ci_dir so it can skip dates already migrated -process_result <- process_new_merged_tif(merged_tif_dir, field_tiles_dir, fields, field_tiles_ci_dir) - -safe_log("\n========================================", "INFO") -safe_log("FINAL SUMMARY", "INFO") -safe_log("========================================", "INFO") -safe_log(paste("Processing: created =", process_result$total_created, - ", skipped =", process_result$total_skipped, - ", errors =", process_result$total_errors), "INFO") -safe_log("Script 10 complete", "INFO") -safe_log("========================================\n", "INFO") +# Execute main if called from command line +if (sys.nframe() == 0) { + main() +} diff --git a/r_app/20_ci_extraction_per_field.R b/r_app/20_ci_extraction_per_field.R index 72144ab..88313ec 100644 --- a/r_app/20_ci_extraction_per_field.R +++ b/r_app/20_ci_extraction_per_field.R @@ -25,52 +25,52 @@ suppressPackageStartupMessages({ library(here) }) -# ============================================================================= -# Load utility functions from 
20_ci_extraction_utils.R -# ============================================================================= -source("r_app/20_ci_extraction_utils.R") - # ============================================================================= # Main Processing # ============================================================================= main <- function() { - # IMPORTANT: Set working directory to project root (smartcane/) - # This ensures here() functions resolve relative to /smartcane, not /smartcane/r_app + # STEP 1: Set working directory to project root (smartcane/) + # This ensures all relative paths resolve correctly if (basename(getwd()) == "r_app") { setwd("..") } - # Parse command-line arguments + # STEP 2: SOURCE ALL UTILITY SCRIPTS (before any operations) + # Parse command-line arguments FIRST args <- commandArgs(trailingOnly = TRUE) - project_dir <- if (length(args) >= 1 && args[1] != "") args[1] else "angata" end_date <- if (length(args) >= 2 && args[2] != "") as.Date(args[2]) else Sys.Date() offset <- if (length(args) >= 3 && !is.na(as.numeric(args[3]))) as.numeric(args[3]) else 7 - # IMPORTANT: Make project_dir available globally for parameters_project.R + # Make project_dir available globally for parameters_project.R assign("project_dir", project_dir, envir = .GlobalEnv) + # Load parameters_project.R (provides safe_log, date_list, setup_project_directories, etc.) 
+ tryCatch({ + source("r_app/parameters_project.R") + }, error = function(e) { + cat(sprintf("Error loading parameters_project.R: %s\n", e$message)) + stop(e) + }) + + # Load CI extraction utilities + tryCatch({ + source("r_app/20_ci_extraction_utils.R") + }, error = function(e) { + cat(sprintf("Error loading 20_ci_extraction_utils.R: %s\n", e$message)) + stop(e) + }) + + # STEP 3: Now all utilities are loaded, proceed with script logic safe_log(sprintf("=== Script 20: CI Extraction Per-Field ===")) safe_log(sprintf("Project: %s | End Date: %s | Offset: %d days", project_dir, format(end_date, "%Y-%m-%d"), offset)) - # 1. Load parameters (includes field boundaries setup) - # --------------------------------------------------- - tryCatch({ - source("r_app/parameters_project.R") - safe_log("Loaded parameters_project.R") - }, error = function(e) { - safe_log(sprintf("Error loading parameters: %s", e$message), "ERROR") - stop(e) - }) - - # 2. Set up directory paths from parameters FIRST (before using setup$...) - # ----------------------------------------------------------------------- + # Set up directory paths from parameters setup <- setup_project_directories(project_dir) - # 3. Load field boundaries directly from field_boundaries_path in setup - # ------------------------------------------------------------------ + # Load field boundaries directly from field_boundaries_path in setup tryCatch({ field_boundaries_sf <- st_read(setup$field_boundaries_path, quiet = TRUE) safe_log(sprintf("Loaded %d field/sub_field polygons from %s", nrow(field_boundaries_sf), setup$field_boundaries_path)) @@ -79,17 +79,16 @@ main <- function() { stop(e) }) - # 4. 
Get list of dates to process + # Get list of dates to process dates <- date_list(end_date, offset) safe_log(sprintf("Processing dates: %s to %s (%d dates)", dates$start_date, dates$end_date, length(dates$days_filter))) safe_log(sprintf("Input directory: %s", setup$field_tiles_dir)) safe_log(sprintf("Output TIF directory: %s", setup$field_tiles_ci_dir)) - safe_log(sprintf("Output RDS directory: %s", setup$daily_vals_per_field_dir)) + safe_log(sprintf("Output RDS directory: %s", setup$daily_ci_vals_dir)) - # 5. Process each field - # ---------------------- + # Process each field if (!dir.exists(setup$field_tiles_dir)) { safe_log(sprintf("Field tiles directory not found: %s", setup$field_tiles_dir), "ERROR") stop("Script 10 output not found. Run Script 10 first.") @@ -105,14 +104,21 @@ main <- function() { safe_log(sprintf("Found %d fields to process", length(fields))) + # DEBUG: Check what paths are available in setup + safe_log(sprintf("[DEBUG] Available setup paths: %s", paste(names(setup), collapse=", "))) + safe_log(sprintf("[DEBUG] field_tiles_ci_dir: %s", setup$field_tiles_ci_dir)) + safe_log(sprintf("[DEBUG] daily_ci_vals_dir: %s", setup$daily_ci_vals_dir)) + + # Use daily_ci_vals_dir for per-field daily CI output # Pre-create output subdirectories for all fields for (field in fields) { - dir.create(file.path(field_tiles_ci_dir, field), showWarnings = FALSE, recursive = TRUE) - dir.create(file.path(setup$daily_vals_per_field_dir, field), showWarnings = FALSE, recursive = TRUE) + dir.create(file.path(setup$field_tiles_ci_dir, field), showWarnings = FALSE, recursive = TRUE) + if (!is.null(setup$daily_ci_vals_dir)) { + dir.create(file.path(setup$daily_ci_vals_dir, field), showWarnings = FALSE, recursive = TRUE) + } } - # 6. 
Process each DATE (OPTIMIZED: load TIFF once, process all fields) - # ----------------------------------------------------------------------- + # Process each DATE (OPTIMIZED: load TIFF once, process all fields) total_success <- 0 total_error <- 0 ci_results_by_date <- list() @@ -124,7 +130,7 @@ main <- function() { # Find the actual TIFF path (it's in the first field that has it) input_tif_full <- NULL for (field in fields) { - candidate_path <- file.path(field_tiles_dir, field, sprintf("%s.tif", date_str)) + candidate_path <- file.path(setup$field_tiles_dir, field, sprintf("%s.tif", date_str)) if (file.exists(candidate_path)) { input_tif_full <- candidate_path break @@ -142,8 +148,8 @@ main <- function() { # Now process all fields from this single TIFF for (field in fields) { - field_ci_path <- file.path(field_tiles_ci_dir, field) - field_daily_vals_path <- file.path(setup$daily_vals_per_field_dir, field) + field_ci_path <- file.path(setup$field_tiles_ci_dir, field) + field_daily_vals_path <- file.path(setup$daily_ci_vals_dir, field) output_tif <- file.path(field_ci_path, sprintf("%s.tif", date_str)) output_rds <- file.path(field_daily_vals_path, sprintf("%s.rds", date_str)) @@ -200,8 +206,7 @@ main <- function() { }) } - # 7. 
Summary - # ---------- + # Summary safe_log(sprintf("\n=== Processing Complete ===")) safe_log(sprintf("Successfully processed: %d", total_success)) safe_log(sprintf("Errors encountered: %d", total_error)) @@ -209,7 +214,7 @@ main <- function() { if (total_success > 0) { safe_log("Output files created in:") safe_log(sprintf(" TIFFs: %s", setup$field_tiles_ci_dir)) - safe_log(sprintf(" RDS: %s", setup$daily_vals_per_field_dir)) + safe_log(sprintf(" RDS: %s", setup$daily_ci_vals_dir)) } } diff --git a/r_app/20_ci_extraction_utils.R b/r_app/20_ci_extraction_utils.R index 08f56b8..f8a88c5 100644 --- a/r_app/20_ci_extraction_utils.R +++ b/r_app/20_ci_extraction_utils.R @@ -1027,55 +1027,39 @@ calc_ci_from_raster <- function(raster_obj) { #' @return Data frame with field, sub_field, and CI statistics; NULL if field not found #' extract_ci_by_subfield <- function(ci_raster, field_boundaries_sf, field_name) { - # Filter to current field - field_poly <- field_boundaries_sf %>% - filter(field == field_name) + # NOTE: Per-field TIFFs are already cropped to field boundaries by Script 10 + # No need to mask again - just extract all valid pixels from the raster - if (nrow(field_poly) == 0) { - safe_log(sprintf("Field '%s' not found in boundaries", field_name), "WARNING") - return(NULL) + # Extract ALL CI values (no masking needed for pre-cropped per-field TIFFs) + ci_values <- terra::values(ci_raster, na.rm = TRUE) + + if (length(ci_values) > 0) { + result_row <- data.frame( + field = field_name, + sub_field = field_name, # Use field_name as sub_field since TIFF is already field-specific + ci_mean = mean(ci_values, na.rm = TRUE), + ci_median = median(ci_values, na.rm = TRUE), + ci_sd = sd(ci_values, na.rm = TRUE), + ci_min = min(ci_values, na.rm = TRUE), + ci_max = max(ci_values, na.rm = TRUE), + ci_count = length(ci_values), + stringsAsFactors = FALSE + ) + } else { + result_row <- data.frame( + field = field_name, + sub_field = field_name, + ci_mean = NA_real_, + ci_median = 
NA_real_, + ci_sd = NA_real_, + ci_min = NA_real_, + ci_max = NA_real_, + ci_count = 0, + stringsAsFactors = FALSE + ) } - # Extract CI values by sub_field - results <- list() - - # Group by sub_field within this field - for (sub_field in unique(field_poly$sub_field)) { - sub_poly <- field_poly %>% filter(sub_field == sub_field) - ci_sub <- terra::mask(ci_raster, sub_poly) - - # Get statistics - ci_values <- terra::values(ci_sub, na.rm = TRUE) - - if (length(ci_values) > 0) { - result_row <- data.frame( - field = field_name, - sub_field = sub_field, - ci_mean = mean(ci_values, na.rm = TRUE), - ci_median = median(ci_values, na.rm = TRUE), - ci_sd = sd(ci_values, na.rm = TRUE), - ci_min = min(ci_values, na.rm = TRUE), - ci_max = max(ci_values, na.rm = TRUE), - ci_count = length(ci_values), - stringsAsFactors = FALSE - ) - } else { - result_row <- data.frame( - field = field_name, - sub_field = sub_field, - ci_mean = NA_real_, - ci_median = NA_real_, - ci_sd = NA_real_, - ci_min = NA_real_, - ci_max = NA_real_, - ci_count = 0, - stringsAsFactors = FALSE - ) - } - results[[length(results) + 1]] <- result_row - } - - return(dplyr::bind_rows(results)) + return(result_row) } #' Extract RDS from existing CI TIFF (Migration/Regeneration Mode) diff --git a/r_app/21_convert_ci_rds_to_csv.R b/r_app/21_convert_ci_rds_to_csv.R index 0fe5a35..491aa7e 100644 --- a/r_app/21_convert_ci_rds_to_csv.R +++ b/r_app/21_convert_ci_rds_to_csv.R @@ -31,7 +31,7 @@ # # DEPENDENCIES: # - Packages: tidyverse, lubridate, zoo -# - Utils files: parameters_project.R, 00_common_utils.R +# - Utils files: parameters_project.R # - Input data: combined_CI_data.rds from Script 20 # - Data directories: extracted_ci/cumulative_vals/ # diff --git a/r_app/30_growth_model_utils.R b/r_app/30_growth_model_utils.R index 7de7f47..81c10a8 100644 --- a/r_app/30_growth_model_utils.R +++ b/r_app/30_growth_model_utils.R @@ -272,11 +272,19 @@ calculate_growth_metrics <- function(interpolated_data) { #' @return Path to 
the saved file #' save_growth_model <- function(data, output_dir, file_name = "All_pivots_Cumulative_CI_quadrant_year_v2.rds") { + # Validate input + if (is.null(output_dir) || !is.character(output_dir) || length(output_dir) == 0) { + stop("output_dir must be a non-empty character string") + } + + # Normalize path separators for Windows compatibility + output_dir <- normalizePath(output_dir, winslash = "/", mustWork = FALSE) + # Create output directory if it doesn't exist dir.create(output_dir, recursive = TRUE, showWarnings = FALSE) - # Create full file path - file_path <- here::here(output_dir, file_name) + # Create full file path using file.path (more robust than here::here for absolute paths) + file_path <- file.path(output_dir, file_name) # Save the data saveRDS(data, file_path) diff --git a/r_app/30_interpolate_growth_model.R b/r_app/30_interpolate_growth_model.R index a6617bd..42afa35 100644 --- a/r_app/30_interpolate_growth_model.R +++ b/r_app/30_interpolate_growth_model.R @@ -59,64 +59,74 @@ suppressPackageStartupMessages({ # Data manipulation library(tidyverse) # For dplyr (data wrangling, grouping, mutating) library(lubridate) # For date/time operations (date arithmetic, ISO week extraction) + library(readxl) # For reading harvest.xlsx (harvest dates for growth model phases) }) # ============================================================================= -# Load configuration and utility functions -# ============================================================================= -source(here::here("r_app", "parameters_project.R")) -source(here::here("r_app", "00_common_utils.R")) -source(here::here("r_app", "30_growth_model_utils.R")) - -# ============================================================================= -# Main Processing +# MAIN PROCESSING FUNCTION # ============================================================================= main <- function() { - # IMPORTANT: Set working directory to project root (smartcane/) - # This ensures here() 
functions resolve relative to /smartcane, not /smartcane/r_app + # STEP 1: Set working directory to project root (smartcane/) + # This ensures all relative paths resolve correctly if (basename(getwd()) == "r_app") { setwd("..") } - # Parse command-line arguments + # STEP 2: SOURCE ALL UTILITY SCRIPTS (before any operations) + # Parse command-line arguments FIRST args <- commandArgs(trailingOnly = TRUE) - project_dir <- if (length(args) >= 1 && args[1] != "") args[1] else "angata" - # IMPORTANT: Make project_dir available globally for parameters_project.R + # Make project_dir available globally for parameters_project.R assign("project_dir", project_dir, envir = .GlobalEnv) - safe_log(sprintf("=== Script 30: Growth Model Interpolation ===")) - safe_log(sprintf("Project: %s", project_dir)) - - # 1. Load parameters (includes field boundaries setup) - # --------------------------------------------------- + # Load parameters_project.R (provides setup_project_directories, etc.) tryCatch({ source("r_app/parameters_project.R") - safe_log("Loaded parameters_project.R") }, error = function(e) { - safe_log(sprintf("Error loading parameters: %s", e$message), "ERROR") + cat(sprintf("Error loading parameters_project.R: %s\n", e$message)) stop(e) }) - # 2. 
Set up directory paths from parameters - # ----------------------------------------------- + # Load growth model utilities + tryCatch({ + source("r_app/30_growth_model_utils.R") + }, error = function(e) { + cat(sprintf("Error loading 30_growth_model_utils.R: %s\n", e$message)) + stop(e) + }) + + # STEP 3: Now all utilities are loaded, proceed with script logic + safe_log(sprintf("=== Script 30: Growth Model Interpolation ===")) + safe_log(sprintf("Project: %s", project_dir)) + + # Set up directory paths from parameters setup <- setup_project_directories(project_dir) - # For per-field architecture: read from daily_vals_per_field_dir (Script 20 per-field output) - daily_vals_dir <- setup$daily_vals_per_field_dir + # For per-field architecture: read from daily_ci_vals_dir (Script 20 per-field output) + daily_vals_dir <- setup$daily_ci_vals_dir safe_log(sprintf("Using per-field daily CI directory: %s", daily_vals_dir)) safe_log("Starting CI growth model interpolation") - # 3. Load and process the data - # ---------------------------- + # Load and process the data tryCatch({ # Load the combined CI data (created by Script 20 per-field) # Script 20 per-field outputs: daily_vals/{FIELD_NAME}/{YYYY-MM-DD}.rds CI_data <- load_combined_ci_data(daily_vals_dir) + # Load harvesting data from harvest.xlsx for growth model phase assignment + # Use the centralized load_harvesting_data() function which handles NA season_end values + # by setting them to Sys.Date() (field is still in current growing season) + data_dir <- setup$data_dir + harvesting_data <- tryCatch({ + load_harvesting_data(data_dir) + }, error = function(e) { + safe_log(paste("Error loading harvest data:", e$message), "WARNING") + NULL + }) + # Validate harvesting data if (is.null(harvesting_data) || nrow(harvesting_data) == 0) { safe_log("No harvesting data available", "ERROR") @@ -146,7 +156,7 @@ main <- function() { # Save the processed data to cumulative_vals directory save_growth_model( CI_all_with_metrics, - 
setup$cumulative_CI_vals_dir, + setup$cumulative_ci_vals_dir, "All_pivots_Cumulative_CI_quadrant_year_v2.rds" ) } else { diff --git a/r_app/40_mosaic_creation_per_field.R b/r_app/40_mosaic_creation_per_field.R index e42909f..9a16b8c 100644 --- a/r_app/40_mosaic_creation_per_field.R +++ b/r_app/40_mosaic_creation_per_field.R @@ -33,7 +33,7 @@ # # DEPENDENCIES: # - Packages: terra, sf, tidyverse, lubridate -# - Utils files: parameters_project.R, 00_common_utils.R, 40_mosaic_creation_per_field_utils.R +# - Utils files: parameters_project.R, 40_mosaic_creation_per_field_utils.R # - Input data: Daily per-field CI TIFFs from Script 20 # - Data directories: field_tiles_CI/, weekly_mosaic/ # diff --git a/r_app/80_calculate_kpis.R b/r_app/80_calculate_kpis.R index 5586782..ad74c15 100644 --- a/r_app/80_calculate_kpis.R +++ b/r_app/80_calculate_kpis.R @@ -153,8 +153,9 @@ suppressPackageStartupMessages({ }) # ============================================================================ -# LOAD UTILITY FUNCTIONS FROM SEPARATED MODULES +# LOAD CONFIGURATION - MUST BE DONE FIRST # ============================================================================ +# Parameters must be loaded early to determine client type and paths tryCatch({ source(here("r_app", "parameters_project.R")) @@ -162,14 +163,19 @@ tryCatch({ stop("Error loading parameters_project.R: ", e$message) }) -tryCatch({ - source(here("r_app", "00_common_utils.R")) -}, error = function(e) { - stop("Error loading 00_common_utils.R: ", e$message) -}) +# Get client configuration from global project setup +# NOTE: This cannot be done until parameters_project.R is sourced +# We determine client_type from the current project_dir (if running in main() context) +# For now, set a placeholder that will be overridden in main() +if (exists("project_dir", envir = .GlobalEnv)) { + temp_client_type <- get_client_type(get("project_dir", envir = .GlobalEnv)) +} else { + temp_client_type <- "cane_supply" # Safe default +} 
+temp_client_config <- get_client_kpi_config(temp_client_type) # ============================================================================ -# LOAD CLIENT-AWARE UTILITIES +# LOAD UTILITY FUNCTIONS FROM SEPARATED MODULES # ============================================================================ # All clients use the common utilities (shared statistical functions, reporting) tryCatch({ @@ -178,25 +184,20 @@ tryCatch({ stop("Error loading 80_utils_common.R: ", e$message) }) -# Client-specific utilities based on client_config$script_90_compatible -# script_90_compatible = TRUE -> AURA workflow (6 KPIs) -# script_90_compatible = FALSE -> CANE_SUPPLY workflow (weekly stats + basic reporting) +# Load both client-specific utilities (functions will be available for both workflows) +# This avoids needing to determine client type at startup time +message("Loading client-specific utilities (80_utils_agronomic_support.R and 80_utils_cane_supply.R)...") +tryCatch({ + source(here("r_app", "80_utils_agronomic_support.R")) +}, error = function(e) { + stop("Error loading 80_utils_agronomic_support.R: ", e$message) +}) -if (client_config$script_90_compatible) { - message("Loading AURA client utilities (80_utils_agronomic_support.R)...") - tryCatch({ - source(here("r_app", "80_utils_agronomic_support.R")) - }, error = function(e) { - stop("Error loading 80_utils_agronomic_support.R: ", e$message) - }) -} else { - message("Loading CANE_SUPPLY client utilities (80_utils_cane_supply.R)...") - tryCatch({ - source(here("r_app", "80_utils_cane_supply.R")) - }, error = function(e) { - stop("Error loading 80_utils_cane_supply.R: ", e$message) - }) -} +tryCatch({ + source(here("r_app", "80_utils_cane_supply.R")) +}, error = function(e) { + stop("Error loading 80_utils_cane_supply.R: ", e$message) +}) # ============================================================================ # PHASE AND STATUS TRIGGER DEFINITIONS @@ -311,6 +312,9 @@ main <- function() { client_type <- 
get_client_type(project_dir) client_config <- get_client_kpi_config(client_type) + # Assign to global environment so utilities and downstream scripts can access it + assign("client_config", client_config, envir = .GlobalEnv) + message("Client Type:", client_type) message("KPI Calculations:", paste(client_config$kpi_calculations, collapse = ", ")) message("Output Formats:", paste(client_config$outputs, collapse = ", ")) @@ -335,38 +339,49 @@ main <- function() { message("AURA WORKFLOW: CALCULATING 6 FARM-LEVEL KPIs (Script 90 compatible)") message(strrep("=", 70)) - # Load 80_kpi_utils.R with all 6 KPI functions - # (Note: 80_kpi_utils.R includes all necessary helper functions from crop_messaging_utils.R) - tryCatch({ - source(here("r_app", "80_kpi_utils.R")) - }, error = function(e) { - stop("Error loading 80_kpi_utils.R: ", e$message) - }) - # Prepare inputs for KPI calculation (already created by setup_project_directories) reports_dir_kpi <- setup$kpi_reports_dir - cumulative_CI_vals_dir <- setup$cumulative_CI_vals_dir - # Load field boundaries and harvesting data (already loaded by parameters_project.R) - if (!exists("field_boundaries_sf")) { - stop("field_boundaries_sf not loaded. 
Check parameters_project.R initialization.") - } + # Load field boundaries for AURA workflow (use data_dir from setup) + message("\nLoading field boundaries for AURA KPI calculation...") + tryCatch({ + boundaries_result <- load_field_boundaries(setup$data_dir) + + if (is.list(boundaries_result) && "field_boundaries_sf" %in% names(boundaries_result)) { + field_boundaries_sf <- boundaries_result$field_boundaries_sf + } else { + field_boundaries_sf <- boundaries_result + } + + if (nrow(field_boundaries_sf) == 0) { + stop("No fields loaded from boundaries") + } + + message(paste(" ✓ Loaded", nrow(field_boundaries_sf), "fields")) + }, error = function(e) { + stop("ERROR loading field boundaries: ", e$message) + }) + + # Load harvesting data if (!exists("harvesting_data")) { warning("harvesting_data not loaded. TCH KPI will use placeholder values.") harvesting_data <- data.frame(field = character(), year = numeric(), tonnage_ha = numeric()) } - # Calculate all 6 KPIs + # Extract current week/year from end_date + current_week <- as.numeric(format(end_date, "%V")) + current_year <- as.numeric(format(end_date, "%G")) + + # Call with correct signature kpi_results <- calculate_all_kpis( - report_date = end_date, - output_dir = reports_dir_kpi, field_boundaries_sf = field_boundaries_sf, + current_week = current_week, + current_year = current_year, + current_mosaic_dir = setup$weekly_mosaic_dir, harvesting_data = harvesting_data, - cumulative_CI_vals_dir = cumulative_CI_vals_dir, - weekly_CI_mosaic = weekly_mosaic, - reports_dir = reports_dir_kpi, - project_dir = project_dir + ci_rds_path = cumulative_CI_vals_dir, + output_dir = reports_dir_kpi ) cat("\n=== AURA KPI CALCULATION COMPLETE ===\n") diff --git a/r_app/80_utils_agronomic_support.R b/r_app/80_utils_agronomic_support.R index b60ed89..c1d710a 100644 --- a/r_app/80_utils_agronomic_support.R +++ b/r_app/80_utils_agronomic_support.R @@ -109,7 +109,12 @@ calculate_field_uniformity_kpi <- function(ci_pixels_by_field, 
field_boundaries_ morans_i <- NA_real_ if (!is.null(ci_band)) { - morans_i <- calculate_spatial_autocorrelation(ci_pixels, field_boundaries_sf[field_idx, ]) + morans_result <- calculate_spatial_autocorrelation(ci_pixels, field_boundaries_sf[field_idx, ]) + if (is.list(morans_result)) { + morans_i <- morans_result$morans_i + } else { + morans_i <- morans_result + } } # Normalize CV (0-1 scale, invert so lower CV = higher score) @@ -332,7 +337,9 @@ calculate_weed_presence_kpi <- function(ci_pixels_by_field) { result$low_ci_percent[field_idx] <- round(low_ci_pct, 2) result$fragmentation_index[field_idx] <- round(fragmentation, 3) - if (fragmentation > 0.15) { + if (is.na(fragmentation)) { + result$weed_pressure_risk[field_idx] <- "No data" + } else if (fragmentation > 0.15) { result$weed_pressure_risk[field_idx] <- "High" } else if (fragmentation > 0.08) { result$weed_pressure_risk[field_idx] <- "Medium" @@ -354,6 +361,20 @@ calculate_weed_presence_kpi <- function(ci_pixels_by_field) { #' #' @return Data frame with gap-filling quality metrics calculate_gap_filling_kpi <- function(ci_rds_path) { + # If ci_rds_path is NULL or not a valid path, return placeholder + if (is.null(ci_rds_path) || !is.character(ci_rds_path) || length(ci_rds_path) == 0) { + return(NULL) + } + + # If ci_rds_path is a directory, find the cumulative CI file + if (dir.exists(ci_rds_path)) { + ci_files <- list.files(ci_rds_path, pattern = "^All_pivots.*\\.rds$", full.names = TRUE) + if (length(ci_files) == 0) { + return(NULL) + } + ci_rds_path <- ci_files[1] + } + if (!file.exists(ci_rds_path)) { return(NULL) } @@ -425,8 +446,12 @@ create_summary_tables <- function(all_kpis) { weed_pressure = all_kpis$weed_presence %>% select(field_idx, fragmentation_index, weed_pressure_risk), - gap_filling = all_kpis$gap_filling %>% - select(field_idx, na_percent_pre_interpolation, gap_filling_success) + gap_filling = if (!is.null(all_kpis$gap_filling)) { + all_kpis$gap_filling %>% + select(field_idx, 
na_percent_pre_interpolation, gap_filling_success) + } else { + NULL + } ) return(kpi_summary) @@ -494,13 +519,13 @@ create_field_kpi_text <- function(all_kpis) { #' #' @return List of output file paths export_kpi_data <- function(all_kpis, kpi_summary, output_dir, week, year) { - kpi_subdir <- file.path(output_dir, "kpis") - if (!dir.exists(kpi_subdir)) { - dir.create(kpi_subdir, recursive = TRUE) + # Ensure output directory exists + if (!dir.exists(output_dir)) { + dir.create(output_dir, recursive = TRUE) } # Export all KPI tables to a single Excel file - excel_file <- paste0(kpi_subdir, "/AURA_KPI_week_", sprintf("%02d_%d", week, year), ".xlsx") + excel_file <- paste0(output_dir, "/AURA_KPI_week_", sprintf("%02d_%d", week, year), ".xlsx") sheets <- list( "Uniformity" = as.data.frame(kpi_summary$uniformity), @@ -515,7 +540,7 @@ export_kpi_data <- function(all_kpis, kpi_summary, output_dir, week, year) { message(paste("✓ AURA KPI data exported to:", excel_file)) # Also export to RDS for programmatic access - rds_file <- paste0(kpi_subdir, "/AURA_KPI_week_", sprintf("%02d_%d", week, year), ".rds") + rds_file <- paste0(output_dir, "/AURA_KPI_week_", sprintf("%02d_%d", week, year), ".rds") saveRDS(all_kpis, rds_file) message(paste("✓ AURA KPI RDS exported to:", rds_file)) @@ -558,14 +583,14 @@ calculate_all_kpis <- function( previous_mosaic_dir = NULL, ci_rds_path = NULL, harvesting_data = NULL, - output_dir = file.path(PROJECT_DIR, "output") + output_dir = NULL ) { message("\n============ AURA KPI CALCULATION (6 KPIs) ============") # Load current week mosaic message("Loading current week mosaic...") - current_mosaic <- load_weekly_ci_mosaic(current_mosaic_dir, current_week, current_year) + current_mosaic <- load_weekly_ci_mosaic(current_week, current_year, current_mosaic_dir) if (is.null(current_mosaic)) { stop("Could not load current week mosaic") @@ -581,7 +606,7 @@ calculate_all_kpis <- function( if (!is.null(previous_mosaic_dir)) { target_prev <- 
calculate_target_week_and_year(current_week, current_year, offset_weeks = 1) message(paste("Loading previous week mosaic (week", target_prev$week, target_prev$year, ")...")) - previous_mosaic <- load_weekly_ci_mosaic(previous_mosaic_dir, target_prev$week, target_prev$year) + previous_mosaic <- load_weekly_ci_mosaic(target_prev$week, target_prev$year, previous_mosaic_dir) if (!is.null(previous_mosaic)) { previous_stats <- extract_field_statistics_from_ci(previous_mosaic, field_boundaries_sf) diff --git a/r_app/kpi_utils.R b/r_app/old_scripts/kpi_utils.R similarity index 100% rename from r_app/kpi_utils.R rename to r_app/old_scripts/kpi_utils.R diff --git a/r_app/parameters_project.R b/r_app/parameters_project.R index 9caa6ec..d9839f2 100644 --- a/r_app/parameters_project.R +++ b/r_app/parameters_project.R @@ -1,31 +1,43 @@ -# filepath: c:\Users\timon\Resilience BV\4020 SCane ESA DEMO - Documenten\General\4020 SCDEMO Team\4020 TechnicalData\WP3\smartcane\r_app\parameters_project.R +# ============================================================================== +# PARAMETERS_PROJECT_2.R (CLEANED VERSION) +# ============================================================================== +# PURPOSE: +# Project configuration, directory structure setup, and helper functions +# for centralized path management and project initialization. # -# PARAMETERS_PROJECT.R -# ==================== -# This script defines project parameters, directory structures, and loads field boundaries. -# It establishes all the necessary paths and creates required directories for the SmartCane project. 
+# SECTION 1: Libraries & Dependencies
+# SECTION 2: Client Type Mapping & Configuration
+# SECTION 3: Directory Structure Setup
+# SECTION 4: Date/Week Utility Functions (centralized)
+# SECTION 5: Field Boundary & Harvest Data Loaders
+# SECTION 6: Project Initialization & Logging
+# SECTION 7: Mosaic & KPI Verification Helpers
+#
+# NOTE: Shared utilities (safe_log, smartcane_debug, smartcane_warn,
+# load_field_boundaries, date_list, repair_geojson_geometries) are still
+# defined below as lightweight fallbacks; source 00_common_utils.R AFTER
+# parameters_project.R so its full implementations override these stubs.
+# ==============================================================================
 
-# 1. Load required libraries
-# -------------------------
+# ==============================================================================
+# SECTION 1: LIBRARIES & DEPENDENCIES
+# ==============================================================================
 suppressPackageStartupMessages({
   library(here)
   library(readxl)
   library(sf)
   library(dplyr)
   library(tidyr)
+  library(lubridate)
   library(jsonlite)  # For reading tiling_config.json
 })
 
-# 2. 
Client type mapping (for conditional script execution) -# --------------------------------------------------------- +# ============================================================================== +# SECTION 2: CLIENT TYPE MAPPING & CONFIGURATION +# ============================================================================== # Maps project names to client types for pipeline control -# Client types: -# - "cane_supply": Runs Scripts 20,21,30,31,80,91 (full pipeline with Excel output) -# - "agronomic_support": Runs Scripts 80,90 only (KPI calculation + Word report) -# - "extension_service": (Future - not yet implemented) -# -# NOTE: This will eventually migrate to Laravel environment variables/database -# For now, maintain this mapping and update as projects are added +# This determines which scripts run and what outputs they produce + CLIENT_TYPE_MAP <- list( "angata" = "cane_supply", "aura" = "agronomic_support", @@ -34,6 +46,9 @@ CLIENT_TYPE_MAP <- list( "esa" = "cane_supply" ) +#' Get client type for a project +#' @param project_name Character project name +#' @return Character client type ("cane_supply" or "agronomic_support") get_client_type <- function(project_name) { client_type <- CLIENT_TYPE_MAP[[project_name]] if (is.null(client_type)) { @@ -43,21 +58,11 @@ get_client_type <- function(project_name) { return(client_type) } -# 2b. 
Client-specific KPI configurations -# ---------------------------------------- +# Client-specific KPI configurations # Defines which KPIs and outputs are required for each client type -# This enables Script 80 to conditionally calculate only relevant metrics -# -# Structure: -# - kpi_calculations: Vector of KPI types to calculate for this client -# - outputs: Vector of output formats to generate (determines RDS/Excel naming) -# - requires_harvest_data: Boolean - whether Script 31 harvest predictions are needed -# - script_90_compatible: Boolean - whether output should match Script 90 expectations -# - script_91_compatible: Boolean - whether output should match Script 91 expectations -# CLIENT_TYPE_CONFIGS <- list( - # Aura (agronomic_support): Farm-level KPI summaries for weekly reports to agronomists + # Aura (agronomic_support): Farm-level KPI summaries for agronomists "agronomic_support" = list( client_type = "agronomic_support", description = "Farm-level KPI summaries for agronomic decision support", @@ -69,269 +74,113 @@ CLIENT_TYPE_CONFIGS <- list( "weed_presence", "gap_filling" ), - outputs = c( - "kpi_summary_tables", # Summary statistics for Script 90 report front page - "field_details" # Detailed field table for Script 90 report end section - ), - requires_harvest_data = FALSE, # Script 31 predictions not used - script_90_compatible = TRUE, # Output format matches Script 90 expectations + outputs = c("kpi_summary_tables", "field_details"), + requires_harvest_data = FALSE, + script_90_compatible = TRUE, script_91_compatible = FALSE ), - # Cane Supply (cane_supply): Per-field analysis with harvest timing prediction + # Cane Supply (cane_supply): Per-field analysis with harvest prediction "cane_supply" = list( client_type = "cane_supply", description = "Per-field analysis with harvest prediction and phase assignment", kpi_calculations = c( - "per_field_analysis", # Use 80_weekly_stats_utils.R for field-level statistics - "phase_assignment", # Assign growth 
phases (Germination, Tillering, Grand Growth, Maturation) - "harvest_prediction", # Include Script 31 harvest age predictions if available - "status_triggers" # Calculate field status (Normal, Monitor, Alert, Urgent) + "per_field_analysis", + "phase_assignment", + "harvest_prediction", + "status_triggers" ), - outputs = c( - "field_analysis_excel", # Excel file with per-field metrics - "field_analysis_summary" # Summary RDS for Script 91 report - ), - requires_harvest_data = TRUE, # harvest.xlsx is required for phase assignment + outputs = c("field_analysis_excel", "field_analysis_summary"), + requires_harvest_data = TRUE, script_90_compatible = FALSE, script_91_compatible = TRUE ) ) #' Get KPI configuration for a specific client type -#' @param client_type Character string of client type (e.g., "agronomic_support", "cane_supply") +#' @param client_type Character (e.g., "agronomic_support", "cane_supply") #' @return List containing configuration for that client type get_client_kpi_config <- function(client_type) { config <- CLIENT_TYPE_CONFIGS[[client_type]] - if (is.null(config)) { - warning(sprintf("Client type '%s' not in CLIENT_TYPE_CONFIGS - defaulting to 'cane_supply'", client_type)) + warning(sprintf("Client type '%s' not found - using cane_supply defaults", client_type)) return(CLIENT_TYPE_CONFIGS[["cane_supply"]]) } - return(config) } -# 3. 
Smart detection for tile-based vs single-file mosaic approach -# ---------------------------------------------------------------- -detect_tile_structure_from_merged_final <- function(merged_final_tif_dir, daily_tiles_split_dir = NULL) { - # PRIORITY 1: Check for tiling_config.json metadata file from script 10 - # This is the most reliable source since script 10 explicitly records its decision - - if (!is.null(daily_tiles_split_dir) && dir.exists(daily_tiles_split_dir)) { - # Try to find tiling_config.json in any grid-size subfolder - config_files <- list.files(daily_tiles_split_dir, - pattern = "tiling_config\\.json$", - recursive = TRUE, - full.names = TRUE) - - if (length(config_files) > 0) { - # Found a config file - use the most recent one - config_file <- config_files[which.max(file.info(config_files)$mtime)] - - tryCatch({ - config_json <- jsonlite::read_json(config_file) - return(list( - has_tiles = config_json$has_tiles %||% TRUE, - detected_tiles = character(), - total_files = 0, - source = "tiling_config.json", - grid_size = config_json$grid_size %||% "unknown" - )) - }, error = function(e) { - warning("Error reading tiling_config.json: ", e$message) - # Fall through to file-based detection - }) - } - } - - # PRIORITY 2: File-based detection (fallback if metadata not found) - # Check if merged_final_tif/ contains tile-named files OR grid-size subdirectories - - if (!dir.exists(merged_final_tif_dir)) { - return(list( - has_tiles = FALSE, - detected_tiles = character(), - total_files = 0, - source = "directory_not_found" - )) - } - - # First check if there are grid-size subdirectories (5x5, 10x10, etc.) - # This indicates the tiles are organized: merged_final_tif/{grid_size}/{DATE}/{DATE}_XX.tif - grid_subfolders <- list.dirs(merged_final_tif_dir, full.names = FALSE, recursive = FALSE) - grid_patterns <- grep("^\\d+x\\d+$", grid_subfolders, value = TRUE) - - if (length(grid_patterns) > 0) { - # Found grid-size subdirectories - tiles exist! 
- grid_size <- grid_patterns[1] - grid_dir <- file.path(merged_final_tif_dir, grid_size) - - # List sample tile files from the grid directory - sample_tiles <- list.files(grid_dir, pattern = "\\.tif$", recursive = TRUE)[1:3] - - return(list( - has_tiles = TRUE, - detected_tiles = sample_tiles, - total_files = length(sample_tiles), - source = "grid_subdirectory_detection", - grid_size = grid_size, - grid_path = grid_dir - )) - } - - # Fall back to checking for tile-named files directly in merged_final_tif - # List all .tif files in merged_final_tif - tif_files <- list.files(merged_final_tif_dir, pattern = "\\.tif$", full.names = FALSE) - - if (length(tif_files) == 0) { - return(list( - has_tiles = FALSE, - detected_tiles = character(), - total_files = 0, - source = "no_files_found" - )) - } - - # Check if ANY file matches tile naming pattern: *_XX.tif (where XX is 2 digits) - # Tile pattern examples: 2025-11-27_00.tif, 2025-11-27_01.tif, week_50_2024_00.tif - tile_pattern <- "_(\\d{2})\\.tif$" - tile_files <- tif_files[grepl(tile_pattern, tif_files)] - - has_tiles <- length(tile_files) > 0 - - return(list( - has_tiles = has_tiles, - detected_tiles = tile_files, - total_files = length(tif_files), - source = "file_pattern_detection" - )) -} +# ============================================================================== +# SECTION 3: DIRECTORY STRUCTURE SETUP +# ============================================================================== +# CENTRALIZED PATH MANAGEMENT: All file paths in the entire pipeline +# are derived from setup_project_directories(). +# This is the single source of truth for 8 tiers of directories. -# 4. 
Define project directory structure -# ----------------------------------- -# ============================================================================== -# CENTRALIZED PATH MANAGEMENT - setup_project_directories() -# ============================================================================== -# This function is the single source of truth for ALL file paths used across the pipeline. -# All scripts should call this function once at startup and use returned paths. -# This eliminates ~88 hardcoded file.path() calls scattered across 8 scripts. -# -# USAGE: -# paths <- setup_project_directories(project_dir) -# merged_tif_dir <- paths$merged_tif_folder -# daily_ci_dir <- paths$daily_ci_vals_dir -# kpi_output_dir <- paths$kpi_reports_dir -# -# TIERS (8-layer directory structure): -# Tier 1: Raw data (merged_tif) -# Tier 2: Per-field TIFFs (field_tiles, field_tiles_CI) -# Tier 3: CI Extraction (daily_ci_vals, cumulative_ci_vals) -# Tier 4: Growth Model (growth_model_interpolated) -# Tier 5: Mosaics (weekly_mosaic, weekly_tile_max) -# Tier 6: KPI & Reporting (kpi_reports_dir, kpi_field_stats_dir) -# Tier 7: Support (data, vrt, harvest, logs) -# Tier 8: Config & Metadata (field_boundaries_path, tiling_config_path) -# -# BENEFITS: -# ✓ Single source of truth (eliminates ~88 hardcoded file.path() calls) -# ✓ Auto-creates all directories (no scattered dir.create() calls) -# ✓ Easy to update storage structure globally -# ✓ Consistent naming across all 8 scripts -# ============================================================================== +#' Setup complete project directory structure +#' +#' Creates all 8 tiers of directories and returns a comprehensive list +#' of paths for use throughout the pipeline. +#' +#' @param project_dir Character. Project name (e.g., "angata", "aura") +#' @param data_source Character. 
"merged_tif" (default) or "merged_tif_8b" +#' @return List containing all 8 tiers of paths +#' +#' @details +#' TIER 1: Raw data (merged_tif) - Python download output +#' TIER 2: Per-field TIFFs - Script 10 output +#' TIER 3: CI extraction - Script 20 output +#' TIER 4: Growth model - Script 30 output +#' TIER 5: Mosaics - Script 40 output +#' TIER 6: KPI & reports - Scripts 80/90/91 output +#' TIER 7: Support files - GeoJSON, Excel, logs +#' TIER 8: Metadata - Config, CRS info setup_project_directories <- function(project_dir, data_source = "merged_tif") { - # =========================================================================== - # BASE DIRECTORIES (Foundation for all paths) - # =========================================================================== + # BASE DIRECTORIES laravel_storage_dir <- here("laravel_app", "storage", "app", project_dir) - # =========================================================================== - # TIER 1: RAW DATA & INPUT PATHS (Script 00 - Python download output) - # =========================================================================== - merged_tif_folder <- here(laravel_storage_dir, "merged_tif") # 4-band raw GeoTIFFs from Planet + # TIER 1: RAW DATA (Script 00 output - Python download) + merged_tif_folder <- here(laravel_storage_dir, "merged_tif") - # =========================================================================== - # TIER 2: TILING PATHS (Script 10 - Per-field tiff creation) - # =========================================================================== - # Per-field TIFF structure: field_tiles/{FIELD_NAME}/{YYYY-MM-DD}.tif + # TIER 2: PER-FIELD TIFFS (Script 10 output) field_tiles_dir <- here(laravel_storage_dir, "field_tiles") - - # Per-field CI TIFFs (pre-computed, used by Script 40): field_tiles_CI/{FIELD_NAME}/{YYYY-MM-DD}.tif field_tiles_ci_dir <- here(laravel_storage_dir, "field_tiles_CI") - - # Legacy tiling (for backward compatibility): 
daily_tiles_split/{grid_size}/{YYYY-MM-DD}/{YYYY-MM-DD}_XX.tif daily_tiles_split_dir <- here(laravel_storage_dir, "daily_tiles_split") - # =========================================================================== - # TIER 3: CI EXTRACTION PATHS (Script 20 - Canopy Index calculation) - # =========================================================================== - extracted_ci_base_dir <- here(laravel_storage_dir, "Data", "extracted_ci") + # SUPPORT TIER: DATA DIRECTORY (define early for use in later tiers) + data_dir <- here(laravel_storage_dir, "Data") - # Daily CI values (cumulative RDS): combined_CI_data.rds - daily_ci_vals_dir <- here(extracted_ci_base_dir, "daily_vals") - - # Cumulative CI across time: All_pivots_Cumulative_CI_quadrant_year_v2.rds + # TIER 3: CI EXTRACTION (Script 20 output) + # Structure: Data/extracted_ci/daily_vals/{FIELD}/{DATE}.rds (per-field daily CI values) + extracted_ci_base_dir <- here(data_dir, "extracted_ci") + daily_ci_vals_dir <- here(extracted_ci_base_dir, "daily_vals") # Per-field structure cumulative_ci_vals_dir <- here(extracted_ci_base_dir, "cumulative_vals") - - # Per-field CI data for Python harvest prediction (Script 21): ci_data_for_python.csv ci_for_python_dir <- here(extracted_ci_base_dir, "ci_data_for_python") - # =========================================================================== - # TIER 4: GROWTH MODEL PATHS (Script 30 - Interpolation & smoothing) - # =========================================================================== - growth_model_interpolated_dir <- here(laravel_storage_dir, "growth_model_interpolated") + # TIER 4: GROWTH MODEL (Script 30 output) + growth_model_interpolated_dir <- here(data_dir, "growth_model_interpolated") - # =========================================================================== - # TIER 5: MOSAIC PATHS (Script 40 - Weekly mosaics) - # =========================================================================== - # Per-field weekly mosaics (per-field architecture): 
weekly_mosaic/{FIELD}/{week_XX_YYYY}.tif + # TIER 5: MOSAICS (Script 40 output) weekly_mosaic_dir <- here(laravel_storage_dir, "weekly_mosaic") - - # Tile-based weekly max (legacy): weekly_tile_max/{grid_size}/week_XX_YYYY.tif weekly_tile_max_dir <- here(laravel_storage_dir, "weekly_tile_max") - # =========================================================================== - # TIER 6: KPI & REPORTING PATHS (Scripts 80, 90, 91) - # =========================================================================== + # TIER 6: KPI & REPORTING (Scripts 80/90/91 output) reports_dir <- here(laravel_storage_dir, "reports") - kpi_reports_dir <- here(reports_dir, "kpis") # Where Script 80 outputs KPI CSV/RDS files - kpi_field_stats_dir <- here(kpi_reports_dir, "field_stats") # Per-field KPI details - kpi_field_analysis_dir <- here(kpi_reports_dir, "field_analysis") # Field-level analysis for Script 91 + kpi_reports_dir <- here(reports_dir, "kpis", "field_level") + kpi_field_stats_dir <- here(reports_dir, "kpis", "field_stats") + kpi_field_analysis_dir <- here(reports_dir, "kpis", "field_analysis") - # =========================================================================== - # TIER 7: SUPPORT PATHS (Data, VRT, Harvest) - # =========================================================================== - data_dir <- here(laravel_storage_dir, "Data") + # TIER 7: SUPPORT (various scripts) vrt_dir <- here(data_dir, "vrt") # Virtual Raster files created during CI extraction - harvest_dir <- here(data_dir, "HarvestData") # Harvest schedule data - log_dir <- here(laravel_storage_dir, "logs") # Log files + harvest_dir <- here(data_dir, "harvest") # Harvest data directory + log_dir <- here(laravel_storage_dir, "logs") - # =========================================================================== - # TIER 8: CONFIG & METADATA PATHS - # =========================================================================== - # Field boundaries GeoJSON (same across all scripts) - field_boundaries_path <- 
here(data_dir, "pivot.geojson") - - # Tiling configuration metadata from Script 10 - tiling_config_path <- here(daily_tiles_split_dir, "tiling_config.json") - - # =========================================================================== - # CREATE ALL DIRECTORIES (once per pipeline run) - # =========================================================================== + # Create all directories all_dirs <- c( - # Tier 1 - merged_tif_folder, - # Tier 2 - field_tiles_dir, field_tiles_ci_dir, daily_tiles_split_dir, - # Tier 3 + merged_tif_folder, field_tiles_dir, field_tiles_ci_dir, daily_tiles_split_dir, extracted_ci_base_dir, daily_ci_vals_dir, cumulative_ci_vals_dir, ci_for_python_dir, - # Tier 4 growth_model_interpolated_dir, - # Tier 5 weekly_mosaic_dir, weekly_tile_max_dir, - # Tier 6 reports_dir, kpi_reports_dir, kpi_field_stats_dir, kpi_field_analysis_dir, - # Tier 7 data_dir, vrt_dir, harvest_dir, log_dir ) @@ -339,12 +188,11 @@ setup_project_directories <- function(project_dir, data_source = "merged_tif") { dir.create(dir_path, showWarnings = FALSE, recursive = TRUE) } - # =========================================================================== - # RETURN COMPREHENSIVE PATH LIST - # Scripts should source parameters_project.R and receive paths object like: - # paths <- setup_project_directories(project_dir) - # Then use: paths$merged_tif_folder, paths$daily_ci_vals_dir, etc. 
- # =========================================================================== + # TIER 8: CONFIG & METADATA PATHS + field_boundaries_path <- here(data_dir, "pivot.geojson") + tiling_config_path <- here(laravel_storage_dir, "tiling_config.json") + + # Return comprehensive list return(list( # PROJECT ROOT laravel_storage_dir = laravel_storage_dir, @@ -357,20 +205,20 @@ setup_project_directories <- function(project_dir, data_source = "merged_tif") { field_tiles_ci_dir = field_tiles_ci_dir, daily_tiles_split_dir = daily_tiles_split_dir, - # TIER 3: CI Extraction + # TIER 3: CI extraction extracted_ci_base_dir = extracted_ci_base_dir, daily_ci_vals_dir = daily_ci_vals_dir, cumulative_ci_vals_dir = cumulative_ci_vals_dir, ci_for_python_dir = ci_for_python_dir, - # TIER 4: Growth Model + # TIER 4: Growth model growth_model_interpolated_dir = growth_model_interpolated_dir, # TIER 5: Mosaics weekly_mosaic_dir = weekly_mosaic_dir, weekly_tile_max_dir = weekly_tile_max_dir, - # TIER 6: KPI & Reporting + # TIER 6: KPI & reporting reports_dir = reports_dir, kpi_reports_dir = kpi_reports_dir, kpi_field_stats_dir = kpi_field_stats_dir, @@ -382,155 +230,130 @@ setup_project_directories <- function(project_dir, data_source = "merged_tif") { harvest_dir = harvest_dir, log_dir = log_dir, - # TIER 8: Config & Metadata + # TIER 8: Metadata field_boundaries_path = field_boundaries_path, tiling_config_path = tiling_config_path )) } # ============================================================================== -# TIER-BY-TIER PATH REFERENCE (for setup_project_directories output) -# ============================================================================== -# -# TIER 1: RAW DATA (Script 00 - Python download) -# paths$merged_tif_folder -# └─ {YYYY-MM-DD}.tif (4-band uint16 GeoTIFFs from Planet API) -# -# TIER 2: PER-FIELD TIFFS (Script 10) -# paths$field_tiles_dir/{FIELD_NAME}/{YYYY-MM-DD}.tif -# paths$field_tiles_ci_dir/{FIELD_NAME}/{YYYY-MM-DD}.tif -# 
paths$daily_tiles_split_dir/{grid_size}/{YYYY-MM-DD}/{YYYY-MM-DD}_XX.tif (legacy) -# -# TIER 3: CI EXTRACTION (Script 20) -# paths$daily_ci_vals_dir/combined_CI_data.rds -# paths$cumulative_ci_vals_dir/All_pivots_Cumulative_CI_quadrant_year_v2.rds -# paths$ci_for_python_dir/ci_data_for_python.csv (Script 21 output) -# -# TIER 4: GROWTH MODEL (Script 30) -# paths$growth_model_interpolated_dir/ (RDS files with interpolated CI) -# -# TIER 5: MOSAICS (Script 40) -# paths$weekly_mosaic_dir/{FIELD_NAME}/week_XX_YYYY.tif -# paths$weekly_tile_max_dir/{grid_size}/week_XX_YYYY_00.tif (legacy) -# -# TIER 6: KPI & REPORTING (Scripts 80, 90, 91) -# paths$kpi_reports_dir/ (KPI outputs from Script 80) -# paths$kpi_field_stats_dir/ (Per-field KPI RDS) -# paths$kpi_field_analysis_dir/ (Analysis RDS for Script 91) -# paths$reports_dir/ (Word/HTML reports) -# -# TIER 7: SUPPORT (Various scripts) -# paths$data_dir/pivot.geojson (Field boundaries) -# paths$data_dir/harvest.xlsx (Harvest schedule) -# paths$vrt_dir/ (Virtual raster files) -# paths$harvest_dir/ (Harvest predictions from Python) -# paths$log_dir/ (Pipeline logs) -# -# TIER 8: CONFIG & METADATA -# paths$field_boundaries_path (Full path to pivot.geojson) -# paths$tiling_config_path (Metadata from Script 10) -# +# SECTION 4: DATE/WEEK UTILITY FUNCTIONS # ============================================================================== +# ISO 8601 week/year functions for consistent date handling across scripts -#set working dir. -# 5. 
Load field boundaries -# ---------------------- +#' Extract ISO week number from a date +#' @param date Date object or string convertible to Date +#' @return Numeric ISO week number (1-53) +get_iso_week <- function(date) { + as.numeric(format(date, "%V")) +} + +#' Extract ISO year from a date +#' @param date Date object or string convertible to Date +#' @return Numeric ISO year +get_iso_year <- function(date) { + as.numeric(format(date, "%G")) +} + +#' Extract both ISO week and year as a list +#' @param date Date object or string convertible to Date +#' @return List with elements: week (1-53), year +get_iso_week_year <- function(date) { + list( + week = as.numeric(format(date, "%V")), + year = as.numeric(format(date, "%G")) + ) +} + +#' Format date as a week/year label +#' @param date Date object or string convertible to Date +#' @param separator Character. Separator between week and year (default "_") +#' @return Character in format "week##_YYYY" (e.g., "week03_2025") +format_week_label <- function(date, separator = "_") { + wwy <- get_iso_week_year(date) + sprintf("week%02d%s%d", wwy$week, separator, wwy$year) +} + +# ============================================================================== +# SECTION 5: FIELD BOUNDARY & HARVEST DATA LOADERS +# ============================================================================== +# IMPORTANT: These functions are also defined in 00_common_utils.R +# to avoid duplication. Source 00_common_utils.R AFTER parameters_project.R +# to override these stub definitions with the full implementations. + +#' Load Field Boundaries from GeoJSON +#' +#' Loads field polygon geometries from GeoJSON file (pivot.geojson or pivot_2.geojson). +#' Handles CRS validation and column standardization. 
+#' +#' @param data_dir Directory containing GeoJSON file +#' @return List with elements: +#' - field_boundaries_sf: sf (Simple Features) object +#' - field_boundaries: terra SpatVect object (if conversion successful, else sf fallback) +#' load_field_boundaries <- function(data_dir) { - # Choose field boundaries file based on project and script type - # ESA project uses pivot_2.geojson ONLY for scripts 02-03 (CI extraction & growth model) - # All other scripts (including 04-mosaic, 09-KPIs, 10-reports) use pivot.geojson - use_pivot_2 <- exists("project_dir") && project_dir == "esa" && - exists("ci_extraction_script") # ci_extraction_script flag set by scripts 02-03 - - if (use_pivot_2) { - field_boundaries_path <- here(data_dir, "pivot_2.geojson") - } else { - field_boundaries_path <- here(data_dir, "pivot.geojson") - } - + field_boundaries_path <- file.path(data_dir, "pivot.geojson") + if (!file.exists(field_boundaries_path)) { - stop(paste("Field boundaries file not found at path:", field_boundaries_path)) + stop("Field boundaries file not found at:", field_boundaries_path) } tryCatch({ - # Read GeoJSON with explicit CRS handling - field_boundaries_sf <- st_read(field_boundaries_path, quiet = TRUE) + boundaries_sf <- sf::st_read(field_boundaries_path, quiet = TRUE) - # Remove OBJECTID column immediately if it exists - if ("OBJECTID" %in% names(field_boundaries_sf)) { - field_boundaries_sf <- field_boundaries_sf %>% select(-OBJECTID) + # Repair geometries if needed + if (!all(sf::st_is_valid(boundaries_sf))) { + boundaries_sf <- sf::st_make_valid(boundaries_sf) } - # Validate and fix CRS if needed - DO NOT call is.na on CRS objects as it can cause errors - # Just ensure CRS is set; terra will handle projection if needed - tryCatch({ - # Simply assign WGS84 if not already set (safe approach) - # This avoids any problematic is.na() calls on complex CRS objects - if (is.na(sf::st_crs(field_boundaries_sf)$epsg)) { - st_crs(field_boundaries_sf) <- 4326 - warning("CRS 
was missing, assigned WGS84 (EPSG:4326)") - } - }, error = function(e) { - # If any CRS operation fails, just try to set it - tryCatch({ - st_crs(field_boundaries_sf) <<- 4326 - }, error = function(e2) { - # Silently continue - terra might handle it - warning(paste("Could not set CRS:", e2$message)) - }) - }) - - # Handle column names - accommodate optional sub_area column - # IMPORTANT: Must preserve geometry column properly when renaming sf object - if ("sub_area" %in% names(field_boundaries_sf)) { - # Reorder columns but keep geometry last - field_boundaries_sf <- field_boundaries_sf %>% - dplyr::select(field, sub_field, sub_area) %>% - sf::st_set_geometry("geometry") - } else { - # Reorder columns but keep geometry last - field_boundaries_sf <- field_boundaries_sf %>% - dplyr::select(field, sub_field) %>% - sf::st_set_geometry("geometry") - } - - # Convert to terra vector if possible, otherwise use sf - # Some GeoJSON files (like aura with complex MultiPolygons) may have GDAL/terra compatibility issues - field_boundaries <- tryCatch({ - field_boundaries_terra <- terra::vect(field_boundaries_sf) - - # Ensure terra object has valid CRS with safer checks - crs_value <- tryCatch(terra::crs(field_boundaries_terra), error = function(e) NULL) - crs_str <- if (!is.null(crs_value)) as.character(crs_value) else "" - - if (is.null(crs_value) || length(crs_value) == 0 || nchar(crs_str) == 0) { - terra::crs(field_boundaries_terra) <- "EPSG:4326" - warning("Terra object CRS was empty, assigned WGS84 (EPSG:4326)") - } - field_boundaries_terra - - }, error = function(e) { - warning(paste("Terra conversion failed, using sf object instead:", e$message)) - # Return sf object as fallback - functions will handle both types - field_boundaries_sf - }) + # Convert to terra SpatVect + boundaries_spat <- terra::vect(boundaries_sf) return(list( - field_boundaries_sf = field_boundaries_sf, - field_boundaries = field_boundaries + field_boundaries_sf = boundaries_sf, + field_boundaries = 
boundaries_spat )) }, error = function(e) { - cat("[DEBUG] Error in load_field_boundaries:\n") - cat(" Message:", e$message, "\n") - cat(" Call:", deparse(e$call), "\n") - stop(paste("Error loading field boundaries:", e$message)) + stop("Error loading field boundaries:", e$message) }) } -# 6. Load harvesting data -# --------------------- +#' Create Date List +#' +#' Creates a sequence of dates from end_date going back offset days +#' +#' @param end_date End date (Date object) +#' @param offset Number of days to go back +#' @return Character vector of dates in YYYY-MM-DD format +date_list <- function(end_date, offset) { + start_date <- end_date - offset + date_seq <- seq(start_date, end_date, by = "day") + format(date_seq, "%Y-%m-%d") +} + +#' Repair GeoJSON Geometries +#' +#' Validates and repairs invalid geometries in sf object +#' +#' @param sf_object sf object with potentially invalid geometries +#' @return sf object with repaired geometries +repair_geojson_geometries <- function(sf_object) { + if (!all(sf::st_is_valid(sf_object))) { + sf_object <- sf::st_make_valid(sf_object) + } + sf_object +} + +#' Load harvesting data from Excel +#' +#' Loads crop harvest schedule from harvest.xlsx file. +#' Handles flexible date formats (numeric, YYYY-MM-DD, DD/MM/YYYY, etc.). +#' +#' @param data_dir Directory containing harvest.xlsx file +#' @return Data frame with columns: field, sub_field, year, season_start, +#' season_end, age (weeks), sub_area, tonnage_ha. Returns NULL if not found. 
load_harvesting_data <- function(data_dir) { - harvest_file <- here(data_dir, "harvest.xlsx") + harvest_file <- file.path(data_dir, "harvest.xlsx") if (!file.exists(harvest_file)) { warning(paste("Harvest data file not found at path:", harvest_file)) @@ -543,15 +366,11 @@ load_harvesting_data <- function(data_dir) { if (inherits(x, "Date")) return(x) if (inherits(x, "POSIXct")) return(as.Date(x)) - # If it's numeric (Excel date serial), convert directly if (is.numeric(x)) { return(as.Date(x, origin = "1899-12-30")) } - # Try character conversion with multiple formats x_char <- as.character(x) - - # Try common formats: YYYY-MM-DD, DD/MM/YYYY, MM/DD/YYYY, YYYY-MM-DD HH:MM:SS formats <- c("%Y-%m-%d", "%d/%m/%Y", "%m/%d/%Y", "%Y-%m-%d %H:%M:%S") for (fmt in formats) { @@ -559,24 +378,15 @@ load_harvesting_data <- function(data_dir) { if (!is.na(result)) return(result) } - # If all else fails, return NA return(NA) } tryCatch({ harvesting_data <- read_excel(harvest_file) %>% - dplyr::select( - c( - "field", - "sub_field", - "year", - "season_start", - "season_end", - "age", - "sub_area", - "tonnage_ha" - ) - ) %>% + dplyr::select(c( + "field", "sub_field", "year", + "season_start", "season_end", "age", "sub_area", "tonnage_ha" + )) %>% mutate( field = as.character(field), sub_field = as.character(sub_field), @@ -605,41 +415,86 @@ load_harvesting_data <- function(data_dir) { }) } -# 5. Define logging functions globally first -# --------------------------------------- -# Create a simple default log function in case setup_logging hasn't been called yet +# ============================================================================== +# SECTION 6: LOGGING SYSTEM SETUP +# ============================================================================== +# Create default logging functions + +#' Safe Logging Function +#' +#' Generic logging with [LEVEL] prefix. Works standalone without any framework. +#' Consistent with SmartCane logging standard. 
+#' +#' @param message The message to log +#' @param level The log level (default: "INFO"). Options: "INFO", "WARNING", "ERROR", "DEBUG" +#' @return NULL (invisible, used for side effects) +#' +safe_log <- function(message, level = "INFO") { + prefix <- sprintf("[%s]", level) + cat(sprintf("%s %s\n", prefix, message)) +} + +#' SmartCane Debug Logging (Conditional) +#' +#' Logs DEBUG-level messages only if verbose=TRUE or SMARTCANE_DEBUG env var is set. +#' +#' @param message The message to log +#' @param verbose Whether to output regardless of SMARTCANE_DEBUG (default: FALSE) +#' @return NULL (invisible, used for side effects) +#' +smartcane_debug <- function(message, verbose = FALSE) { + if (!verbose && Sys.getenv("SMARTCANE_DEBUG") != "TRUE") { + return(invisible(NULL)) + } + safe_log(message, level = "DEBUG") +} + +#' SmartCane Warning Logging +#' +#' Logs WARN-level messages. Convenience wrapper around safe_log(). +#' +#' @param message The message to log +#' @return NULL (invisible, used for side effects) +#' +smartcane_warn <- function(message) { + safe_log(message, level = "WARN") +} + log_message <- function(message, level = "INFO") { - timestamp <- format(Sys.time(), "%Y-%m-%d %H:%M:%S") - formatted_message <- paste0("[", level, "] ", timestamp, " - ", message) - cat(formatted_message, "\n") + prefix <- sprintf("[%s]", level) + cat(sprintf("%s %s\n", prefix, message)) } -log_head <- function(list, level = "INFO") { - log_message(paste(capture.output(str(head(list))), collapse = "\n"), level) +log_head <- function(data, level = "INFO") { + log_message(paste(capture.output(str(head(data))), collapse = "\n"), level) } -# 8. Set up full logging system with file output -# ------------------------------------------- +#' Setup full logging system with file output +#' +#' Creates log directory and returns logging functions that write to both +#' console and log file. +#' +#' @param log_dir Character. 
Directory for log files +#' @return List with log_file path, log_message function, and log_head function setup_logging <- function(log_dir) { - log_file <- here(log_dir, paste0(format(Sys.Date(), "%Y%m%d"), ".log")) + dir.create(log_dir, showWarnings = FALSE, recursive = TRUE) + + log_file <- file.path(log_dir, sprintf( + "smartcane_%s.log", + format(Sys.time(), "%Y%m%d_%H%M%S") + )) - # Create enhanced log functions log_message <- function(message, level = "INFO") { - timestamp <- format(Sys.time(), "%Y-%m-%d %H:%M:%S") - formatted_message <- paste0("[", level, "] ", timestamp, " - ", message) + prefix <- sprintf("[%s] [%s]", level, format(Sys.time(), "%Y-%m-%d %H:%M:%S")) + formatted_message <- sprintf("%s %s", prefix, message) + cat(formatted_message, "\n") cat(formatted_message, "\n", file = log_file, append = TRUE) - - # Also print to console for debugging - if (level %in% c("ERROR", "WARNING")) { - cat(formatted_message, "\n") - } } log_head <- function(list, level = "INFO") { log_message(paste(capture.output(str(head(list))), collapse = "\n"), level) } - # Update the global functions with the enhanced versions assign("log_message", log_message, envir = .GlobalEnv) assign("log_head", log_head, envir = .GlobalEnv) @@ -650,275 +505,328 @@ setup_logging <- function(log_dir) { )) } -# 8. 
HELPER FUNCTIONS FOR COMMON CALCULATIONS -# ----------------------------------------------- -# Centralized functions to reduce duplication across scripts +# ============================================================================== +# SECTION 6B: DATA SOURCE DETECTION +# ============================================================================== -# Get ISO week and year from a date -get_iso_week <- function(date) { - as.numeric(format(date, "%V")) -} - -get_iso_year <- function(date) { - as.numeric(format(date, "%G")) -} - -# Get both ISO week and year as a list -get_iso_week_year <- function(date) { - list( - week = as.numeric(format(date, "%V")), - year = as.numeric(format(date, "%G")) - ) -} - -# Format week/year into a readable label -format_week_label <- function(date, separator = "_") { - wwy <- get_iso_week_year(date) - sprintf("week%02d%s%d", wwy$week, separator, wwy$year) -} - -# Auto-detect mosaic mode -# For per-field architecture, always returns "single-file" (weekly_mosaic/{FIELD}/week_*.tif) -detect_mosaic_mode <- function(project_dir) { - # Per-field architecture uses single-file mosaics organized per-field - weekly_mosaic <- file.path("laravel_app", "storage", "app", project_dir, "weekly_mosaic") - if (dir.exists(weekly_mosaic)) { - return("single-file") # Per-field structure - } - return("unknown") -} - -# Auto-detect grid size from tile directory structure -# For per-field architecture, returns "unknown" since grid-based organization is legacy -detect_grid_size <- function(project_dir) { - # Per-field architecture doesn't use grid-based organization anymore - return("unknown") -} - -# Build storage paths consistently across all scripts -get_project_storage_path <- function(project_dir, subdir = NULL) { - base <- file.path("laravel_app", "storage", "app", project_dir) - if (!is.null(subdir)) file.path(base, subdir) else base -} - -get_mosaic_dir <- function(project_dir, mosaic_mode = "auto") { - # Per-field architecture always uses 
weekly_mosaic (single-file, per-field organization) - get_project_storage_path(project_dir, "weekly_mosaic") -} - -get_kpi_dir <- function(project_dir, client_type) { - subdir <- if (client_type == "agronomic_support") "field_level" else "field_analysis" - get_project_storage_path(project_dir, file.path("reports", "kpis", subdir)) -} - -# Logging functions moved to 00_common_utils.R -# - smartcane_log() — Main logging function with level prefix -# - smartcane_debug() — Conditional debug logging -# - smartcane_warn() — Warning wrapper -# Import with: source("r_app/00_common_utils.R") - -# ============================================================================ -# PHASE 3 & 4: OPTIMIZATION & DOCUMENTATION -# ============================================================================ - -# System Constants -# ---------------- -# Define once, use everywhere - -RSCRIPT_PATH <- "C:\\Program Files\\R\\R-4.4.3\\bin\\x64\\Rscript.exe" -# Used in run_full_pipeline.R for calling R scripts via system() - -# Data Source Documentation -# --------------------------- -# Explains the two satellite data formats and when to use each -# -# SmartCane uses PlanetScope imagery from Planet Labs API in two formats: -# -# 1. merged_tif (4-band): -# - Standard format: Red, Green, Blue, Near-Infrared -# - Size: ~150-200 MB per date -# - Use case: Agronomic support, general crop health monitoring -# - Projects: aura, xinavane -# - Cloud handling: Basic cloud masking from Planet metadata -# -# 2. 
merged_tif_8b (8-band with cloud confidence): -# - Enhanced format: 4-band imagery + 4-band UDM2 cloud mask -# - UDM2 bands: Clear, Snow, Shadow, Light Haze -# - Size: ~250-350 MB per date -# - Use case: Harvest prediction, supply chain optimization -# - Projects: angata, chemba, esa (cane_supply clients) -# - Cloud handling: Per-pixel cloud confidence from Planet UDM2 -# - Why: Cane supply chains need precise confidence to predict harvest dates -# (don't want to predict based on cloudy data) -# -# The system auto-detects which is available via detect_data_source() - -# Mosaic Mode Documentation -# -------------------------- -# SmartCane supports two ways to store and process weekly mosaics: -# -# 1. Single-file mosaic ("single-file"): -# - One GeoTIFF per week: weekly_mosaic/week_02_2026.tif -# - 5 bands per file: R, G, B, NIR, CI (Canopy Index) -# - Size: ~300-500 MB per week -# - Pros: Simpler file management, easier full-field visualization -# - Cons: Slower for field-specific queries, requires loading full raster -# - Best for: Agronomic support (aura) with <100 fields -# - Script 04 output: 5-band single-file mosaic -# -# 2. 
Tiled mosaic ("tiled"): -# - Grid of tiles per week: weekly_tile_max/5x5/week_02_2026_{TT}.tif -# - Example: 25 files (5×5 grid) × 5 bands = 125 individual tiffs -# - Size: ~15-20 MB per tile, organized in folders -# - Pros: Parallel processing, fast field lookups, scales to 1000+ fields -# - Cons: More file I/O, requires tile-to-field mapping metadata -# - Best for: Cane supply (angata, chemba) with 500+ fields -# - Script 04 output: Per-tile tiff files in weekly_tile_max/{grid}/ -# - Tile assignment: Field boundaries mapped to grid coordinates -# -# The system auto-detects which is available via detect_mosaic_mode() - -# Client Type Documentation -# -------------------------- -# SmartCane runs different analysis pipelines based on client_type: -# -# CLIENT_TYPE: cane_supply -# Purpose: Optimize sugar mill supply chain (harvest scheduling) -# Scripts run: 20 (CI), 21 (RDS to CSV), 30 (Growth), 31 (Harvest pred), 40 (Mosaic), 80 (KPI), 91 (Excel) -# Outputs: -# - Per-field analysis: field status, growth phase, harvest readiness -# - Excel reports (Script 91): Detailed metrics for logistics planning -# - KPI directory: reports/kpis/field_analysis/ (one RDS per week) -# Harvest data: Required (harvest.xlsx - planting dates for phase assignment) -# Data source: merged_tif_8b (uses cloud confidence for confidence) -# Mosaic mode: tiled (scales to 500+ fields) -# Projects: angata, chemba, xinavane, esa -# -# CLIENT_TYPE: agronomic_support -# Purpose: Provide weekly crop health insights to agronomists -# Scripts run: 80 (KPI), 90 (Word report) -# Outputs: -# - Farm-level KPI summaries (no per-field breakdown) -# - Word reports (Script 90): Charts and trends for agronomist decision support -# - KPI directory: reports/kpis/field_level/ (one RDS per week) -# Harvest data: Not used -# Data source: merged_tif (simpler, smaller) -# Mosaic mode: single-file (100-200 fields) -# Projects: aura -# - -# Detect data source (merged_tif vs merged_tif_8b) based on availability -# 
Returns the first available source; defaults to merged_tif_8b if neither exists +#' Detect data source for project +#' +#' Returns the data source directory (always "merged_tif" for consistency) +#' +#' @param project_dir Character. Project name +#' @return Character. "merged_tif" detect_data_source <- function(project_dir) { - # Data source is always merged_tif for consistency return("merged_tif") } -# Check KPI completeness for a reporting period -# Returns: List with kpis_df (data.frame), missing_count, and all_complete (boolean) -# This replaces duplicate KPI checking logic in run_full_pipeline.R (lines ~228-270, ~786-810) -check_kpi_completeness <- function(project_dir, client_type, end_date, reporting_weeks_needed) { - kpi_dir <- get_kpi_dir(project_dir, client_type) - - kpis_needed <- data.frame() - - for (weeks_back in 0:(reporting_weeks_needed - 1)) { - check_date <- end_date - (weeks_back * 7) - wwy <- get_iso_week_year(check_date) +#' Detect tile structure from merged TIF directory +#' +#' Checks if tiles exist by looking for: +#' 1. tiling_config.json metadata file (most reliable) +#' 2. Grid subdirectories (5x5, 10x10, etc.) +#' 3. Tile-named files (*_XX.tif pattern) +#' +#' @param merged_final_tif_dir Character. Path to merged TIF directory +#' @param daily_tiles_split_dir Character. 
Optional path to tiles directory +#' @return List with has_tiles (logical), detected_tiles, total_files, source, grid_size +detect_tile_structure_from_merged_final <- function(merged_final_tif_dir, daily_tiles_split_dir = NULL) { + # PRIORITY 1: Check for tiling_config.json metadata file from script 10 + if (!is.null(daily_tiles_split_dir) && dir.exists(daily_tiles_split_dir)) { + config_files <- list.files(daily_tiles_split_dir, + pattern = "tiling_config\\.json$", + recursive = TRUE, + full.names = TRUE) - # Build week pattern and check if it exists - week_pattern <- sprintf("week%02d_%d", wwy$week, wwy$year) - files_this_week <- list.files(kpi_dir, pattern = week_pattern) - has_kpis <- length(files_this_week) > 0 - - # Track missing weeks - kpis_needed <- rbind(kpis_needed, data.frame( - week = wwy$week, - year = wwy$year, - date = check_date, - has_kpis = has_kpis, - pattern = week_pattern, - file_count = length(files_this_week) - )) - - # Debug logging - smartcane_debug(sprintf( - "Week %02d/%d (%s): %s (%d files)", - wwy$week, wwy$year, format(check_date, "%Y-%m-%d"), - if (has_kpis) "✓ FOUND" else "✗ MISSING", - length(files_this_week) - )) - } - - # Summary statistics - missing_count <- sum(!kpis_needed$has_kpis) - all_complete <- missing_count == 0 - - return(list( - kpis_df = kpis_needed, - kpi_dir = kpi_dir, - missing_count = missing_count, - missing_weeks = kpis_needed[!kpis_needed$has_kpis, ], - all_complete = all_complete - )) -} - -# 9. 
Initialize the project -# ---------------------- -# Export project directories and settings -initialize_project <- function(project_dir, data_source = "merged_tif") { - # Set up directory structure, passing data_source to select TIF folder - dirs <- setup_project_directories(project_dir, data_source = data_source) - - # Set up logging - logging <- setup_logging(dirs$log_dir) - - # Load field boundaries - boundaries <- load_field_boundaries(dirs$data_dir) - - # Load harvesting data - harvesting_data <- load_harvesting_data(dirs$data_dir) - - # Return all initialized components - return(c( - dirs, - list( - logging = logging, - field_boundaries = boundaries$field_boundaries, - field_boundaries_sf = boundaries$field_boundaries_sf, - harvesting_data = harvesting_data - ) - )) -} - -# When script is sourced, initialize with the global project_dir variable if it exists -if (exists("project_dir")) { - # Now we can safely log before initialization - log_message(paste("Initializing project with directory:", project_dir)) - - # Use data_source if it exists (passed from 02_ci_extraction.R), otherwise use default - data_src <- if (exists("data_source")) data_source else "merged_tif" - log_message(paste("Using data source directory:", data_src)) - - project_config <- initialize_project(project_dir, data_source = data_src) - - # Expose all variables to the global environment - list2env(project_config, envir = .GlobalEnv) - - # Log project initialization completion with tile mode info - log_message(paste("Project initialized with directory:", project_dir)) - if (exists("use_tile_mosaic")) { - mosaic_mode <- if (use_tile_mosaic) "TILE-BASED" else "SINGLE-FILE" - log_message(paste("Mosaic mode detected:", mosaic_mode)) - if (exists("tile_detection_info") && !is.null(tile_detection_info)) { - log_message(paste(" - Detection source:", tile_detection_info$detected_source)) - log_message(paste(" - Grid size:", tile_detection_info$grid_size)) - log_message(paste(" - Detected files in 
storage:", tile_detection_info$detected_count)) - if (length(tile_detection_info$sample_tiles) > 0) { - log_message(paste(" - Sample tile files:", paste(tile_detection_info$sample_tiles, collapse = ", "))) - } + if (length(config_files) > 0) { + config_file <- config_files[which.max(file.info(config_files)$mtime)] + + tryCatch({ + config_json <- jsonlite::read_json(config_file) + return(list( + has_tiles = config_json$has_tiles %||% TRUE, + detected_tiles = character(), + total_files = 0, + source = "tiling_config.json", + grid_size = config_json$grid_size %||% "unknown" + )) + }, error = function(e) { + warning("Error reading tiling_config.json: ", e$message) + }) } } + + # PRIORITY 2: File-based detection (fallback) + if (!dir.exists(merged_final_tif_dir)) { + return(list( + has_tiles = FALSE, + detected_tiles = character(), + total_files = 0, + source = "directory_not_found" + )) + } + + # Check for grid-size subdirectories (5x5, 10x10, etc.) + grid_subfolders <- list.dirs(merged_final_tif_dir, full.names = FALSE, recursive = FALSE) + grid_patterns <- grep("^\\d+x\\d+$", grid_subfolders, value = TRUE) + + if (length(grid_patterns) > 0) { + grid_size <- grid_patterns[1] + grid_dir <- file.path(merged_final_tif_dir, grid_size) + sample_tiles <- list.files(grid_dir, pattern = "\\.tif$", recursive = TRUE)[1:3] + + return(list( + has_tiles = TRUE, + detected_tiles = sample_tiles, + total_files = length(sample_tiles), + source = "grid_subdirectory_detection", + grid_size = grid_size, + grid_path = grid_dir + )) + } + + # Check for tile-named files (*_XX.tif pattern) + tif_files <- list.files(merged_final_tif_dir, pattern = "\\.tif$", full.names = FALSE) + + if (length(tif_files) == 0) { + return(list( + has_tiles = FALSE, + detected_tiles = character(), + total_files = 0, + source = "no_files_found" + )) + } + + tile_pattern <- "_(\\d{2})\\.tif$" + tile_files <- tif_files[grepl(tile_pattern, tif_files)] + has_tiles <- length(tile_files) > 0 + + return(list( + 
has_tiles = has_tiles, + detected_tiles = tile_files, + total_files = length(tif_files), + source = "file_pattern_detection" + )) +} + +# ============================================================================== +# SECTION 7: MOSAIC & KPI VERIFICATION HELPERS +# ============================================================================== +# Centralized helper functions for run_full_pipeline.R to avoid hardcoding paths + +#' Detect mosaic mode from project structure +#' +#' Determines if project uses "tiled" (legacy) or "single-file" (per-field) mosaics +#' +#' @param project_dir Character. Project name +#' @return Character. "tiled" or "single-file" +detect_mosaic_mode <- function(project_dir) { + # Per-field architecture is standard - always return "single-file" + # unless weekly_tile_max directory exists with content + mosaic_tiled_dir <- file.path("laravel_app", "storage", "app", project_dir, "weekly_tile_max") + + if (dir.exists(mosaic_tiled_dir) && length(list.files(mosaic_tiled_dir)) > 0) { + return("tiled") + } + return("single-file") +} + +#' Detect grid size from tile directory structure +#' +#' For per-field architecture, returns "unknown" (grid-based tiling is legacy) +#' +#' @param project_dir Character. Project name +#' @return Character. Grid size ("unknown" for per-field) +detect_grid_size <- function(project_dir) { + # Per-field architecture doesn't use grid-based organization + return("unknown") +} + +#' Get project storage path +#' +#' @param project_dir Character. Project name +#' @param subdir Character. Optional subdirectory (default NULL) +#' @return Character. Full path +get_project_storage_path <- function(project_dir, subdir = NULL) { + path <- file.path("laravel_app", "storage", "app", project_dir) + if (!is.null(subdir)) { + path <- file.path(path, subdir) + } + return(path) +} + +#' Get mosaic directory +#' +#' @param project_dir Character. Project name +#' @param mosaic_mode Character. 
"tiled" or "single-file" +#' @return Character. Full path to mosaic directory +get_mosaic_dir <- function(project_dir, mosaic_mode = "auto") { + if (mosaic_mode == "auto") { + mosaic_mode <- detect_mosaic_mode(project_dir) + } + + if (mosaic_mode == "tiled") { + get_project_storage_path(project_dir, "weekly_tile_max") + } else { + get_project_storage_path(project_dir, "weekly_mosaic") + } +} + +#' Get KPI directory based on client type +#' +#' @param project_dir Character. Project name +#' @param client_type Character. Client type +#' @return Character. Full path to KPI directory +get_kpi_dir <- function(project_dir, client_type) { + base_path <- get_project_storage_path(project_dir, "reports/kpis") + + if (client_type == "agronomic_support") { + return(file.path(base_path, "field_level")) + } else { + return(file.path(base_path, "field_analysis")) + } +} + +#' Get expected output path for harvest imminent file +#' +#' @param project_dir Character. Project name +#' @param week_num Integer. ISO week number +#' @param year_num Integer. Year +#' @return Character. Full path to expected harvest imminent CSV file +get_harvest_output_path <- function(project_dir, week_num, year_num) { + file.path( + "laravel_app", "storage", "app", project_dir, "reports", "kpis", "field_stats", + sprintf("%s_harvest_imminent_week_%02d_%d.csv", project_dir, week_num, year_num) + ) +} + +#' Check if harvest output file exists for a specific week +#' +#' @param project_dir Character. Project name +#' @param week_num Integer. ISO week number +#' @param year_num Integer. Year +#' @return Logical. TRUE if file exists +check_harvest_output_exists <- function(project_dir, week_num, year_num) { + path <- get_harvest_output_path(project_dir, week_num, year_num) + file.exists(path) +} + +#' Get mosaic verification directory +#' +#' @param project_dir Character. Project name +#' @param mosaic_mode Character. "tiled" or "single-file" +#' @return Character. 
Full path to mosaic directory +get_mosaic_verification_dir <- function(project_dir, mosaic_mode) { + base <- file.path("laravel_app", "storage", "app", project_dir) + + if (mosaic_mode == "tiled") { + file.path(base, "weekly_tile_max") + } else { + file.path(base, "weekly_mosaic") + } +} + +#' Check if mosaic files exist for a specific week +#' +#' @param project_dir Character. Project name +#' @param week_num Integer. ISO week number +#' @param year_num Integer. Year +#' @param mosaic_mode Character. "tiled" or "single-file" +#' @return List with created (logical), file_count (int), sample_files (char vector) +check_mosaic_exists <- function(project_dir, week_num, year_num, mosaic_mode) { + mosaic_dir <- get_mosaic_verification_dir(project_dir, mosaic_mode) + + if (!dir.exists(mosaic_dir)) { + return(list(created = FALSE, file_count = 0, sample_files = character())) + } + + week_pattern <- sprintf("week_%02d_%d\\.tif", week_num, year_num) + mosaic_files <- list.files(mosaic_dir, pattern = week_pattern, recursive = TRUE, full.names = FALSE) + + list( + created = length(mosaic_files) > 0, + file_count = length(mosaic_files), + sample_files = head(mosaic_files, 3) + ) +} + +#' Check KPI completeness for reporting window +#' +#' @param project_dir Character. Project name +#' @param client_type Character. Client type +#' @param end_date Date. End date of reporting window +#' @param reporting_weeks Integer. 
Number of weeks to check +#' @return List with kpi_dir, kpis_df, missing_count +check_kpi_completeness <- function(project_dir, client_type, end_date, reporting_weeks = 1) { + kpi_dir <- get_kpi_dir(project_dir, client_type) + + if (!dir.exists(kpi_dir)) { + dir.create(kpi_dir, showWarnings = FALSE, recursive = TRUE) + } + + kpis_df <- data.frame() + missing_count <- 0 + + for (weeks_back in 0:(reporting_weeks - 1)) { + target_week <- end_date - lubridate::weeks(weeks_back) + wwy <- get_iso_week_year(target_week) + + # Check for KPI file for this week + kpi_pattern <- sprintf("week_%02d_%d", wwy$week, wwy$year) + kpi_files <- list.files(kpi_dir, pattern = kpi_pattern) + has_kpis <- length(kpi_files) > 0 + file_count <- length(kpi_files) + + if (!has_kpis) missing_count <- missing_count + 1 + + kpis_df <- rbind(kpis_df, data.frame( + week = wwy$week, + year = wwy$year, + date = target_week, + has_kpis = has_kpis, + file_count = file_count + )) + } + + return(list( + kpi_dir = kpi_dir, + kpis_df = kpis_df, + missing_count = missing_count + )) +} + +# ============================================================================== +# SECTION 8: PROJECT INITIALIZATION +# ============================================================================== + +#' Initialize the project +#' +#' Sets up directory structure, logging, and loads configuration +#' +#' @param project_dir Character. Project name +#' @param data_source Character. 
"merged_tif" or "merged_tif_8b" +#' @return List with all project directories and settings +initialize_project <- function(project_dir, data_source = "merged_tif") { + dirs <- setup_project_directories(project_dir, data_source = data_source) + logging <- setup_logging(dirs$log_dir) + + return(list( + dirs = dirs, + logging = logging, + project_dir = project_dir, + client_type = get_client_type(project_dir) + )) +} + +# ============================================================================== +# AUTO-INITIALIZATION: Set project_dir global if not already set +# ============================================================================== + +if (exists("project_dir")) { + assign("project_dir", project_dir, envir = .GlobalEnv) } else { warning("project_dir variable not found. Please set project_dir before sourcing parameters_project.R") } + +# ============================================================================== +# END PARAMETERS_PROJECT_2.R +# ============================================================================== diff --git a/r_app/parameters_project_OLD.R b/r_app/parameters_project_OLD.R new file mode 100644 index 0000000..a5d9224 --- /dev/null +++ b/r_app/parameters_project_OLD.R @@ -0,0 +1,1240 @@ +# filepath: c:\Users\timon\Resilience BV\4020 SCane ESA DEMO - Documenten\General\4020 SCDEMO Team\4020 TechnicalData\WP3\smartcane\r_app\parameters_project.R +# +# PARAMETERS_PROJECT.R +# ==================== +# This script defines project parameters, directory structures, and loads field boundaries. +# It establishes all the necessary paths and creates required directories for the SmartCane project. + +# 1. Load required libraries +# ------------------------- +suppressPackageStartupMessages({ + library(here) + library(readxl) + library(sf) + library(dplyr) + library(tidyr) + library(jsonlite) # For reading tiling_config.json +}) + +# 2. 
Client type mapping (for conditional script execution) +# --------------------------------------------------------- +# Maps project names to client types for pipeline control +# Client types: +# - "cane_supply": Runs Scripts 20,21,30,31,80,91 (full pipeline with Excel output) +# - "agronomic_support": Runs Scripts 80,90 only (KPI calculation + Word report) +# - "extension_service": (Future - not yet implemented) +# +# NOTE: This will eventually migrate to Laravel environment variables/database +# For now, maintain this mapping and update as projects are added +CLIENT_TYPE_MAP <- list( + "angata" = "cane_supply", + "aura" = "agronomic_support", + "chemba" = "cane_supply", + "xinavane" = "cane_supply", + "esa" = "cane_supply" +) + +get_client_type <- function(project_name) { + client_type <- CLIENT_TYPE_MAP[[project_name]] + if (is.null(client_type)) { + warning(sprintf("Project '%s' not in CLIENT_TYPE_MAP - defaulting to 'cane_supply'", project_name)) + return("cane_supply") + } + return(client_type) +} + +# 2b. 
Client-specific KPI configurations +# ---------------------------------------- +# Defines which KPIs and outputs are required for each client type +# This enables Script 80 to conditionally calculate only relevant metrics +# +# Structure: +# - kpi_calculations: Vector of KPI types to calculate for this client +# - outputs: Vector of output formats to generate (determines RDS/Excel naming) +# - requires_harvest_data: Boolean - whether Script 31 harvest predictions are needed +# - script_90_compatible: Boolean - whether output should match Script 90 expectations +# - script_91_compatible: Boolean - whether output should match Script 91 expectations +# +CLIENT_TYPE_CONFIGS <- list( + + # Aura (agronomic_support): Farm-level KPI summaries for weekly reports to agronomists + "agronomic_support" = list( + client_type = "agronomic_support", + description = "Farm-level KPI summaries for agronomic decision support", + kpi_calculations = c( + "field_uniformity", + "area_change", + "tch_forecasted", + "growth_decline", + "weed_presence", + "gap_filling" + ), + outputs = c( + "kpi_summary_tables", # Summary statistics for Script 90 report front page + "field_details" # Detailed field table for Script 90 report end section + ), + requires_harvest_data = FALSE, # Script 31 predictions not used + script_90_compatible = TRUE, # Output format matches Script 90 expectations + script_91_compatible = FALSE + ), + + # Cane Supply (cane_supply): Per-field analysis with harvest timing prediction + "cane_supply" = list( + client_type = "cane_supply", + description = "Per-field analysis with harvest prediction and phase assignment", + kpi_calculations = c( + "per_field_analysis", # Use 80_weekly_stats_utils.R for field-level statistics + "phase_assignment", # Assign growth phases (Germination, Tillering, Grand Growth, Maturation) + "harvest_prediction", # Include Script 31 harvest age predictions if available + "status_triggers" # Calculate field status (Normal, Monitor, Alert, Urgent) + 
), + outputs = c( + "field_analysis_excel", # Excel file with per-field metrics + "field_analysis_summary" # Summary RDS for Script 91 report + ), + requires_harvest_data = TRUE, # harvest.xlsx is required for phase assignment + script_90_compatible = FALSE, + script_91_compatible = TRUE + ) +) + +#' Get KPI configuration for a specific client type +#' @param client_type Character string of client type (e.g., "agronomic_support", "cane_supply") +#' @return List containing configuration for that client type +get_client_kpi_config <- function(client_type) { + config <- CLIENT_TYPE_CONFIGS[[client_type]] + + if (is.null(config)) { + warning(sprintf("Client type '%s' not in CLIENT_TYPE_CONFIGS - defaulting to 'cane_supply'", client_type)) + return(CLIENT_TYPE_CONFIGS[["cane_supply"]]) + } + + return(config) +} + +# 3. Smart detection for tile-based vs single-file mosaic approach +# ---------------------------------------------------------------- +detect_tile_structure_from_merged_final <- function(merged_final_tif_dir, daily_tiles_split_dir = NULL) { + # PRIORITY 1: Check for tiling_config.json metadata file from script 10 + # This is the most reliable source since script 10 explicitly records its decision + + if (!is.null(daily_tiles_split_dir) && dir.exists(daily_tiles_split_dir)) { + # Try to find tiling_config.json in any grid-size subfolder + config_files <- list.files(daily_tiles_split_dir, + pattern = "tiling_config\\.json$", + recursive = TRUE, + full.names = TRUE) + + if (length(config_files) > 0) { + # Found a config file - use the most recent one + config_file <- config_files[which.max(file.info(config_files)$mtime)] + + tryCatch({ + config_json <- jsonlite::read_json(config_file) + return(list( + has_tiles = config_json$has_tiles %||% TRUE, + detected_tiles = character(), + total_files = 0, + source = "tiling_config.json", + grid_size = config_json$grid_size %||% "unknown" + )) + }, error = function(e) { + warning("Error reading tiling_config.json: ", 
e$message) + # Fall through to file-based detection + }) + } + } + + # PRIORITY 2: File-based detection (fallback if metadata not found) + # Check if merged_final_tif/ contains tile-named files OR grid-size subdirectories + + if (!dir.exists(merged_final_tif_dir)) { + return(list( + has_tiles = FALSE, + detected_tiles = character(), + total_files = 0, + source = "directory_not_found" + )) + } + + # First check if there are grid-size subdirectories (5x5, 10x10, etc.) + # This indicates the tiles are organized: merged_final_tif/{grid_size}/{DATE}/{DATE}_XX.tif + grid_subfolders <- list.dirs(merged_final_tif_dir, full.names = FALSE, recursive = FALSE) + grid_patterns <- grep("^\\d+x\\d+$", grid_subfolders, value = TRUE) + + if (length(grid_patterns) > 0) { + # Found grid-size subdirectories - tiles exist! + grid_size <- grid_patterns[1] + grid_dir <- file.path(merged_final_tif_dir, grid_size) + + # List sample tile files from the grid directory + sample_tiles <- list.files(grid_dir, pattern = "\\.tif$", recursive = TRUE)[1:3] + + return(list( + has_tiles = TRUE, + detected_tiles = sample_tiles, + total_files = length(sample_tiles), + source = "grid_subdirectory_detection", + grid_size = grid_size, + grid_path = grid_dir + )) + } + + # Fall back to checking for tile-named files directly in merged_final_tif + # List all .tif files in merged_final_tif + tif_files <- list.files(merged_final_tif_dir, pattern = "\\.tif$", full.names = FALSE) + + if (length(tif_files) == 0) { + return(list( + has_tiles = FALSE, + detected_tiles = character(), + total_files = 0, + source = "no_files_found" + )) + } + + # Check if ANY file matches tile naming pattern: *_XX.tif (where XX is 2 digits) + # Tile pattern examples: 2025-11-27_00.tif, 2025-11-27_01.tif, week_50_2024_00.tif + tile_pattern <- "_(\\d{2})\\.tif$" + tile_files <- tif_files[grepl(tile_pattern, tif_files)] + + has_tiles <- length(tile_files) > 0 + + return(list( + has_tiles = has_tiles, + detected_tiles = tile_files, + 
    total_files = length(tif_files),
    source = "file_pattern_detection"
  ))
}

# 4. Define project directory structure
# -------------------------------------
# ==============================================================================
# CENTRALIZED PATH MANAGEMENT - setup_project_directories()
# ==============================================================================
# Single source of truth for ALL file paths used across the pipeline.
# All scripts should call this function once at startup and use returned paths.
# This eliminates ~88 hardcoded file.path() calls scattered across 8 scripts.
#
# USAGE:
#   paths <- setup_project_directories(project_dir)
#   merged_tif_dir <- paths$merged_tif_folder
#   daily_ci_dir   <- paths$daily_ci_vals_dir
#   kpi_output_dir <- paths$kpi_reports_dir
#
# TIERS (8-layer directory structure):
#   Tier 1: Raw data (merged_tif)
#   Tier 2: Per-field TIFFs (field_tiles, field_tiles_CI)
#   Tier 3: CI Extraction (daily_ci_vals, cumulative_ci_vals)
#   Tier 4: Growth Model (growth_model_interpolated)
#   Tier 5: Mosaics (weekly_mosaic, weekly_tile_max)
#   Tier 6: KPI & Reporting (kpi_reports_dir, kpi_field_stats_dir)
#   Tier 7: Support (data, vrt, harvest, logs)
#   Tier 8: Config & Metadata (field_boundaries_path, tiling_config_path)
#
# BENEFITS:
#   - Single source of truth for paths across all 8 scripts
#   - Auto-creates all directories (no scattered dir.create() calls)
#   - Easy to update the storage structure globally
# ==============================================================================

#' Build (and create on disk) the full project directory structure.
#'
#' @param project_dir Character. Project name (e.g. "aura", "angata"); becomes
#'   the leaf folder under laravel_app/storage/app/.
#' @param data_source Character. Name of the raw-imagery folder.
#'   NOTE(review): currently UNUSED in this body — merged_tif is hardcoded
#'   below. Confirm whether merged_tif_8b support should flow through here.
#' @return Named list of directory/file paths (see TIER reference below).
#'   All directory entries are created on disk; the two Tier-8 entries are
#'   FILE paths and are intentionally not created.
setup_project_directories <- function(project_dir, data_source = "merged_tif") {
  # ===========================================================================
  # BASE DIRECTORY (foundation for all paths)
  # ===========================================================================
  laravel_storage_dir <- here("laravel_app", "storage", "app", project_dir)

  # ===========================================================================
  # TIER 1: RAW DATA & INPUT PATHS (Script 00 - Python download output)
  # ===========================================================================
  merged_tif_folder <- here(laravel_storage_dir, "merged_tif") # 4-band raw GeoTIFFs from Planet

  # ===========================================================================
  # TIER 2: TILING PATHS (Script 10 - per-field tiff creation)
  # ===========================================================================
  # Per-field TIFF structure: field_tiles/{FIELD_NAME}/{YYYY-MM-DD}.tif
  field_tiles_dir <- here(laravel_storage_dir, "field_tiles")

  # Per-field CI TIFFs (pre-computed, used by Script 40): field_tiles_CI/{FIELD_NAME}/{YYYY-MM-DD}.tif
  field_tiles_ci_dir <- here(laravel_storage_dir, "field_tiles_CI")

  # Legacy tiling (backward compatibility): daily_tiles_split/{grid_size}/{YYYY-MM-DD}/{YYYY-MM-DD}_XX.tif
  daily_tiles_split_dir <- here(laravel_storage_dir, "daily_tiles_split")

  # ===========================================================================
  # TIER 3: CI EXTRACTION PATHS (Script 20 - Canopy Index calculation)
  # ===========================================================================
  extracted_ci_base_dir <- here(laravel_storage_dir, "Data", "extracted_ci")

  # Daily CI values (cumulative RDS): combined_CI_data.rds
  daily_ci_vals_dir <- here(extracted_ci_base_dir, "daily_vals")

  # Cumulative CI across time: All_pivots_Cumulative_CI_quadrant_year_v2.rds
  cumulative_ci_vals_dir <- here(extracted_ci_base_dir, "cumulative_vals")

  # Per-field CI data for Python harvest prediction (Script 21): ci_data_for_python.csv
  ci_for_python_dir <- here(extracted_ci_base_dir, "ci_data_for_python")

  # ===========================================================================
  # TIER 4: GROWTH MODEL PATHS (Script 30 - interpolation & smoothing)
  # ===========================================================================
  growth_model_interpolated_dir <- here(laravel_storage_dir, "growth_model_interpolated")

  # ===========================================================================
  # TIER 5: MOSAIC PATHS (Script 40 - weekly mosaics)
  # ===========================================================================
  # Per-field weekly mosaics: weekly_mosaic/{FIELD}/{week_XX_YYYY}.tif
  weekly_mosaic_dir <- here(laravel_storage_dir, "weekly_mosaic")

  # Tile-based weekly max (legacy): weekly_tile_max/{grid_size}/week_XX_YYYY.tif
  weekly_tile_max_dir <- here(laravel_storage_dir, "weekly_tile_max")

  # ===========================================================================
  # TIER 6: KPI & REPORTING PATHS (Scripts 80, 90, 91)
  # ===========================================================================
  reports_dir <- here(laravel_storage_dir, "reports")
  kpi_reports_dir <- here(reports_dir, "kpis") # Script 80 KPI CSV/RDS output
  kpi_field_stats_dir <- here(kpi_reports_dir, "field_stats") # Per-field KPI details
  kpi_field_analysis_dir <- here(kpi_reports_dir, "field_analysis") # Field-level analysis for Script 91

  # ===========================================================================
  # TIER 7: SUPPORT PATHS (Data, VRT, Harvest, Logs)
  # ===========================================================================
  data_dir <- here(laravel_storage_dir, "Data")
  vrt_dir <- here(data_dir, "vrt") # Virtual Raster files created during CI extraction
  harvest_dir <- here(data_dir, "HarvestData") # Harvest schedule data
  log_dir <- here(laravel_storage_dir, "logs") # Log files

  # ===========================================================================
  # TIER 8: CONFIG & METADATA PATHS (files, not directories)
  # ===========================================================================
  # Field boundaries GeoJSON (same across all scripts)
  field_boundaries_path <- here(data_dir, "pivot.geojson")

  # Tiling configuration metadata from Script 10
  tiling_config_path <- here(daily_tiles_split_dir, "tiling_config.json")

  # ===========================================================================
  # CREATE ALL DIRECTORIES (once per pipeline run; silent if already present)
  # ===========================================================================
  all_dirs <- c(
    # Tier 1
    merged_tif_folder,
    # Tier 2
    field_tiles_dir, field_tiles_ci_dir, daily_tiles_split_dir,
    # Tier 3
    extracted_ci_base_dir, daily_ci_vals_dir, cumulative_ci_vals_dir, ci_for_python_dir,
    # Tier 4
    growth_model_interpolated_dir,
    # Tier 5
    weekly_mosaic_dir, weekly_tile_max_dir,
    # Tier 6
    reports_dir, kpi_reports_dir, kpi_field_stats_dir, kpi_field_analysis_dir,
    # Tier 7
    data_dir, vrt_dir, harvest_dir, log_dir
  )

  for (dir_path in all_dirs) {
    dir.create(dir_path, showWarnings = FALSE, recursive = TRUE)
  }

  # ===========================================================================
  # RETURN COMPREHENSIVE PATH LIST
  # Scripts receive this via: paths <- setup_project_directories(project_dir)
  # and then use paths$merged_tif_folder, paths$daily_ci_vals_dir, etc.
  # ===========================================================================
  return(list(
    # PROJECT ROOT
    laravel_storage_dir = laravel_storage_dir,

    # TIER 1: Raw data
    merged_tif_folder = merged_tif_folder,

    # TIER 2: Per-field TIFFs
    field_tiles_dir = field_tiles_dir,
    field_tiles_ci_dir = field_tiles_ci_dir,
    daily_tiles_split_dir = daily_tiles_split_dir,

    # TIER 3: CI Extraction
    extracted_ci_base_dir = extracted_ci_base_dir,
    daily_ci_vals_dir = daily_ci_vals_dir,
    cumulative_ci_vals_dir = cumulative_ci_vals_dir,
    ci_for_python_dir = ci_for_python_dir,

    # TIER 4: Growth Model
    growth_model_interpolated_dir = growth_model_interpolated_dir,

    # TIER 5: Mosaics
    weekly_mosaic_dir = weekly_mosaic_dir,
    weekly_tile_max_dir = weekly_tile_max_dir,

    # TIER 6: KPI & Reporting
    reports_dir = reports_dir,
    kpi_reports_dir = kpi_reports_dir,
    kpi_field_stats_dir = kpi_field_stats_dir,
    kpi_field_analysis_dir = kpi_field_analysis_dir,

    # TIER 7: Support
    data_dir = data_dir,
    vrt_dir = vrt_dir,
    harvest_dir = harvest_dir,
    log_dir = log_dir,

    # TIER 8: Config & Metadata
    field_boundaries_path = field_boundaries_path,
    tiling_config_path = tiling_config_path
  ))
}

# ==============================================================================
# TIER-BY-TIER PATH REFERENCE (for setup_project_directories output)
# ==============================================================================
#
# TIER 1: RAW DATA (Script 00 - Python download)
#   paths$merged_tif_folder/{YYYY-MM-DD}.tif (4-band uint16 GeoTIFFs from Planet API)
#
# TIER 2: PER-FIELD TIFFS (Script 10)
#   paths$field_tiles_dir/{FIELD_NAME}/{YYYY-MM-DD}.tif
#   paths$field_tiles_ci_dir/{FIELD_NAME}/{YYYY-MM-DD}.tif
#   paths$daily_tiles_split_dir/{grid_size}/{YYYY-MM-DD}/{YYYY-MM-DD}_XX.tif (legacy)
#
# TIER 3: CI EXTRACTION (Script 20)
#   paths$daily_ci_vals_dir/combined_CI_data.rds
#   paths$cumulative_ci_vals_dir/All_pivots_Cumulative_CI_quadrant_year_v2.rds
#   paths$ci_for_python_dir/ci_data_for_python.csv (Script 21 output)
#
# TIER 4: GROWTH MODEL (Script 30)
#   paths$growth_model_interpolated_dir/ (RDS files with interpolated CI)
#
# TIER 5: MOSAICS (Script 40)
#   paths$weekly_mosaic_dir/{FIELD_NAME}/week_XX_YYYY.tif
#   paths$weekly_tile_max_dir/{grid_size}/week_XX_YYYY_00.tif (legacy)
#
# TIER 6: KPI & REPORTING (Scripts 80, 90, 91)
#   paths$kpi_reports_dir/ (KPI outputs from Script 80)
#   paths$kpi_field_stats_dir/ (per-field KPI RDS)
#   paths$kpi_field_analysis_dir/ (analysis RDS for Script 91)
#   paths$reports_dir/ (Word/HTML reports)
#
# TIER 7: SUPPORT (various scripts)
#   paths$data_dir/pivot.geojson (field boundaries)
#   paths$data_dir/harvest.xlsx (harvest schedule)
#   paths$vrt_dir/ (virtual raster files)
#   paths$harvest_dir/ (harvest predictions from Python)
#   paths$log_dir/ (pipeline logs)
#
# TIER 8: CONFIG & METADATA
#   paths$field_boundaries_path (full path to pivot.geojson)
#   paths$tiling_config_path (metadata from Script 10)
# ==============================================================================

# 5. Utility Functions
# --------------------
# NOTE: load_field_boundaries() and load_harvesting_data() are defined in
# 00_common_utils.R to avoid duplication. They are sourced after
# parameters_project.R and used by all scripts.

# 6. Load harvesting data
# 6. Load harvesting data
# -----------------------
#' Load the harvest schedule from Data/harvest.xlsx.
#'
#' @param data_dir Directory expected to contain harvest.xlsx.
#' @return A data frame with columns field, sub_field, year, season_start,
#'   season_end, age, sub_area, tonnage_ha — or NULL (with a warning) when the
#'   file is missing or cannot be parsed. Future/missing season_end values are
#'   clamped to today, and age is recomputed as whole weeks of the season.
load_harvesting_data <- function(data_dir) {
  harvest_file <- here(data_dir, "harvest.xlsx")

  if (!file.exists(harvest_file)) {
    warning(paste("Harvest data file not found at path:", harvest_file))
    return(NULL)
  }

  # Helper: parse a single cell that may hold a Date, POSIXct, Excel serial
  # number, or a date string in one of several common formats.
  parse_flexible_date <- function(x) {
    # BUGFIX: is.null() must be checked BEFORE is.na() — is.na(NULL) returns
    # logical(0), which makes `||` error on modern R (>= 4.3).
    if (is.null(x) || is.na(x)) return(NA_real_)
    if (inherits(x, "Date")) return(x)
    if (inherits(x, "POSIXct")) return(as.Date(x))

    # Excel stores dates as day counts from 1899-12-30 (Windows convention)
    if (is.numeric(x)) {
      return(as.Date(x, origin = "1899-12-30"))
    }

    # Fall back to string parsing with multiple candidate formats
    x_char <- as.character(x)
    formats <- c("%Y-%m-%d", "%d/%m/%Y", "%m/%d/%Y", "%Y-%m-%d %H:%M:%S")

    for (fmt in formats) {
      result <- suppressWarnings(as.Date(x_char, format = fmt))
      if (!is.na(result)) return(result)
    }

    # Nothing matched — consistent NA type with the first branch
    return(NA_real_)
  }

  tryCatch({
    harvesting_data <- read_excel(harvest_file) %>%
      dplyr::select(
        c(
          "field",
          "sub_field",
          "year",
          "season_start",
          "season_end",
          "age",
          "sub_area",
          "tonnage_ha"
        )
      ) %>%
      mutate(
        field = as.character(field),
        sub_field = as.character(sub_field),
        year = as.numeric(year),
        # sapply() strips the Date class, so re-attach it via as.Date(origin=)
        season_start = sapply(season_start, parse_flexible_date),
        season_end = sapply(season_end, parse_flexible_date),
        season_start = as.Date(season_start, origin = "1970-01-01"),
        season_end = as.Date(season_end, origin = "1970-01-01"),
        age = as.numeric(age),
        sub_area = as.character(sub_area),
        tonnage_ha = as.numeric(tonnage_ha)
      ) %>%
      mutate(
        # An ongoing (future or missing) season ends "today" for age purposes
        season_end = case_when(
          season_end > Sys.Date() ~ Sys.Date(),
          is.na(season_end) ~ Sys.Date(),
          TRUE ~ season_end
        ),
        age = round(as.numeric(season_end - season_start) / 7, 0) # weeks
      )

    return(harvesting_data)
  }, error = function(e) {
    warning(paste("Error loading harvesting data:", e$message))
    return(NULL)
  })
}

# 7. Define logging functions globally first
# ------------------------------------------
# Simple console-only defaults so scripts can log before setup_logging() runs.

#' Log a timestamped message to the console (default, pre-setup version).
#' @param message Text to log.
#' @param level Log level tag, e.g. "INFO", "WARNING", "ERROR".
log_message <- function(message, level = "INFO") {
  timestamp <- format(Sys.time(), "%Y-%m-%d %H:%M:%S")
  formatted_message <- paste0("[", level, "] ", timestamp, " - ", message)
  cat(formatted_message, "\n")
}

#' Log the structure of the first elements of a list/vector.
#' @param list Object to summarize (note: shadows base::list inside this body).
#' @param level Log level tag.
log_head <- function(list, level = "INFO") {
  log_message(paste(capture.output(str(head(list))), collapse = "\n"), level)
}

# 8. Set up full logging system with file output
# ----------------------------------------------
#' Install file-backed logging, replacing the console-only defaults.
#'
#' @param log_dir Directory for the daily log file ({YYYYMMDD}.log).
#' @return List with log_file path and the enhanced log_message/log_head.
#' @details Deliberately rebinds log_message/log_head in .GlobalEnv so all
#'   already-sourced scripts pick up the file-backed versions. ERROR/WARNING
#'   messages are echoed to the console in addition to the file.
setup_logging <- function(log_dir) {
  log_file <- here(log_dir, paste0(format(Sys.Date(), "%Y%m%d"), ".log"))

  # Enhanced versions: append to the daily log file
  log_message <- function(message, level = "INFO") {
    timestamp <- format(Sys.time(), "%Y-%m-%d %H:%M:%S")
    formatted_message <- paste0("[", level, "] ", timestamp, " - ", message)
    cat(formatted_message, "\n", file = log_file, append = TRUE)

    # Also print to console for debugging
    if (level %in% c("ERROR", "WARNING")) {
      cat(formatted_message, "\n")
    }
  }

  log_head <- function(list, level = "INFO") {
    log_message(paste(capture.output(str(head(list))), collapse = "\n"), level)
  }

  # Update the global functions with the enhanced versions
  assign("log_message", log_message, envir = .GlobalEnv)
  assign("log_head", log_head, envir = .GlobalEnv)

  return(list(
    log_file = log_file,
    log_message = log_message,
    log_head = log_head
  ))
}
# 8. HELPER FUNCTIONS FOR COMMON CALCULATIONS
# -------------------------------------------
# Centralized functions to reduce duplication across scripts.

#' ISO-8601 week number (1-53) of a Date.
get_iso_week <- function(date) {
  as.numeric(format(date, "%V"))
}

#' ISO-8601 week-based year of a Date (differs from calendar year at
#' year boundaries, e.g. 2024-12-31 belongs to ISO year 2025).
get_iso_year <- function(date) {
  as.numeric(format(date, "%G"))
}

#' Both ISO week and ISO year as list(week=, year=).
#' Delegates to get_iso_week()/get_iso_year() so the three helpers
#' cannot drift apart.
get_iso_week_year <- function(date) {
  list(
    week = get_iso_week(date),
    year = get_iso_year(date)
  )
}

#' Format a date as a week label, e.g. "week03_2025".
#' @param separator String between week and year parts (default "_").
format_week_label <- function(date, separator = "_") {
  wwy <- get_iso_week_year(date)
  sprintf("week%02d%s%d", wwy$week, separator, wwy$year)
}

#' Auto-detect mosaic mode for a project.
#' Per-field architecture always uses single-file mosaics
#' (weekly_mosaic/{FIELD}/week_*.tif); returns "unknown" when the
#' weekly_mosaic directory does not exist.
detect_mosaic_mode <- function(project_dir) {
  weekly_mosaic <- get_project_storage_path(project_dir, "weekly_mosaic")
  if (dir.exists(weekly_mosaic)) {
    return("single-file") # Per-field structure
  }
  return("unknown")
}

#' Auto-detect grid size from tile directory structure.
#' Per-field architecture no longer uses grid-based organization,
#' so this always returns "unknown" (kept for API compatibility).
detect_grid_size <- function(project_dir) {
  return("unknown")
}

#' Build a storage path consistently across all scripts.
#' @param project_dir Project name.
#' @param subdir Optional subdirectory under the project storage root.
get_project_storage_path <- function(project_dir, subdir = NULL) {
  base <- file.path("laravel_app", "storage", "app", project_dir)
  if (!is.null(subdir)) file.path(base, subdir) else base
}

#' Mosaic directory for a project.
#' Per-field architecture always uses weekly_mosaic; mosaic_mode is accepted
#' for backward compatibility but currently ignored.
get_mosaic_dir <- function(project_dir, mosaic_mode = "auto") {
  get_project_storage_path(project_dir, "weekly_mosaic")
}

#' KPI output directory for a project, depending on client type:
#' agronomic_support -> reports/kpis/field_level, otherwise field_analysis.
get_kpi_dir <- function(project_dir, client_type) {
  subdir <- if (client_type == "agronomic_support") "field_level" else "field_analysis"
  get_project_storage_path(project_dir, file.path("reports", "kpis", subdir))
}

# Logging functions moved to 00_common_utils.R
#   - safe_log()        — main logging function with level prefix
#   - smartcane_debug() — conditional debug logging
#   - smartcane_warn()  — warning wrapper
# Import with: source("r_app/00_common_utils.R")

# ============================================================================
# PHASE 3 & 4: OPTIMIZATION & DOCUMENTATION
# ============================================================================

# System Constants
# ----------------
# Define once, use everywhere.
# Used in run_full_pipeline.R for calling R scripts via system().
RSCRIPT_PATH <- "C:\\Program Files\\R\\R-4.4.3\\bin\\x64\\Rscript.exe"

# Data Source Documentation
# -------------------------
# SmartCane uses PlanetScope imagery from Planet Labs API in two formats:
#
# 1. merged_tif (4-band):
#    - Standard format: Red, Green, Blue, Near-Infrared; ~150-200 MB per date
#    - Use case: agronomic support, general crop health monitoring (aura, xinavane)
#    - Cloud handling: basic cloud masking from Planet metadata
#
# 2. merged_tif_8b (8-band with cloud confidence):
#    - Enhanced format: 4-band imagery + 4-band UDM2 cloud mask
#      (UDM2 bands: Clear, Snow, Shadow, Light Haze); ~250-350 MB per date
#    - Use case: harvest prediction / supply chain (angata, chemba, esa)
#    - Cloud handling: per-pixel cloud confidence from Planet UDM2, so harvest
#      dates are not predicted from cloudy data
#
# The system auto-detects which is available via detect_data_source().

# Mosaic Mode Documentation
# -------------------------
# Two ways to store and process weekly mosaics:
#
# 1. Single-file mosaic ("single-file"):
#    - One GeoTIFF per week: weekly_mosaic/week_02_2026.tif
#    - 5 bands per file (R, G, B, NIR, CI); ~300-500 MB per week
#    - Pros: simpler file management, easy full-field visualization
#    - Cons: slower field-specific queries (full raster load)
#    - Best for: agronomic support (aura) with <100 fields
#
# 2. Tiled mosaic ("tiled"):
#    - Grid of tiles per week: weekly_tile_max/5x5/week_02_2026_{TT}.tif
#    - e.g. 25 files (5x5 grid) x 5 bands; ~15-20 MB per tile
#    - Pros: parallel processing, fast field lookups, scales to 1000+ fields
#    - Cons: more file I/O, requires tile-to-field mapping metadata
#    - Best for: cane supply (angata, chemba) with 500+ fields
#
# The system auto-detects which is available via detect_mosaic_mode().

# Client Type Documentation
# -------------------------
# CLIENT_TYPE: cane_supply
#   Purpose: optimize sugar mill supply chain (harvest scheduling)
#   Scripts: 20 (CI), 21 (RDS->CSV), 30 (Growth), 31 (Harvest pred),
#            40 (Mosaic), 80 (KPI), 91 (Excel)
#   Outputs: per-field analysis (status, growth phase, harvest readiness),
#            Excel reports; KPI dir: reports/kpis/field_analysis/
#   Harvest data: required (harvest.xlsx); Data source: merged_tif_8b;
#   Mosaic mode: tiled; Projects: angata, chemba, xinavane, esa
#
# CLIENT_TYPE: agronomic_support
#   Purpose: weekly crop health insights for agronomists
#   Scripts: 80 (KPI), 90 (Word report)
#   Outputs: farm-level KPI summaries, Word reports;
#            KPI dir: reports/kpis/field_level/
#   Harvest data: not used; Data source: merged_tif;
#   Mosaic mode: single-file; Projects: aura

#' Detect the imagery data source for a project.
#' Currently always "merged_tif" for consistency (merged_tif_8b detection
#' is intentionally disabled); project_dir is accepted for API stability.
detect_data_source <- function(project_dir) {
  return("merged_tif")
}

#' Check KPI completeness for a reporting period.
#'
#' @param project_dir Project name.
#' @param client_type Client type (selects the KPI directory, see get_kpi_dir).
#' @param end_date Date of the most recent reporting week.
#' @param reporting_weeks_needed Number of weekly KPI sets required (>= 1).
#' @return List with kpis_df (one row per week), kpi_dir, missing_count,
#'   missing_weeks (subset of kpis_df), and all_complete (logical).
#' @details Replaces duplicate KPI-checking logic in run_full_pipeline.R.
#'   Rewritten to build rows with lapply() + do.call(rbind, ...) instead of
#'   growing a data.frame with rbind() inside the loop.
check_kpi_completeness <- function(project_dir, client_type, end_date, reporting_weeks_needed) {
  stopifnot(reporting_weeks_needed >= 1)

  kpi_dir <- get_kpi_dir(project_dir, client_type)

  # One row per reporting week, most recent first
  week_rows <- lapply(0:(reporting_weeks_needed - 1), function(weeks_back) {
    check_date <- end_date - (weeks_back * 7)
    wwy <- get_iso_week_year(check_date)

    # Week label pattern used in KPI file names, e.g. "week03_2025"
    week_pattern <- sprintf("week%02d_%d", wwy$week, wwy$year)
    files_this_week <- list.files(kpi_dir, pattern = week_pattern)
    has_kpis <- length(files_this_week) > 0

    smartcane_debug(sprintf(
      "Week %02d/%d (%s): %s (%d files)",
      wwy$week, wwy$year, format(check_date, "%Y-%m-%d"),
      if (has_kpis) "✓ FOUND" else "✗ MISSING",
      length(files_this_week)
    ))

    data.frame(
      week = wwy$week,
      year = wwy$year,
      date = check_date,
      has_kpis = has_kpis,
      pattern = week_pattern,
      file_count = length(files_this_week)
    )
  })
  kpis_needed <- do.call(rbind, week_rows)

  # Summary statistics
  missing_count <- sum(!kpis_needed$has_kpis)
  all_complete <- missing_count == 0

  return(list(
    kpis_df = kpis_needed,
    kpi_dir = kpi_dir,
    missing_count = missing_count,
    missing_weeks = kpis_needed[!kpis_needed$has_kpis, ],
    all_complete = all_complete
  ))
}

# ==============================================================================
# HELPER FUNCTIONS FOR run_full_pipeline.R PATH VERIFICATION (SC-116)
# ==============================================================================
# These functions replace hardcoded file.path() calls in run_full_pipeline.R
# These functions replace hardcoded file.path() calls in run_full_pipeline.R
# with centralized, testable helpers; each verifies one pipeline stage output.

#' Path where Script 31 writes the harvest-imminent CSV for one ISO week.
#'
#' @param project_dir Character. Project name (e.g. "angata", "aura").
#' @param week_num Integer. ISO week number (01-53).
#' @param year_num Integer. Year (e.g. 2026).
#' @return Character. Full path under
#'   laravel_app/storage/app/{project}/reports/kpis/field_stats/ of
#'   {project}_harvest_imminent_week_{WW}_{YYYY}.csv
#'
get_harvest_output_path <- function(project_dir, week_num, year_num) {
  csv_name <- sprintf(
    "%s_harvest_imminent_week_%02d_%d.csv",
    project_dir, week_num, year_num
  )
  file.path(
    "laravel_app", "storage", "app", project_dir,
    "reports", "kpis", "field_stats", csv_name
  )
}

#' TRUE when the harvest-imminent CSV for the given week is on disk.
#'
#' @param project_dir Character. Project name.
#' @param week_num Integer. ISO week number.
#' @param year_num Integer. Year.
#' @return Logical.
#'
check_harvest_output_exists <- function(project_dir, week_num, year_num) {
  file.exists(get_harvest_output_path(project_dir, week_num, year_num))
}

#' Directory that mosaic verification should inspect for a project.
#'
#' @param project_dir Character. Project name.
#' @param mosaic_mode Character. "tiled" selects weekly_tile_max/; any other
#'   value falls back to weekly_mosaic/ (the per-field architecture default).
#' @return Character. Full mosaic directory path.
#'
get_mosaic_verification_dir <- function(project_dir, mosaic_mode) {
  subdir <- if (mosaic_mode == "tiled") "weekly_tile_max" else "weekly_mosaic"
  file.path("laravel_app", "storage", "app", project_dir, subdir)
}

#' Look for mosaic files belonging to a specific ISO week.
#'
#' Searches recursively (per-field layout nests mosaics under field folders)
#' for files matching week_{WW}_{YYYY}.tif.
#'
#' @param project_dir Character. Project name.
#' @param week_num Integer. ISO week number.
#' @param year_num Integer. Year.
#' @param mosaic_mode Character. "tiled" or "single-file".
#' @return List: created (logical), file_count (int), sample_files (up to 3
#'   relative paths).
#'
check_mosaic_exists <- function(project_dir, week_num, year_num, mosaic_mode) {
  mosaic_dir <- get_mosaic_verification_dir(project_dir, mosaic_mode)

  # Missing directory means nothing was produced for this project yet
  if (!dir.exists(mosaic_dir)) {
    return(list(created = FALSE, file_count = 0, sample_files = character()))
  }

  matches <- list.files(
    mosaic_dir,
    pattern = sprintf("week_%02d_%d\\.tif", week_num, year_num),
    recursive = TRUE,
    full.names = FALSE
  )

  list(
    created = length(matches) > 0,
    file_count = length(matches),
    sample_files = head(matches, 3)
  )
}
# 9. Initialize the project
# -------------------------
#' Initialize all project components in one call.
#'
#' @param project_dir Character. Project name.
#' @param data_source Character. Raw-imagery folder name, forwarded to
#'   setup_project_directories().
#' @return Flat list combining all directory paths with: logging (from
#'   setup_logging), field_boundaries, field_boundaries_sf (from
#'   load_field_boundaries), and harvesting_data (may be NULL).
initialize_project <- function(project_dir, data_source = "merged_tif") {
  # Set up directory structure, passing data_source to select TIF folder
  dirs <- setup_project_directories(project_dir, data_source = data_source)

  # Set up logging (rebinds the global log_message/log_head to file-backed versions)
  logging <- setup_logging(dirs$log_dir)

  # Load field boundaries (sf + terra representations)
  boundaries <- load_field_boundaries(dirs$data_dir)

  # Load harvesting data (NULL when harvest.xlsx is absent)
  harvesting_data <- load_harvesting_data(dirs$data_dir)

  # Return all initialized components as one flat list
  return(c(
    dirs,
    list(
      logging = logging,
      field_boundaries = boundaries$field_boundaries,
      field_boundaries_sf = boundaries$field_boundaries_sf,
      harvesting_data = harvesting_data
    )
  ))
}

# SOURCE-TIME SIDE EFFECT: when this script is sourced with a global
# `project_dir` already set, it initializes the project and exports every
# component (paths, logging, boundaries, harvest data) into .GlobalEnv.
if (exists("project_dir")) {
  # Now we can safely log before initialization (console-only default logger)
  log_message(paste("Initializing project with directory:", project_dir))

  # Use data_source if it exists (passed from 02_ci_extraction.R), otherwise use default
  data_src <- if (exists("data_source")) data_source else "merged_tif"
  log_message(paste("Using data source directory:", data_src))

  project_config <- initialize_project(project_dir, data_source = data_src)

  # Expose all variables to the global environment
  list2env(project_config, envir = .GlobalEnv)

  # Log project initialization completion with tile mode info
  log_message(paste("Project initialized with directory:", project_dir))
  if (exists("use_tile_mosaic")) {
    mosaic_mode <- if (use_tile_mosaic) "TILE-BASED" else "SINGLE-FILE"
    log_message(paste("Mosaic mode detected:", mosaic_mode))
    if (exists("tile_detection_info") && !is.null(tile_detection_info)) {
      log_message(paste(" - Detection source:", tile_detection_info$detected_source))
      log_message(paste(" - Grid size:", tile_detection_info$grid_size))
      log_message(paste(" - Detected files in storage:", tile_detection_info$detected_count))
      if (length(tile_detection_info$sample_tiles) > 0) {
        log_message(paste(" - Sample tile files:", paste(tile_detection_info$sample_tiles, collapse = ", ")))
      }
    }
  }
} else {
  warning("project_dir variable not found. Please set project_dir before sourcing parameters_project.R")
}



#' Safe Logging Function
#'
#' Generic logging with [LEVEL] prefix. Works standalone without any framework.
#' Consistent with the SmartCane logging standard.
#'
#' @param message The message to log
#' @param level The log level (default: "INFO"). Options: "INFO", "WARNING", "ERROR", "DEBUG"
#' @return NULL (invisible, used for side effects)
#'
#' @examples
#' safe_log("Processing started", "INFO")
#' safe_log("Check input file", "WARNING")
#' safe_log("Failed to load data", "ERROR")
#'
safe_log <- function(message, level = "INFO") {
  prefix <- sprintf("[%s]", level)
  cat(sprintf("%s %s\n", prefix, message))
}

#' SmartCane Debug Logging (Conditional)
#'
#' Logs DEBUG-level messages only if verbose=TRUE or the SMARTCANE_DEBUG
#' environment variable equals "TRUE". Useful for development/troubleshooting
#' without cluttering normal output.
#'
#' @param message The message to log
#' @param verbose Whether to output regardless of SMARTCANE_DEBUG (default: FALSE)
#' @return NULL (invisible, used for side effects)
#'
#' @examples
#' smartcane_debug("Processing field 1", verbose = FALSE) # Only if SMARTCANE_DEBUG=TRUE
#' smartcane_debug("Detailed state info", verbose = TRUE) # Always outputs
#'
smartcane_debug <- function(message, verbose = FALSE) {
  if (!verbose && Sys.getenv("SMARTCANE_DEBUG") != "TRUE") {
    return(invisible(NULL))
  }
  safe_log(message, level = "DEBUG")
}

#' SmartCane Warning Logging
#'
#' Logs WARN-level messages. Convenience wrapper around safe_log().
#'
#' @param message The message to log
#' @return NULL (invisible, used for side effects)
#'
#' @examples
#' smartcane_warn("Check data format before proceeding")
#'
smartcane_warn <- function(message) {
  safe_log(message, level = "WARN")
}

#' Load Field Boundaries from GeoJSON
#'
#' Loads field polygon geometries from a GeoJSON file (pivot.geojson or
#' pivot_2.geojson). Repairs invalid geometries, validates/assigns CRS, and
#' standardizes columns.
#'
#' @param data_dir Directory containing the GeoJSON file
#' @return List with elements:
#'   - field_boundaries_sf: sf (Simple Features) object
#'   - field_boundaries: terra SpatVect object (sf fallback if conversion fails)
#'
#' @details
#' Uses pivot_2.geojson only when the global project_dir is "esa" AND the
#' global flag ci_extraction_script exists (set by scripts 02-03); all other
#' callers get pivot.geojson. Errors inside the main tryCatch are re-raised
#' after printing debug info, so callers see a single descriptive stop().
#'
#' @examples
#' boundaries <- load_field_boundaries("laravel_app/storage/app/angata")
#' head(boundaries$field_boundaries_sf)
#'
load_field_boundaries <- function(data_dir) {
  # Choose field boundaries file based on project and script type:
  # ESA project uses pivot_2.geojson ONLY for scripts 02-03 (CI extraction &
  # growth model); all other scripts (04-mosaic, 09-KPIs, 10-reports) use
  # pivot.geojson. Relies on globals project_dir / ci_extraction_script.
  use_pivot_2 <- exists("project_dir") && project_dir == "esa" &&
    exists("ci_extraction_script") # ci_extraction_script flag set by scripts 02-03

  if (use_pivot_2) {
    field_boundaries_path <- file.path(data_dir, "pivot_2.geojson")
  } else {
    field_boundaries_path <- file.path(data_dir, "pivot.geojson")
  }

  if (!file.exists(field_boundaries_path)) {
    stop(paste("Field boundaries file not found at path:", field_boundaries_path))
  }

  tryCatch({
    # Read GeoJSON with explicit CRS handling
    field_boundaries_sf <- st_read(field_boundaries_path, quiet = TRUE)

    # Remove OBJECTID column immediately if it exists (GIS export artifact)
    if ("OBJECTID" %in% names(field_boundaries_sf)) {
      field_boundaries_sf <- field_boundaries_sf %>% select(-OBJECTID)
    }

    # **CRITICAL**: Repair invalid geometries (degenerate vertices,
    # self-intersections, etc.) BEFORE any spatial operation (CRS transform,
    # intersect, crop) to prevent S2 geometry validation errors downstream.
    field_boundaries_sf <- repair_geojson_geometries(field_boundaries_sf)

    # Validate and fix CRS if needed
    tryCatch({
      # Simply assign WGS84 if not already set (safe approach)
      if (is.na(sf::st_crs(field_boundaries_sf)$epsg)) {
        st_crs(field_boundaries_sf) <- 4326
        warning("CRS was missing, assigned WGS84 (EPSG:4326)")
      }
    }, error = function(e) {
      # `<<-` deliberately writes back to the enclosing function's variable
      tryCatch({
        st_crs(field_boundaries_sf) <<- 4326
      }, error = function(e2) {
        warning(paste("Could not set CRS:", e2$message))
      })
    })

    # Handle column names - accommodate the optional sub_area column
    if ("sub_area" %in% names(field_boundaries_sf)) {
      field_boundaries_sf <- field_boundaries_sf %>%
        dplyr::select(field, sub_field, sub_area) %>%
        sf::st_set_geometry("geometry")
    } else {
      field_boundaries_sf <- field_boundaries_sf %>%
        dplyr::select(field, sub_field) %>%
        sf::st_set_geometry("geometry")
    }

    # Convert to a terra vector if possible, otherwise fall back to sf
    field_boundaries <- tryCatch({
      field_boundaries_terra <- terra::vect(field_boundaries_sf)
      crs_value <- tryCatch(terra::crs(field_boundaries_terra), error = function(e) NULL)
      crs_str <- if (!is.null(crs_value)) as.character(crs_value) else ""

      if (is.null(crs_value) || length(crs_value) == 0 || nchar(crs_str) == 0) {
        terra::crs(field_boundaries_terra) <- "EPSG:4326"
        warning("Terra object CRS was empty, assigned WGS84 (EPSG:4326)")
      }
      field_boundaries_terra

    }, error = function(e) {
      warning(paste("Terra conversion failed, using sf object instead:", e$message))
      field_boundaries_sf
    })

    return(list(
      field_boundaries_sf = field_boundaries_sf,
      field_boundaries = field_boundaries
    ))
  }, error = function(e) {
    cat("[DEBUG] Error in load_field_boundaries:\n")
    cat("  Message:", e$message, "\n")
    cat("  Call:", deparse(e$call), "\n")
    stop(paste("Error loading field boundaries:", e$message))
  })
}



#' Generate a Sequence of Dates for Processing
#'
#' Creates a date range ending at end_date and extracts week/year info.
#' Used by Scripts 20, 30, 40 to determine data processing windows.
#'
#' @param end_date The end date for the sequence (Date object or "YYYY-MM-DD" string)
#' @param offset Number of days in the window INCLUDING end_date (e.g. 7 gives
#'   the 7 days up to and including end_date); must be >= 1
#' @return A list containing:
#'   - week: ISO week number of end_date
#'   - year: ISO year of end_date
#'   - days_filter: Vector of dates in "YYYY-MM-DD" format
#'   - start_date: Start date as Date object
#'   - end_date: End date as Date object
#'
#' @details
#' Uses lubridate::isoweek()/isoyear() of END_DATE for the reporting period,
#' so week/year follow ISO-8601 (week 1 contains the first Thursday of the
#' year; the ISO year can differ from the calendar year near year boundaries).
#'
#' @examples
#' dates <- date_list(as.Date("2025-01-15"), offset = 7)
#' # week/year of 2025-01-15; days_filter = c("2025-01-09", ..., "2025-01-15")
#'
#' dates <- date_list("2025-12-31", offset = 14)
#' # Handles string input and returns 14 days of data
#'
date_list <- function(end_date, offset) {
  # Input validation: coerce strings to Date, reject unparseable input
  if (!lubridate::is.Date(end_date)) {
    end_date <- as.Date(end_date)
    if (is.na(end_date)) {
      stop("Invalid end_date provided. Expected a Date object or a string convertible to Date.")
    }
  }

  offset <- as.numeric(offset)
  if (is.na(offset) || offset < 1) {
    stop("Invalid offset provided. Expected a positive number.")
  }

  # Calculate date range (offset - 1 so the window INCLUDES end_date)
  offset <- offset - 1 # Adjust offset to include end_date
  start_date <- end_date - lubridate::days(offset)

  # Extract ISO week and year information (from END date for reporting period)
  week <- lubridate::isoweek(end_date)
  year <- lubridate::isoyear(end_date)

  # Generate sequence of dates as "YYYY-MM-DD" strings for consistent filtering
  days_filter <- seq(from = start_date, to = end_date, by = "day")
  days_filter <- format(days_filter, "%Y-%m-%d")

  # Log the date range
  safe_log(paste("Date range generated from", start_date, "to", end_date))

  return(list(
    "week" = week,
    "year" = year,
    "days_filter" = days_filter,
    "start_date" = start_date,
    "end_date" = end_date
  ))
}

# ==============================================================================
#' Repair Invalid GeoJSON Geometries
#'
#' Fixes common geometry issues in GeoJSON/sf objects:
#'   - Degenerate vertices (duplicate points)
#'   - Self-intersecting polygons
#'   - Invalid ring orientation
#'   - Empty or NULL geometries
#'
#' Uses sf::st_make_valid() with a row-wise fallback.
#'
#' @param sf_object sf object (GeoDataFrame) with potentially invalid geometries
#' @return sf object with repaired geometries
#'
#' @details
#' **Why this matters:**
#' Pivot GeoJSON files sometimes contain degenerate vertices or
#' self-intersecting rings from manual editing or GIS data sources. These
#' cause errors under S2 geometry (strict validation) during cropping.
#'
#' **Repair strategy (priority order):**
#' 1. Try st_make_valid() - GEOS-based repair (most reliable)
#' 2. Fallback: row-wise st_make_valid() when feature count changes
#' 3. Last resort: keep the original geometry if repair fails
Last resort: Silently keep original if repair fails +#' +#' @examples +#' \dontrun{ +#' fields <- st_read("pivot.geojson") +#' fields_fixed <- repair_geojson_geometries(fields) +#' cat(paste("Fixed geometries: before=", +#' nrow(fields[!st_is_valid(fields), ]), +#' ", after=", +#' nrow(fields_fixed[!st_is_valid(fields_fixed), ]))) +#' } +#' +repair_geojson_geometries <- function(sf_object) { + if (!inherits(sf_object, "sf")) { + stop("Input must be an sf (Simple Features) object") + } + + # Count invalid geometries BEFORE repair + invalid_before <- sum(!sf::st_is_valid(sf_object), na.rm = TRUE) + + if (invalid_before == 0) { + safe_log("All geometries already valid - no repair needed", "INFO") + return(sf_object) + } + + safe_log(paste("Found", invalid_before, "invalid geometries - attempting repair"), "WARNING") + + # STRATEGY: Apply st_make_valid() to entire sf object (most reliable for GEOS) + # This handles degenerate vertices, self-intersections, invalid rings while preserving all features + repaired <- tryCatch({ + # st_make_valid() on entire sf object preserves all features and attributes + repaired_geom <- sf::st_make_valid(sf_object) + + # Verify we still have the same number of rows + if (nrow(repaired_geom) != nrow(sf_object)) { + warning("st_make_valid() changed number of features - attempting row-wise repair") + + # Fallback: Repair row-by-row to maintain original structure + repaired_geom <- sf_object + for (i in seq_len(nrow(sf_object))) { + tryCatch({ + if (!sf::st_is_valid(sf_object[i, ])) { + repaired_geom[i, ] <- sf::st_make_valid(sf_object[i, ]) + } + }, error = function(e) { + safe_log(paste("Could not repair row", i, "-", e$message), "WARNING") + }) + } + } + + safe_log("✓ st_make_valid() successfully repaired geometries", "INFO") + repaired_geom + }, error = function(e) { + safe_log(paste("st_make_valid() failed:", e$message), "WARNING") + NULL + }) + + # If repair failed, keep original + if (is.null(repaired)) { + safe_log(paste("Could not 
repair", invalid_before, "invalid geometries - keeping original"), + "WARNING") + return(sf_object) + } + + # Count invalid geometries AFTER repair + invalid_after <- sum(!sf::st_is_valid(repaired), na.rm = TRUE) + safe_log(paste("Repair complete: before =", invalid_before, ", after =", invalid_after), "INFO") + + return(repaired) +} + +# ============================================================================== +# END 00_COMMON_UTILS.R +# ============================================================================== diff --git a/r_app/run_full_pipeline.R b/r_app/run_full_pipeline.R index 0336898..21e3f78 100644 --- a/r_app/run_full_pipeline.R +++ b/r_app/run_full_pipeline.R @@ -30,8 +30,8 @@ # ============================================================================== # *** EDIT THESE VARIABLES *** -end_date <- Sys.Date() # or specify: as.Date("2026-01-27") , Sys.Date() -project_dir <- "angata" # project name: "esa", "aura", "angata", "chemba" +end_date <- as.Date("2026-01-27") # or specify: as.Date("2026-01-27") , Sys.Date() +project_dir <- "aura" # project name: "esa", "aura", "angata", "chemba" data_source <- "merged_tif" # Standard data source directory force_rerun <- FALSE # Set to TRUE to force all scripts to run even if outputs exist # *************************** @@ -51,8 +51,8 @@ cat(sprintf("\nProject: %s → Client Type: %s\n", project_dir, client_type)) # ============================================================================== # Script 80 (KPIs) needs N weeks of historical data for trend analysis and reporting # We calculate this automatically based on client type -reporting_weeks_needed <- 1 # Default: KPIs need current week of data for trends -offset <- reporting_weeks_needed * 7 # Convert weeks to days (minimum 7 days for 1 week) +reporting_weeks_needed <- 8 # CRITICAL: Need 8 weeks for 8-week trend analysis (Script 80 requirement) +offset <- reporting_weeks_needed * 7 # Convert weeks to days (8 weeks = 56 days) cat(sprintf("\n[INFO] 
Reporting window: %d weeks (%d days of data)\n", reporting_weeks_needed, offset)) wwy_current <- get_iso_week_year(end_date) @@ -176,14 +176,18 @@ if (!dir.exists(kpi_dir)) { } # Display status for each week -for (i in 1:nrow(kpis_needed)) { - row <- kpis_needed[i, ] - cat(sprintf( - " Week %02d/%d (%s): %s (%d files)\n", - row$week, row$year, format(row$date, "%Y-%m-%d"), - if (row$has_kpis) "✓ EXISTS" else "✗ WILL BE CALCULATED", - row$file_count - )) +if (nrow(kpis_needed) > 0) { + for (i in 1:nrow(kpis_needed)) { + row <- kpis_needed[i, ] + cat(sprintf( + " Week %02d/%d (%s): %s (%d files)\n", + row$week, row$year, format(row$date, "%Y-%m-%d"), + if (row$has_kpis) "✓ EXISTS" else "✗ WILL BE CALCULATED", + row$file_count + )) + } +} else { + cat(" (No weeks in reporting window)\n") } cat(sprintf( @@ -263,8 +267,9 @@ cat(sprintf("Script 40: %d missing week(s) to create\n", nrow(missing_weeks))) # Check Script 80 outputs (KPIs in reports/kpis/{field_level|field_analysis}) # kpi_dir already set by check_kpi_completeness() above +# Script 80 exports to .xlsx (Excel) and .rds (RDS) formats kpi_files <- if (dir.exists(kpi_dir)) { - list.files(kpi_dir, pattern = "\\.csv$|\\.json$") + list.files(kpi_dir, pattern = "\\.xlsx$|\\.rds$") } else { c() } @@ -317,6 +322,11 @@ tryCatch( cat(sprintf("[DEBUG] Sample files: %s\n", paste(head(existing_tiff_files, 3), collapse=", "))) } + # Find missing dates in the window + start_date <- end_date - data_generation_offset + date_seq <- seq(start_date, end_date, by = "day") + target_dates <- format(date_seq, "%Y-%m-%d") + # Get existing dates from tiles (better indicator of completion for tiled projects) existing_tile_dates <- tiles_dates @@ -325,14 +335,11 @@ tryCatch( # We don't download again if the file exists, regardless of whether tiles have been created yet if (length(existing_tiff_dates) > 0) { cat(sprintf("[DEBUG] Using TIFF dates for existence check (found %d existing files)\n", length(existing_tiff_dates))) - 
existing_tile_dates <- existing_tiff_dates + # IMPORTANT: Only consider existing TIFF dates that fall within our target window + # This prevents old 2025 data from masking missing 2026 data + existing_tile_dates <- existing_tiff_dates[existing_tiff_dates %in% target_dates] } - # Find missing dates in the window - start_date <- end_date - data_generation_offset - date_seq <- seq(start_date, end_date, by = "day") - target_dates <- format(date_seq, "%Y-%m-%d") - # Only download if files don't exist yet (tiles for tiled projects, TIFFs for single-file) missing_dates <- target_dates[!(target_dates %in% existing_tile_dates)] @@ -394,7 +401,7 @@ if (pipeline_success && !skip_10) { # Run Script 10 via system() - NEW per-field version # Arguments: project_dir cmd <- sprintf( - '"%s" --vanilla r_app/10_create_per_field_tiffs.R "%s"', + '"%s" r_app/10_create_per_field_tiffs.R "%s"', RSCRIPT_PATH, project_dir ) @@ -424,6 +431,96 @@ if (pipeline_success && !skip_10) { cat("\n========== SKIPPING SCRIPT 10 (per-field TIFFs already exist) ==========\n") } +# ============================================================================== +# CHECK: Per-Field TIFFs Without CI Data +# ============================================================================== +# IMPORTANT: Script 10 creates per-field TIFFs for ALL dates in merged_tif/ +# But Script 20 only processes dates within the offset window. +# This check finds dates that have per-field TIFFs but NO CI data, +# and forces Script 20 to process them regardless of offset. 
+cat("\n========== CHECKING FOR PER-FIELD TIFFs WITHOUT CI DATA ==========\n") + +field_tiles_dir <- paths$field_tiles_dir +field_tiles_ci_dir <- paths$field_tiles_ci_dir +ci_daily_dir <- paths$daily_ci_vals_dir + +# Get all dates that have per-field TIFFs +tiff_dates_all <- c() +if (dir.exists(field_tiles_dir)) { + # Check all field subdirectories + fields <- list.dirs(field_tiles_dir, full.names = FALSE, recursive = FALSE) + fields <- fields[fields != ""] + + if (length(fields) > 0) { + for (field in fields) { + field_path <- file.path(field_tiles_dir, field) + # Get dates from TIFF filenames: YYYY-MM-DD_*.tif or similar + tiff_files <- list.files(field_path, pattern = "^\\d{4}-\\d{2}-\\d{2}.*\\.tif$") + dates_in_field <- unique(sub("_.*$", "", tiff_files)) # Extract YYYY-MM-DD + tiff_dates_all <- unique(c(tiff_dates_all, dates_in_field)) + } + } +} + +# Get all dates that have CI data (either from field_tiles_CI or extracted_ci) +ci_dates_all <- c() +if (dir.exists(field_tiles_ci_dir)) { + # Check all field subdirectories for CI TIFFs + fields_ci <- list.dirs(field_tiles_ci_dir, full.names = FALSE, recursive = FALSE) + fields_ci <- fields_ci[fields_ci != ""] + + if (length(fields_ci) > 0) { + for (field in fields_ci) { + field_path <- file.path(field_tiles_ci_dir, field) + ci_tiff_files <- list.files(field_path, pattern = "^\\d{4}-\\d{2}-\\d{2}.*\\.tif$") + dates_in_field <- unique(sub("_.*$", "", ci_tiff_files)) + ci_dates_all <- unique(c(ci_dates_all, dates_in_field)) + } + } +} + +# Also check extracted_ci RDS files as source of truth +if (dir.exists(ci_daily_dir)) { + fields_rds <- list.dirs(ci_daily_dir, full.names = FALSE, recursive = FALSE) + fields_rds <- fields_rds[fields_rds != ""] + + if (length(fields_rds) > 0) { + for (field in fields_rds) { + field_path <- file.path(ci_daily_dir, field) + rds_files <- list.files(field_path, pattern = "^\\d{4}-\\d{2}-\\d{2}\\.rds$") + dates_in_field <- sub("\\.rds$", "", rds_files) + ci_dates_all <- 
unique(c(ci_dates_all, dates_in_field)) + } + } +} + +# Find dates with TIFFs but no CI data +dates_missing_ci <- setdiff(tiff_dates_all, ci_dates_all) + +cat(sprintf("Total per-field TIFF dates: %d\n", length(tiff_dates_all))) +cat(sprintf("Total CI data dates: %d\n", length(ci_dates_all))) +cat(sprintf("Dates with TIFFs but NO CI: %d\n", length(dates_missing_ci))) + +# If there are per-field TIFFs without CI, force Script 20 to run with extended date range +if (length(dates_missing_ci) > 0) { + cat("\n⚠ Found per-field TIFFs without CI data - forcing Script 20 to process them\n") + cat(sprintf(" Sample missing dates: %s\n", paste(head(dates_missing_ci, 3), collapse=", "))) + + # Calculate extended date range: from earliest missing date to end_date + earliest_missing_tiff <- min(as.Date(dates_missing_ci)) + extended_offset <- as.numeric(end_date - earliest_missing_tiff) + + cat(sprintf(" Extended offset: %d days (from %s to %s)\n", + extended_offset, format(earliest_missing_tiff, "%Y-%m-%d"), format(end_date, "%Y-%m-%d"))) + + # Use extended offset for Script 20 + offset_for_ci <- extended_offset + skip_20 <- FALSE # Force Script 20 to run +} else { + cat("✓ All per-field TIFFs have corresponding CI data\n") + offset_for_ci <- offset # Use normal offset +} + # ============================================================================== # SCRIPT 20: CI EXTRACTION # ============================================================================== @@ -433,11 +530,11 @@ if (pipeline_success && !skip_20) { { # Run Script 20 via system() to pass command-line args just like from terminal # Arguments: project_dir end_date offset - # Use FULL offset so CI extraction covers entire reporting window (not just new data) + # Use offset_for_ci which may have been extended if per-field TIFFs exist without CI cmd <- sprintf( - '"%s" --vanilla r_app/20_ci_extraction_per_field.R "%s" "%s" %d', + '"%s" r_app/20_ci_extraction_per_field.R "%s" "%s" %d', RSCRIPT_PATH, - project_dir, 
format(end_date, "%Y-%m-%d"), offset + project_dir, format(end_date, "%Y-%m-%d"), offset_for_ci ) result <- system(cmd) @@ -507,7 +604,7 @@ if (pipeline_success && !skip_30) { # Script 30 expects: project_dir only # Per-field version reads CI data from Script 20 per-field output location cmd <- sprintf( - '"%s" --vanilla r_app/30_interpolate_growth_model.R "%s"', + '"%s" r_app/30_interpolate_growth_model.R "%s"', RSCRIPT_PATH, project_dir ) @@ -517,11 +614,11 @@ if (pipeline_success && !skip_30) { stop("Script 30 exited with error code:", result) } - # Verify interpolated output - growth_dir <- paths$growth_model_interpolated_dir - if (dir.exists(growth_dir)) { - files <- list.files(growth_dir, pattern = "\\.rds$|\\.csv$") - cat(sprintf("✓ Script 30 completed - generated %d growth model files\n", length(files))) + # Verify interpolated output - Script 30 saves to cumulative_ci_vals_dir + cumulative_ci_vals_dir <- paths$cumulative_ci_vals_dir + if (dir.exists(cumulative_ci_vals_dir)) { + files <- list.files(cumulative_ci_vals_dir, pattern = "\\.rds$") + cat(sprintf("✓ Script 30 completed - generated %d interpolated RDS file(s)\n", length(files))) } else { cat("✓ Script 30 completed\n") } @@ -549,12 +646,9 @@ if (pipeline_success && !skip_31) { if (result == 0) { # Verify harvest output - check for THIS WEEK's specific file wwy_current_31 <- get_iso_week_year(end_date) - expected_file <- file.path( - "laravel_app", "storage", "app", project_dir, "reports", "kpis", "field_stats", - sprintf("%s_harvest_imminent_week_%02d_%d.csv", project_dir, wwy_current_31$week, wwy_current_31$year) - ) + harvest_exists <- check_harvest_output_exists(project_dir, wwy_current_31$week, wwy_current_31$year) - if (file.exists(expected_file)) { + if (harvest_exists) { cat(sprintf("✓ Script 31 completed - generated harvest imminent file for week %02d\n", wwy_current_31$week)) } else { cat("✓ Script 31 completed (check if harvest.xlsx is available)\n") @@ -600,7 +694,7 @@ if 
(pipeline_success && !skip_40) { # The end_date is the last day of the week, and offset=7 covers the full 7-day week # Arguments: end_date offset project_dir cmd <- sprintf( - '"%s" --vanilla r_app/40_mosaic_creation_per_field.R "%s" 7 "%s"', + '"%s" r_app/40_mosaic_creation_per_field.R "%s" 7 "%s"', RSCRIPT_PATH, format(week_end_date, "%Y-%m-%d"), project_dir ) @@ -610,24 +704,9 @@ if (pipeline_success && !skip_40) { stop("Script 40 exited with error code:", result) } - # Verify mosaic was created for this specific week - mosaic_created <- FALSE - if (mosaic_mode == "tiled") { - mosaic_dir <- get_mosaic_dir(project_dir, mosaic_mode = "tiled") - if (dir.exists(mosaic_dir)) { - week_pattern <- sprintf("week_%02d_%d\\.tif", week_num, year_num) - mosaic_files <- list.files(mosaic_dir, pattern = week_pattern) - mosaic_created <- length(mosaic_files) > 0 - } - } else { - mosaic_dir <- paths$weekly_mosaic_dir - if (dir.exists(mosaic_dir)) { - week_pattern <- sprintf("week_%02d_%d\\.tif", week_num, year_num) - # NEW: Support per-field architecture - search recursively for mosaics in field subdirectories - mosaic_files <- list.files(mosaic_dir, pattern = week_pattern, recursive = TRUE, full.names = FALSE) - mosaic_created <- length(mosaic_files) > 0 - } - } + # Verify mosaic was created for this specific week (centralized helper function) + mosaic_check <- check_mosaic_exists(project_dir, week_num, year_num, mosaic_mode) + mosaic_created <- mosaic_check$created if (mosaic_created) { cat(sprintf("✓ Week %02d/%d mosaic created successfully\n\n", week_num, year_num)) @@ -682,7 +761,7 @@ if (pipeline_success && !skip_80) { # Run Script 80 for this specific week with offset=7 (one week only) # This ensures Script 80 calculates KPIs for THIS week with proper trend data cmd <- sprintf( - '"%s" --vanilla r_app/80_calculate_kpis.R "%s" "%s" %d', + '"%s" r_app/80_calculate_kpis.R "%s" "%s" %d', RSCRIPT_PATH, format(calc_date, "%Y-%m-%d"), project_dir, 7 ) # offset=7 for single week 
@@ -692,7 +771,7 @@ if (pipeline_success && !skip_80) { week_row$week, week_row$year, format(calc_date, "%Y-%m-%d") )) - result <- system(cmd, ignore.stdout = TRUE, ignore.stderr = TRUE) + result <- system(cmd, ignore.stdout = FALSE, ignore.stderr = FALSE) if (result == 0) { cat(sprintf(" ✓ KPIs calculated for week %02d/%d\n", week_row$week, week_row$year)) @@ -706,7 +785,7 @@ if (pipeline_success && !skip_80) { # Verify total KPI output (kpi_dir defined by check_kpi_completeness() earlier) if (dir.exists(kpi_dir)) { - files <- list.files(kpi_dir, pattern = "\\.csv$|\\.json$") + files <- list.files(kpi_dir, pattern = "\\.xlsx$|\\.rds$") # Extract subdir name from kpi_dir path for display subdir_name <- basename(kpi_dir) cat(sprintf("\n✓ Script 80 loop completed - total %d KPI files in %s/\n", length(files), subdir_name)) @@ -739,12 +818,15 @@ if (dir.exists(kpi_dir)) { week_num <- as.numeric(format(check_date, "%V")) year_num <- as.numeric(format(check_date, "%G")) - # Check for any KPI file from that week - week_pattern <- sprintf("week%02d_%d", week_num, year_num) + # Check for any KPI file from that week (flexible pattern to match all formats) + # Matches: week_05_2026, AURA_KPI_week_05_2026, etc. 
+ week_pattern <- sprintf("_week_%02d_%d|week_%02d_%d", week_num, year_num, week_num, year_num) # NEW: Support per-field architecture - search recursively for KPI files in field subdirectories kpi_files_this_week <- list.files(kpi_dir, pattern = week_pattern, recursive = TRUE, full.names = FALSE) - if (length(kpi_files_this_week) == 0) { + if (length(kpi_files_this_week) > 0) { + cat(sprintf(" Week %02d/%d: ✓ KPIs found (%d files)\n", week_num, year_num, length(kpi_files_this_week))) + } else { kpis_complete <- FALSE cat(sprintf(" Week %02d/%d: ✗ KPIs not found\n", week_num, year_num)) } @@ -752,9 +834,9 @@ if (dir.exists(kpi_dir)) { } if (kpis_complete) { - cat("✓ All KPIs available - reports can be generated\n") + cat("✓ All KPIs available - full reporting window complete\n") } else { - cat("⚠ Some KPIs still missing - reports will be skipped\n") + cat("⚠ Note: Some KPIs may still be missing - Script 80 calculated what was available\n") } # ============================================================================== @@ -763,23 +845,20 @@ if (kpis_complete) { if (pipeline_success && run_legacy_report) { cat("\n========== RUNNING SCRIPT 90: LEGACY WORD REPORT ==========\n") - if (!kpis_complete) { - cat("⚠ Skipping Script 90 - KPIs not available for full reporting window\n") - } else { - tryCatch( - { - # Script 90 is an RMarkdown file - compile it with rmarkdown::render() - output_dir <- paths$reports_dir + tryCatch( + { + # Script 90 is an RMarkdown file - compile it with rmarkdown::render() + output_dir <- paths$reports_dir - # Reports directory already created by setup_project_directories + # Reports directory already created by setup_project_directories - output_filename <- sprintf( - "CI_report_week%02d_%d.docx", - as.numeric(format(end_date, "%V")), - as.numeric(format(end_date, "%G")) - ) + output_filename <- sprintf( + "CI_report_week%02d_%d.docx", + as.numeric(format(end_date, "%V")), + as.numeric(format(end_date, "%G")) + ) - # Render the RMarkdown 
document + # Render the RMarkdown document rmarkdown::render( input = "r_app/90_CI_report_with_kpis_simple.Rmd", output_dir = output_dir, @@ -798,9 +877,8 @@ if (pipeline_success && run_legacy_report) { pipeline_success <<- FALSE } ) - } } else if (run_legacy_report) { - cat("\n========== SKIPPING SCRIPT 90 (pipeline error or KPIs incomplete) ==========\n") + cat("\n========== SKIPPING SCRIPT 90 (pipeline error) ==========\n") } # ============================================================================== @@ -809,10 +887,7 @@ if (pipeline_success && run_legacy_report) { if (pipeline_success && run_modern_report) { cat("\n========== RUNNING SCRIPT 91: MODERN WORD REPORT ==========\n") - if (!kpis_complete) { - cat("⚠ Skipping Script 91 - KPIs not available for full reporting window\n") - } else { - tryCatch( + tryCatch( { # Script 91 is an RMarkdown file - compile it with rmarkdown::render() output_dir <- paths$reports_dir @@ -844,9 +919,8 @@ if (pipeline_success && run_modern_report) { pipeline_success <<- FALSE } ) - } } else if (run_modern_report) { - cat("\n========== SKIPPING SCRIPT 91 (pipeline error or KPIs incomplete) ==========\n") + cat("\n========== SKIPPING SCRIPT 91 (pipeline error) ==========\n") } # ==============================================================================