# SmartCane/r_app/00_common_utils.R

# ==============================================================================
# 00_COMMON_UTILS.R
# ==============================================================================
# GENERIC UTILITY FUNCTIONS FOR SMARTCANE PIPELINE
#
# PURPOSE:
# Centralized location for foundational utilities used across multiple scripts.
# These functions have NO project knowledge, NO client-type dependencies,
# NO domain-specific logic.
#
# USAGE:
# All scripts (10, 20, 21, 30, 40, 80, 90, 91) should source this file:
#
# source(here::here("r_app", "parameters_project.R")) # Config first
# source(here::here("r_app", "00_common_utils.R")) # Then common utilities
#
# FUNCTIONS:
# 1. safe_log() — Generic logging with [LEVEL] prefix
# 2. smartcane_debug() — Conditional debug logging
# 3. smartcane_warn() — Convenience wrapper for WARN-level messages
# 4. date_list() — Generate date sequences for processing windows
# 5. get_iso_week() — Extract ISO week number from date
# 6. get_iso_year() — Extract ISO year from date
# 7. get_iso_week_year() — Extract both ISO week and year as list
# 8. format_week_label() — Format date as week/year label (e.g., "week01_2025")
# 9. load_field_boundaries() — Load field geometries from GeoJSON
# 10. load_harvesting_data() — Load harvest schedule from Excel
# 11. repair_geojson_geometries() — Repair invalid sf geometries before spatial operations
#
# ==============================================================================
#' Safe Logging Function
#'
#' Writes a message to stdout prefixed with its log level, e.g. "[INFO] text".
#' Standalone — no logging framework required. Consistent with the SmartCane
#' logging standard.
#'
#' @param message The message to log
#' @param level The log level (default: "INFO"). Options: "INFO", "WARNING", "ERROR", "DEBUG"
#' @return NULL (invisible, used for side effects)
#'
#' @examples
#' safe_log("Processing started", "INFO")
#' safe_log("Check input file", "WARNING")
#' safe_log("Failed to load data", "ERROR")
#'
safe_log <- function(message, level = "INFO") {
  # Emit "[LEVEL] message" followed by a newline; cat() returns invisible NULL.
  cat("[", level, "] ", message, "\n", sep = "")
}
#' SmartCane Debug Logging (Conditional)
#'
#' Emits a DEBUG-level message when explicitly requested (verbose = TRUE) or
#' when the SMARTCANE_DEBUG environment variable equals "TRUE". Otherwise the
#' call is a silent no-op, keeping normal pipeline output uncluttered.
#'
#' @param message The message to log
#' @param verbose Force output regardless of SMARTCANE_DEBUG (default: FALSE)
#' @return NULL (invisible, used for side effects)
#'
#' @examples
#' smartcane_debug("Processing field 1", verbose = FALSE) # Only if SMARTCANE_DEBUG=TRUE
#' smartcane_debug("Detailed state info", verbose = TRUE) # Always outputs
#'
smartcane_debug <- function(message, verbose = FALSE) {
  debug_enabled <- verbose || identical(Sys.getenv("SMARTCANE_DEBUG"), "TRUE")
  if (!debug_enabled) {
    return(invisible(NULL))
  }
  safe_log(message, level = "DEBUG")
}
#' SmartCane Warning Logging
#'
#' Logs WARNING-level messages. Convenience wrapper around safe_log().
#'
#' @param message The message to log
#' @return NULL (invisible, used for side effects)
#'
#' @examples
#' smartcane_warn("Check data format before proceeding")
#'
smartcane_warn <- function(message) {
  # Use "WARNING" (not "WARN") so the [LEVEL] prefix matches the levels
  # documented by safe_log() and used elsewhere in this file — a single
  # consistent tag keeps log filtering/grep reliable.
  safe_log(message, level = "WARNING")
}
#' Extract ISO Week Number from Date
#'
#' Extracts the ISO-8601 week number (1-53) from a date using the "%V" format
#' specifier. ISO weeks start on Monday; week 1 is the week containing the
#' year's first Thursday.
#'
#' @param date A Date object, or a string convertible to Date (e.g. "2025-01-15")
#' @return Numeric: ISO week number (1-53)
#'
#' @examples
#' get_iso_week(as.Date("2025-01-15")) # Returns: 3
#' get_iso_week("2025-01-15")          # Returns: 3
#'
get_iso_week <- function(date) {
  # Coerce non-Date input: format() on a plain character vector returns the
  # string unchanged, so without this as.numeric() would yield NA.
  if (!inherits(date, "Date")) {
    date <- as.Date(date)
  }
  as.numeric(format(date, "%V"))
}
#' Extract ISO Year from Date
#'
#' Extracts the ISO-8601 year from a date using the "%G" format specifier.
#' The ISO year can differ from the calendar year around year boundaries
#' (e.g. 2024-12-30 belongs to ISO year 2025).
#'
#' @param date A Date object, or a string convertible to Date (e.g. "2025-01-01")
#' @return Numeric: ISO year
#'
#' @examples
#' get_iso_year(as.Date("2025-01-01")) # Returns: 2025
#' get_iso_year("2024-12-30")          # Returns: 2025 (ISO year boundary)
#'
get_iso_year <- function(date) {
  # Coerce non-Date input: format() on a plain character vector returns the
  # string unchanged, so without this as.numeric() would yield NA.
  if (!inherits(date, "Date")) {
    date <- as.Date(date)
  }
  as.numeric(format(date, "%G"))
}
#' Extract ISO Week and Year as List
#'
#' Returns both the ISO week number ("%V") and ISO year ("%G") for a date,
#' matching the semantics of get_iso_week() and get_iso_year().
#'
#' @param date A Date object, or a string convertible to Date
#' @return List with elements: week (1-53), year
#'
#' @examples
#' wwy <- get_iso_week_year(as.Date("2025-01-15"))
#' # Returns: list(week = 3, year = 2025)
#'
get_iso_week_year <- function(date) {
  # Coerce non-Date input: format() on a plain character vector returns the
  # string unchanged, so without this as.numeric() would yield NA.
  if (!inherits(date, "Date")) {
    date <- as.Date(date)
  }
  list(
    week = as.numeric(format(date, "%V")),
    year = as.numeric(format(date, "%G"))
  )
}
#' Format Date as Week/Year Label
#'
#' Builds a compact "week##_YYYY" label from a date — useful for filenames,
#' directory names, and output identification.
#'
#' @param date A Date object or string convertible to Date
#' @param separator Separator between week number and year (default: "_")
#' @return String in format "week##_YYYY" (e.g., "week03_2025")
#'
#' @examples
#' format_week_label(as.Date("2025-01-15")) # "week03_2025"
#' format_week_label(as.Date("2025-01-15"), "-") # "week03-2025"
#'
format_week_label <- function(date, separator = "_") {
  iso <- get_iso_week_year(date)
  # Zero-pad the week number to two digits ("3" -> "03").
  week_part <- formatC(iso$week, width = 2, flag = "0", format = "d")
  paste0("week", week_part, separator, iso$year)
}
#' Load Field Boundaries from GeoJSON
#'
#' Loads field polygon geometries from GeoJSON file (pivot.geojson or pivot_2.geojson).
#' Handles CRS validation and column standardization.
#'
#' @param data_dir Directory containing GeoJSON file
#' @return List with elements:
#' - field_boundaries_sf: sf (Simple Features) object
#' - field_boundaries: terra SpatVect object (if conversion successful, else sf fallback)
#'
#' @details
#' Automatically selects pivot_2.geojson for ESA project during CI extraction,
#' otherwise uses pivot.geojson. Handles both multi-polygon and simple polygon geometries.
#' Invalid geometries are repaired via repair_geojson_geometries() BEFORE any CRS
#' handling. A missing CRS is assigned WGS84 (EPSG:4326) with a warning. Stops
#' with an error if the file is missing or cannot be loaded.
#'
#' @examples
#' boundaries <- load_field_boundaries("laravel_app/storage/app/angata")
#' head(boundaries$field_boundaries_sf)
#'
load_field_boundaries <- function(data_dir) {
# Choose field boundaries file based on project and script type
# ESA project uses pivot_2.geojson ONLY for scripts 02-03 (CI extraction & growth model)
# All other scripts (including 04-mosaic, 09-KPIs, 10-reports) use pivot.geojson
# NOTE(review): exists("ci_extraction_script") only tests that the flag is DEFINED,
# not that it is TRUE — confirm scripts other than 02-03 never leave it defined.
use_pivot_2 <- exists("project_dir") && project_dir == "esa" &&
exists("ci_extraction_script") # ci_extraction_script flag set by scripts 02-03
if (use_pivot_2) {
field_boundaries_path <- here(data_dir, "pivot_2.geojson")
} else {
# NOTE(review): this branch looks inside a "Data" subdirectory while the
# pivot_2 branch reads directly from data_dir — confirm the asymmetry is intentional.
field_boundaries_path <- here(data_dir, "Data", "pivot.geojson")
}
if (!file.exists(field_boundaries_path)) {
stop(paste("Field boundaries file not found at path:", field_boundaries_path))
}
tryCatch({
# Read GeoJSON with explicit CRS handling
field_boundaries_sf <- st_read(field_boundaries_path, quiet = TRUE)
# Remove OBJECTID column immediately if it exists
if ("OBJECTID" %in% names(field_boundaries_sf)) {
field_boundaries_sf <- field_boundaries_sf %>% select(-OBJECTID)
}
# **CRITICAL**: Repair invalid geometries (degenerate vertices, self-intersections, etc.)
# This must happen BEFORE any spatial operations (CRS transform, intersect, crop, etc.)
# to prevent S2 geometry validation errors during downstream processing
field_boundaries_sf <- repair_geojson_geometries(field_boundaries_sf)
# Validate and fix CRS if needed
tryCatch({
# Simply assign WGS84 if not already set (safe approach)
if (is.na(sf::st_crs(field_boundaries_sf)$epsg)) {
st_crs(field_boundaries_sf) <- 4326
warning("CRS was missing, assigned WGS84 (EPSG:4326)")
}
}, error = function(e) {
# Fallback path: the super-assignment (<<-) reaches the 'field_boundaries_sf'
# binding in the enclosing function environment from inside this handler.
tryCatch({
st_crs(field_boundaries_sf) <<- 4326
}, error = function(e2) {
warning(paste("Could not set CRS:", e2$message))
})
})
# Handle column names - accommodate optional sub_area column
if ("sub_area" %in% names(field_boundaries_sf)) {
field_boundaries_sf <- field_boundaries_sf %>%
dplyr::select(field, sub_field, sub_area) %>%
sf::st_set_geometry("geometry")
} else {
field_boundaries_sf <- field_boundaries_sf %>%
dplyr::select(field, sub_field) %>%
sf::st_set_geometry("geometry")
}
# Convert to terra vector if possible, otherwise use sf
field_boundaries <- tryCatch({
field_boundaries_terra <- terra::vect(field_boundaries_sf)
# terra::crs() may error or yield an empty string; both are treated as "no CRS".
crs_value <- tryCatch(terra::crs(field_boundaries_terra), error = function(e) NULL)
crs_str <- if (!is.null(crs_value)) as.character(crs_value) else ""
if (is.null(crs_value) || length(crs_value) == 0 || nchar(crs_str) == 0) {
terra::crs(field_boundaries_terra) <- "EPSG:4326"
warning("Terra object CRS was empty, assigned WGS84 (EPSG:4326)")
}
field_boundaries_terra
}, error = function(e) {
# Graceful degradation: downstream callers receive the sf object under the
# 'field_boundaries' name instead of a SpatVect.
warning(paste("Terra conversion failed, using sf object instead:", e$message))
field_boundaries_sf
})
return(list(
field_boundaries_sf = field_boundaries_sf,
field_boundaries = field_boundaries
))
}, error = function(e) {
# Surface diagnostic context before re-raising as a fatal error.
cat("[DEBUG] Error in load_field_boundaries:\n")
cat(" Message:", e$message, "\n")
cat(" Call:", deparse(e$call), "\n")
stop(paste("Error loading field boundaries:", e$message))
})
}
#' Load Harvesting Data from Excel
#'
#' Loads crop harvest schedule from harvest.xlsx file.
#' Handles flexible date formats (numeric, YYYY-MM-DD, DD/MM/YYYY, etc.).
#'
#' @param data_dir Directory containing harvest.xlsx file
#' @return Data frame with columns: field, sub_field, year, season_start, season_end,
#' age (weeks), sub_area, tonnage_ha. Returns NULL if file not found or unreadable.
#'
#' @examples
#' harvest <- load_harvesting_data("laravel_app/storage/app/angata")
#' head(harvest)
#'
load_harvesting_data <- function(data_dir) {
  harvest_file <- here(data_dir, "harvest.xlsx")
  if (!file.exists(harvest_file)) {
    warning(paste("Harvest data file not found at path:", harvest_file))
    return(NULL)
  }
  # Helper: parse one cell into a Date, trying several representations.
  # Returns NA_real_ when the value is missing or no format matches.
  parse_flexible_date <- function(x) {
    # BUGFIX: NULL/length-0 must be checked BEFORE is.na() — is.na(NULL)
    # returns logical(0), which is an error inside `||` (R >= 4.3) rather
    # than FALSE, so the original order could crash on empty cells.
    if (is.null(x) || length(x) == 0 || is.na(x)) return(NA_real_)
    if (inherits(x, "Date")) return(x)
    if (inherits(x, "POSIXct")) return(as.Date(x))
    # Numeric values are treated as Excel date serials (1900 date system).
    if (is.numeric(x)) {
      return(as.Date(x, origin = "1899-12-30"))
    }
    # Fall back to string parsing with multiple candidate formats.
    x_char <- as.character(x)
    formats <- c("%Y-%m-%d", "%d/%m/%Y", "%m/%d/%Y", "%Y-%m-%d %H:%M:%S")
    for (fmt in formats) {
      result <- suppressWarnings(as.Date(x_char, format = fmt))
      if (!is.na(result)) return(result)
    }
    # Consistent NA type (was plain NA) so sapply() simplification stays numeric.
    return(NA_real_)
  }
  tryCatch({
    harvesting_data <- read_excel(harvest_file) %>%
      dplyr::select(
        c(
          "field",
          "sub_field",
          "year",
          "season_start",
          "season_end",
          "age",
          "sub_area",
          "tonnage_ha"
        )
      ) %>%
      mutate(
        field = as.character(field),
        sub_field = as.character(sub_field),
        year = as.numeric(year),
        # sapply() strips the Date class (result is numeric days-since-epoch),
        # so the class is re-applied with the epoch origin just below.
        season_start = sapply(season_start, parse_flexible_date),
        season_end = sapply(season_end, parse_flexible_date),
        season_start = as.Date(season_start, origin = "1970-01-01"),
        season_end = as.Date(season_end, origin = "1970-01-01"),
        age = as.numeric(age),
        sub_area = as.character(sub_area),
        tonnage_ha = as.numeric(tonnage_ha)
      ) %>%
      mutate(
        # Cap missing/future season ends at today so the recomputed age is
        # never negative or NA for in-progress seasons.
        season_end = case_when(
          season_end > Sys.Date() ~ Sys.Date(),
          is.na(season_end) ~ Sys.Date(),
          TRUE ~ season_end
        ),
        # Crop age in whole weeks over the (possibly capped) season window.
        age = round(as.numeric(season_end - season_start) / 7, 0)
      )
    return(harvesting_data)
  }, error = function(e) {
    warning(paste("Error loading harvesting data:", e$message))
    return(NULL)
  })
}
#' Generate a Sequence of Dates for Processing
#'
#' Creates a date range from start_date to end_date and extracts week/year info.
#' Used by Scripts 20, 30, 40 to determine data processing windows.
#'
#' @param end_date The end date for the sequence (Date object or "YYYY-MM-DD" string)
#' @param offset Number of days to look back from end_date (e.g., 7 for one week);
#'   the returned window INCLUDES end_date, so offset = 7 yields 7 days total
#' @return A list containing:
#' - week: ISO week number of end_date
#' - year: ISO year of end_date
#' - days_filter: Vector of dates in "YYYY-MM-DD" format
#' - start_date: Start date as Date object
#' - end_date: End date as Date object
#'
#' @details
#' Uses `lubridate::isoweek()` and `lubridate::isoyear()`, so week/year follow
#' the ISO-8601 convention (week 1 contains the year's first Thursday; the ISO
#' year can differ from the calendar year near January 1). Week and year are
#' taken from `end_date` — the reporting period — not from `start_date`.
#'
#' @examples
#' dates <- date_list(as.Date("2025-01-15"), offset = 7)
#' # Returns: week=3, year=2025, days_filter = c("2025-01-09", ..., "2025-01-15")
#'
#' dates <- date_list("2025-12-31", offset = 14)
#' # Handles string input and returns 14 days of data
#'
date_list <- function(end_date, offset) {
  # Input validation: coerce strings to Date. as.Date() ERRORS (rather than
  # returning NA) on unparseable character input, so catch the error and
  # re-signal with a clear message — the original bare is.na() guard never
  # fired for bad strings.
  if (!lubridate::is.Date(end_date)) {
    end_date <- tryCatch(as.Date(end_date), error = function(e) NA)
    if (length(end_date) != 1 || is.na(end_date)) {
      stop("Invalid end_date provided. Expected a Date object or a string convertible to Date.")
    }
  }
  offset <- as.numeric(offset)
  if (is.na(offset) || offset < 1) {
    stop("Invalid offset provided. Expected a positive number.")
  }
  # Calculate date range: subtract offset-1 so the window includes end_date.
  offset <- offset - 1
  start_date <- end_date - lubridate::days(offset)
  # Extract ISO week and year information (from END date for reporting period)
  week <- lubridate::isoweek(end_date)
  year <- lubridate::isoyear(end_date)
  # Generate sequence of dates
  days_filter <- seq(from = start_date, to = end_date, by = "day")
  days_filter <- format(days_filter, "%Y-%m-%d") # Format for consistent filtering
  # Log the date range
  safe_log(paste("Date range generated from", start_date, "to", end_date))
  return(list(
    "week" = week,
    "year" = year,
    "days_filter" = days_filter,
    "start_date" = start_date,
    "end_date" = end_date
  ))
}
# ==============================================================================
#' Repair Invalid GeoJSON Geometries
#'
#' Fixes common geometry issues in GeoJSON/sf objects:
#' - Degenerate vertices (duplicate points)
#' - Self-intersecting polygons
#' - Invalid ring orientation
#' - Empty or NULL geometries
#'
#' Uses sf::st_make_valid() with a row-wise retry as fallback.
#'
#' @param sf_object sf object (GeoDataFrame) with potentially invalid geometries
#' @return sf object with repaired geometries
#'
#' @details
#' **Why this matters:**
#' Pivot GeoJSON files sometimes contain degenerate vertices or self-intersecting
#' rings from manual editing or GIS data sources. These cause errors when using
#' S2 geometry (strict validation) during cropping operations.
#'
#' **Repair strategy (priority order):**
#' 1. Try st_make_valid() on the entire object — GEOS-based repair (most reliable)
#' 2. Fallback: if st_make_valid() changed the feature count, repair row-by-row
#'    so the original number of features and their order are preserved
#' 3. Last resort: keep the original object (with a WARNING) if repair fails
#'
#' @examples
#' \dontrun{
#' fields <- st_read("pivot.geojson")
#' fields_fixed <- repair_geojson_geometries(fields)
#' cat(paste("Fixed geometries: before=",
#' nrow(fields[!st_is_valid(fields), ]),
#' ", after=",
#' nrow(fields_fixed[!st_is_valid(fields_fixed), ])))
#' }
#'
repair_geojson_geometries <- function(sf_object) {
if (!inherits(sf_object, "sf")) {
stop("Input must be an sf (Simple Features) object")
}
# Count invalid geometries BEFORE repair (na.rm: st_is_valid can yield NA
# for corrupt geometries, which would otherwise poison the sum)
invalid_before <- sum(!sf::st_is_valid(sf_object), na.rm = TRUE)
if (invalid_before == 0) {
safe_log("All geometries already valid - no repair needed", "INFO")
return(sf_object)
}
safe_log(paste("Found", invalid_before, "invalid geometries - attempting repair"), "WARNING")
# STRATEGY: Apply st_make_valid() to entire sf object (most reliable for GEOS)
# This handles degenerate vertices, self-intersections, invalid rings while preserving all features
repaired <- tryCatch({
# st_make_valid() on entire sf object preserves all features and attributes
repaired_geom <- sf::st_make_valid(sf_object)
# Verify we still have the same number of rows
if (nrow(repaired_geom) != nrow(sf_object)) {
warning("st_make_valid() changed number of features - attempting row-wise repair")
# Fallback: Repair row-by-row to maintain original structure
repaired_geom <- sf_object
for (i in seq_len(nrow(sf_object))) {
tryCatch({
# NOTE(review): st_is_valid() may return NA for a corrupt geometry,
# which would make this if() error — the tryCatch keeps the row as-is.
if (!sf::st_is_valid(sf_object[i, ])) {
repaired_geom[i, ] <- sf::st_make_valid(sf_object[i, ])
}
}, error = function(e) {
safe_log(paste("Could not repair row", i, "-", e$message), "WARNING")
})
}
}
safe_log("✓ st_make_valid() successfully repaired geometries", "INFO")
# Last expression of the tryCatch body: the repaired object flows to 'repaired'
repaired_geom
}, error = function(e) {
# NULL signals "repair failed" to the guard below
safe_log(paste("st_make_valid() failed:", e$message), "WARNING")
NULL
})
# If repair failed, keep original
if (is.null(repaired)) {
safe_log(paste("Could not repair", invalid_before, "invalid geometries - keeping original"),
"WARNING")
return(sf_object)
}
# Count invalid geometries AFTER repair (may be > 0 if some rows resisted repair)
invalid_after <- sum(!sf::st_is_valid(repaired), na.rm = TRUE)
safe_log(paste("Repair complete: before =", invalid_before, ", after =", invalid_after), "INFO")
return(repaired)
}
# ==============================================================================
# END 00_COMMON_UTILS.R
# ==============================================================================