diff --git a/CODE_REVIEW_FINDINGS.md b/CODE_REVIEW_FINDINGS.md deleted file mode 100644 index 54a9a88..0000000 --- a/CODE_REVIEW_FINDINGS.md +++ /dev/null @@ -1,751 +0,0 @@ -# SmartCane Pipeline Code Review -## Efficiency, Cleanup, and Architecture Analysis - -**Date**: January 29, 2026 -**Scope**: `run_full_pipeline.R` + all called scripts (10, 20, 21, 30, 31, 40, 80, 90, 91) + utility files -**Status**: Comprehensive review completed - ---- - -## EXECUTIVE SUMMARY - -Your pipeline is **well-structured and intentional**, but has accumulated significant technical debt through development iterations. The main issues are: - -1. **🔴 HIGH IMPACT**: **3 separate mosaic mode detection functions** doing identical work -2. **🔴 HIGH IMPACT**: **Week/year calculations duplicated 10+ times** across 6+ files -3. **🟡 MEDIUM IMPACT**: **40+ debug statements** cluttering output -4. **🟡 MEDIUM IMPACT**: **File existence checks repeated** in multiple places (especially KPI checks) -5. **🟢 LOW IMPACT**: Minor redundancy in command construction, but manageable - -**Estimated cleanup effort**: 2-3 hours for core refactoring; significant code quality gains. - -**Workflow clarity issue**: The split between `merged_tif` vs `merged_tif_8b` and `weekly_mosaic` vs `weekly_tile_max` is **not clearly documented**. This should be clarified. - ---- - -## 1. 
DUPLICATED FUNCTIONS & LOGIC - -### 1.1 Mosaic Mode Detection (CRITICAL REDUNDANCY) - -**Problem**: Three identical implementations of `detect_mosaic_mode()`: - -| Location | Function Name | Lines | Issue | -|----------|---------------|-------|-------| -| `run_full_pipeline.R` | `detect_mosaic_mode_early()` | ~20 lines | Detects tiled vs single-file | -| `run_full_pipeline.R` | `detect_mosaic_mode_simple()` | ~20 lines | Detects tiled vs single-file (duplicate) | -| `parameters_project.R` | `detect_mosaic_mode()` | ~30 lines | Detects tiled vs single-file (different signature) | - -**Impact**: If you change the detection logic, you must update 3 places. Bug risk is high. - -**Solution**: Create **single canonical function in `parameters_project.R`**: -```r -# SINGLE SOURCE OF TRUTH -detect_mosaic_mode <- function(project_dir) { - weekly_tile_max <- file.path("laravel_app", "storage", "app", project_dir, "weekly_tile_max") - if (dir.exists(weekly_tile_max)) { - subfolders <- list.dirs(weekly_tile_max, full.names = FALSE, recursive = FALSE) - if (length(grep("^\\d+x\\d+$", subfolders)) > 0) return("tiled") - } - - weekly_mosaic <- file.path("laravel_app", "storage", "app", project_dir, "weekly_mosaic") - if (dir.exists(weekly_mosaic) && - length(list.files(weekly_mosaic, pattern = "^week_.*\\.tif$")) > 0) { - return("single-file") - } - - return("unknown") -} -``` - -Then replace all three calls in `run_full_pipeline.R` with this single function. - ---- - -### 1.2 Week/Year Calculations (CRITICAL REDUNDANCY) - -**Problem**: The pattern `week_num <- as.numeric(format(..., "%V"))` + `year_num <- as.numeric(format(..., "%G"))` appears **13+ times** across multiple files. 
- -**Locations**: -- `run_full_pipeline.R`: Lines 82, 126-127, 229-230, 630, 793-794 (5 times) -- `80_calculate_kpis.R`: Lines 323-324 (1 time) -- `80_weekly_stats_utils.R`: Lines 829-830 (1 time) -- `kpi_utils.R`: Line 45 (1 time) -- `80_kpi_utils.R`: Lines 177-178 (1 time) -- Plus inline in sprintf statements: ~10+ additional times - -**Impact**: -- High maintenance burden -- Risk of inconsistency (%V vs %Y confusion noted at line 82 in `run_full_pipeline.R`) -- Code verbosity - -**Solution**: Create **utility function in `parameters_project.R`**: -```r -get_iso_week_year <- function(date) { - list( - week = as.numeric(format(date, "%V")), - year = as.numeric(format(date, "%G")) # ISO year, not calendar year - ) -} - -# Usage: -wwy <- get_iso_week_year(end_date) -cat(sprintf("Week %02d/%d\n", wwy$week, wwy$year)) -``` - -**Also add convenience function**: -```r -format_week_year <- function(date, separator = "_") { - wwy <- get_iso_week_year(date) - sprintf("week_%02d%s%d", wwy$week, separator, wwy$year) -} - -# Usage: format_week_year(end_date) # "week_02_2026" -``` - ---- - -### 1.3 File Path Construction (MEDIUM REDUNDANCY) - -**Problem**: Repeated patterns like: -```r -file.path("laravel_app", "storage", "app", project_dir, "weekly_mosaic") -file.path("laravel_app", "storage", "app", project_dir, "reports", "kpis", kpi_subdir) -``` - -**Solution**: Centralize in `parameters_project.R`: -```r -# Project-agnostic path builders -get_project_storage_path <- function(project_dir, subdir = NULL) { - base <- file.path("laravel_app", "storage", "app", project_dir) - if (!is.null(subdir)) file.path(base, subdir) else base -} - -get_mosaic_dir <- function(project_dir, mosaic_mode = "auto") { - if (mosaic_mode == "auto") mosaic_mode <- detect_mosaic_mode(project_dir) - if (mosaic_mode == "tiled") { - get_project_storage_path(project_dir, "weekly_tile_max/5x5") - } else { - get_project_storage_path(project_dir, "weekly_mosaic") - } -} - -get_kpi_dir <- 
function(project_dir, client_type) { - subdir <- if (client_type == "agronomic_support") "field_level" else "field_analysis" - get_project_storage_path(project_dir, file.path("reports", "kpis", subdir)) -} -``` - ---- - -## 2. DEBUG STATEMENTS & LOGGING CLUTTER - -### 2.1 Excessive Debug Output - -The pipeline prints **40+ debug statements** that pollute the terminal output. Examples: - -**In `run_full_pipeline.R`**: -```r -Line 82: cat(sprintf(" Running week: %02d / %d\n", ...)) # Note: %d (calendar year) should be %G -Line 218: cat(sprintf("[KPI_DIR_CREATED] Created directory: %s\n", ...)) -Line 223: cat(sprintf("[KPI_DIR_EXISTS] %s\n", ...)) -Line 224: cat(sprintf("[KPI_DEBUG] Total files in directory: %d\n", ...)) -Line 225: cat(sprintf("[KPI_DEBUG] Sample files: %s\n", ...)) -Line 240: cat(sprintf("[KPI_DEBUG_W%02d_%d] Pattern: '%s' | Found: %d files\n", ...)) -Line 630: cat("DEBUG: Running command:", cmd, "\n") -Line 630 in Script 31 execution - prints full conda command -``` - -**In `80_calculate_kpis.R`**: -``` -Line 323: message(paste("Calculating statistics for all fields - Week", week_num, year)) -Line 417: # Plus many more ... 
-``` - -**Impact**: -- Makes output hard to scan for real issues -- Test developers skip important messages -- Production logs become noise - -**Solution**: Replace with **structured logging** (3 levels): - -```r -# Add to parameters_project.R -smartcane_log <- function(message, level = "INFO") { - timestamp <- format(Sys.time(), "%Y-%m-%d %H:%M:%S") - prefix <- sprintf("[%s] %s", level, timestamp) - cat(sprintf("%s | %s\n", prefix, message)) -} - -smartcane_debug <- function(message) { - if (Sys.getenv("SMARTCANE_DEBUG") == "TRUE") { - smartcane_log(message, level = "DEBUG") - } -} - -smartcane_warn <- function(message) { - smartcane_log(message, level = "WARN") -} -``` - -**Usage**: -```r -# Keep important messages -smartcane_log(sprintf("Downloaded %d dates, %d failed", download_count, download_failed)) - -# Hide debug clutter (only show if DEBUG=TRUE) -smartcane_debug(sprintf("KPI directory exists: %s", kpi_dir)) - -# Warnings stay visible -smartcane_warn("Some downloads failed, but continuing pipeline") -``` - ---- - -### 2.2 Redundant Status Checks in KPI Section - -**Lines 218-270 in `run_full_pipeline.R`**: The KPI requirement check has **deeply nested debug statements**. - -```r -if (dir.exists(kpi_dir)) { - cat(sprintf("[KPI_DIR_EXISTS] %s\n", kpi_dir)) - all_kpi_files <- list.files(kpi_dir) - cat(sprintf("[KPI_DEBUG] Total files in directory: %d\n", length(all_kpi_files))) - if (length(all_kpi_files) > 0) { - cat(sprintf("[KPI_DEBUG] Sample files: %s\n", ...)) - } -} else { - cat(sprintf("[KPI_DIR_MISSING] Directory does not exist: %s\n", kpi_dir)) -} -``` - -**Solution**: Simplify to: -```r -if (!dir.exists(kpi_dir)) { - dir.create(kpi_dir, recursive = TRUE, showWarnings = FALSE) -} - -all_kpi_files <- list.files(kpi_dir) -smartcane_debug(sprintf("KPI directory: %d files found", length(all_kpi_files))) -``` - ---- - -## 3. 
DOUBLE CALCULATIONS & INEFFICIENCIES - -### 3.1 KPI Existence Check (Calculated Twice) - -**Problem**: KPI existence is checked **twice** in `run_full_pipeline.R`: - -1. **First check (Lines 228-270)**: Initial KPI requirement check that calculates `kpis_needed` dataframe -2. **Second check (Lines 786-810)**: Verification after Script 80 runs (almost identical logic) - -Both loops do: -```r -for (weeks_back in 0:(reporting_weeks_needed - 1)) { - check_date <- end_date - (weeks_back * 7) - week_num <- as.numeric(format(check_date, "%V")) - year_num <- as.numeric(format(check_date, "%G")) - - week_pattern <- sprintf("week%02d_%d", week_num, year_num) - kpi_files_this_week <- list.files(kpi_dir, pattern = week_pattern) - - has_kpis <- length(kpi_files_this_week) > 0 - # ... same logic again -} -``` - -**Impact**: Slower pipeline execution, code duplication - -**Solution**: Create **reusable function in utility file**: -```r -check_kpi_completeness <- function(project_dir, client_type, end_date, reporting_weeks_needed) { - kpi_dir <- get_kpi_dir(project_dir, client_type) - - kpis_needed <- data.frame() - for (weeks_back in 0:(reporting_weeks_needed - 1)) { - check_date <- end_date - (weeks_back * 7) - wwy <- get_iso_week_year(check_date) - - week_pattern <- sprintf("week%02d_%d", wwy$week, wwy$year) - has_kpis <- any(grepl(week_pattern, list.files(kpi_dir))) - - kpis_needed <- rbind(kpis_needed, data.frame( - week = wwy$week, - year = wwy$year, - date = check_date, - has_kpis = has_kpis - )) - } - - return(list( - kpis_df = kpis_needed, - missing_count = sum(!kpis_needed$has_kpis), - all_complete = all(kpis_needed$has_kpis) - )) -} - -# Then in run_full_pipeline.R: -initial_kpi_check <- check_kpi_completeness(project_dir, client_type, end_date, reporting_weeks_needed) - -# ... 
after Script 80 runs: -final_kpi_check <- check_kpi_completeness(project_dir, client_type, end_date, reporting_weeks_needed) -if (final_kpi_check$all_complete) { - smartcane_log("✓ All KPIs available") -} -``` - ---- - -### 3.2 Mosaic Mode Detection (Called 3+ Times per Run) - -**Current code**: -- Line 99-117: `detect_mosaic_mode_early()` called once -- Line 301-324: `detect_mosaic_mode_simple()` called again -- Result: **Same detection logic runs twice unnecessarily** - -**Solution**: Call once, store result: -```r -mosaic_mode <- detect_mosaic_mode(project_dir) # Once at top - -# Then reuse throughout: -if (mosaic_mode == "tiled") { ... } -else if (mosaic_mode == "single-file") { ... } -``` - ---- - -### 3.3 Missing Weeks Calculation Inefficiency - -**Lines 126-170**: The loop builds `weeks_needed` dataframe, then **immediately** iterates again to find which ones are missing. - -**Current code**: -```r -# First: build all weeks -weeks_needed <- data.frame() -for (weeks_back in 0:(reporting_weeks_needed - 1)) { - # ... build weeks_needed -} - -# Then: check which are missing (loop again) -missing_weeks <- data.frame() -for (i in 1:nrow(weeks_needed)) { - # ... 
check each week -} -``` - -**Solution**: Combine into **single loop**: -```r -weeks_needed <- data.frame() -missing_weeks <- data.frame() -earliest_missing_date <- end_date - -for (weeks_back in 0:(reporting_weeks_needed - 1)) { - check_date <- end_date - (weeks_back * 7) - wwy <- get_iso_week_year(check_date) - - # Add to weeks_needed - weeks_needed <- rbind(weeks_needed, data.frame( - week = wwy$week, year = wwy$year, date = check_date - )) - - # Check if missing, add to missing_weeks if so - week_pattern <- sprintf("week_%02d_%d", wwy$week, wwy$year) - mosaic_dir <- get_mosaic_dir(project_dir, mosaic_mode) - - if (length(list.files(mosaic_dir, pattern = week_pattern)) == 0) { - missing_weeks <- rbind(missing_weeks, data.frame( - week = wwy$week, year = wwy$year, week_end_date = check_date - )) - if (check_date - 6 < earliest_missing_date) { - earliest_missing_date <- check_date - 6 - } - } -} -``` - ---- - -### 3.4 Data Source Detection Logic - -**Lines 58-84**: The `data_source_used` detection is overly complex: - -```r -data_source_used <- "merged_tif_8b" # Default -if (dir.exists(merged_tif_path)) { - tif_files <- list.files(merged_tif_path, pattern = "\\.tif$") - if (length(tif_files) > 0) { - data_source_used <- "merged_tif" - # ... - } else if (dir.exists(merged_tif_8b_path)) { - tif_files_8b <- list.files(merged_tif_8b_path, pattern = "\\.tif$") - # ... - } -} else if (dir.exists(merged_tif_8b_path)) { - # ... 
-} -``` - -**Issues**: -- Multiple nested conditions doing the same check -- `tif_files` and `tif_files_8b` are listed but only counts checked (not used later) -- Logic could be cleaner - -**Solution**: Create utility function: -```r -detect_data_source <- function(project_dir, preferred = "auto") { - storage_dir <- get_project_storage_path(project_dir) - - for (source in c("merged_tif", "merged_tif_8b")) { - source_dir <- file.path(storage_dir, source) - if (dir.exists(source_dir)) { - tifs <- list.files(source_dir, pattern = "\\.tif$") - if (length(tifs) > 0) return(source) - } - } - - smartcane_warn("No data source found - defaulting to merged_tif_8b") - return("merged_tif_8b") -} -``` - ---- - -## 4. WORKFLOW CLARITY ISSUES - -### 4.1 TIFF Data Format Confusion - -**Problem**: Why are there TWO different TIFF folders? - -- `merged_tif`: 4-band data (RGB + NIR) -- `merged_tif_8b`: 8-band data (appears to include UDM cloud masking from Planet) - -**Currently in code**: -```r -data_source <- if (project_dir == "angata") "merged_tif_8b" else "merged_tif" -``` - -**Issues**: -- Hard-coded per project, not based on what's actually available -- Not documented **why** angata uses 8-band -- Unclear what the 8-band data adds (cloud masking? extra bands?) -- Scripts handle both, but it's not clear when to use which - -**Recommendation**: -1. **Document in `parameters_project.R`** what each data source contains: -```r -DATA_SOURCE_FORMATS <- list( - "merged_tif" = list( - bands = 4, - description = "4-band PlanetScope: Red, Green, Blue, NIR", - projects = c("aura", "chemba", "xinavane"), - note = "Standard format from Planet API" - ), - "merged_tif_8b" = list( - bands = 8, - description = "8-band PlanetScope with UDM: RGB+NIR + 4-band cloud mask", - projects = c("angata"), - note = "Enhanced with cloud confidence from UDM2 (Unusable Data Mask)" - ) -) -``` - -2. 
**Update hard-coded assignment** to be data-driven: -```r -# OLD: data_source <- if (project_dir == "angata") "merged_tif_8b" else "merged_tif" -# NEW: detect what's actually available -data_source <- detect_data_source(project_dir) -``` - ---- - -### 4.2 Mosaic Storage Format Confusion - -**Problem**: Why are there TWO different mosaic storage styles? - -- `weekly_mosaic/`: Single TIF file per week (monolithic) -- `weekly_tile_max/5x5/`: Tiled TIFFs per week (25+ files per week) - -**Currently in code**: -- Detected automatically via `detect_mosaic_mode()` -- But **no documentation** on when/why each is used - -**Recommendation**: -1. **Document the trade-offs in `parameters_project.R`**: -```r -MOSAIC_MODES <- list( - "single-file" = list( - description = "One TIF per week", - storage_path = "weekly_mosaic/", - files_per_week = 1, - pros = c("Simpler file management", "Easier to load full mosaic"), - cons = c("Slower for field-specific analysis", "Large file I/O"), - suitable_for = c("agronomic_support", "dashboard visualization") - ), - "tiled" = list( - description = "5×5 grid of tiles per week", - storage_path = "weekly_tile_max/5x5/", - files_per_week = 25, - pros = c("Parallel field processing", "Faster per-field queries", "Scalable to 1000+ fields"), - cons = c("More file management", "Requires tile_grid metadata"), - suitable_for = c("cane_supply", "large-scale operations") - ) -) -``` - -2. **Document why angata uses tiled, aura uses single-file**: - - Is it a function of field count? (Angata = cane_supply, large fields → tiled) - - Is it historical? (Legacy decision?) - - Should new projects choose based on client type? 
- ---- - -### 4.3 Client Type Mapping Clarity - -**Current structure** in `parameters_project.R`: - -```r -CLIENT_TYPE_MAP <- list( - "angata" = "cane_supply", - "aura" = "agronomic_support", - "chemba" = "cane_supply", - "xinavane" = "cane_supply", - "esa" = "cane_supply" -) -``` - -**Issues**: -- Not clear **why** aura is agronomic_support while angata/chemba are cane_supply -- No documentation of what each client type needs -- Scripts branch heavily on `skip_cane_supply_only` logic - -**Recommendation**: -Add metadata to explain the distinction: - -```r -CLIENT_TYPES <- list( - "cane_supply" = list( - description = "Sugar mill supply chain optimization", - requires_harvest_prediction = TRUE, # Script 31 - requires_phase_assignment = TRUE, # Based on planting date - per_field_detail = TRUE, # Script 91 Excel report - data_sources = c("merged_tif", "merged_tif_8b"), - mosaic_mode = "tiled", - projects = c("angata", "chemba", "xinavane", "esa") - ), - "agronomic_support" = list( - description = "Farm-level decision support for agronomists", - requires_harvest_prediction = FALSE, - requires_phase_assignment = FALSE, - per_field_detail = FALSE, - farm_level_kpis = TRUE, # Script 90 Word report - data_sources = c("merged_tif"), - mosaic_mode = "single-file", - projects = c("aura") - ) -) -``` - ---- - -## 5. COMMAND CONSTRUCTION REDUNDANCY - -### 5.1 Rscript Path Repetition - -**Problem**: The Rscript path is repeated 5 times: - -```r -Line 519: '"C:\\Program Files\\R\\R-4.4.3\\bin\\x64\\Rscript.exe"' -Line 676: '"C:\\Program Files\\R\\R-4.4.3\\bin\\x64\\Rscript.exe"' -Line 685: '"C:\\Program Files\\R\\R-4.4.3\\bin\\x64\\Rscript.exe"' -``` - -**Solution**: Define once in `parameters_project.R`: -```r -RSCRIPT_PATH <- "C:\\Program Files\\R\\R-4.4.3\\bin\\x64\\Rscript.exe" - -# Usage: -cmd <- sprintf('"%s" --vanilla r_app/20_ci_extraction.R ...', RSCRIPT_PATH) -``` - ---- - -## 6. 
SPECIFIC LINE-BY-LINE ISSUES - -### 6.1 Line 82 Bug: Wrong Format Code - -```r -cat(sprintf(" Running week: %02d / %d\n", - as.numeric(format(end_date, "%V")), - as.numeric(format(end_date, "%Y")))) # ❌ Should be %G, not %Y -``` - -**Issue**: Uses calendar year `%Y` instead of ISO week year `%G`. On dates like 2025-12-30 (week 1 of 2026), this will print "Week 01 / 2025" (confusing). - -**Fix**: -```r -wwy <- get_iso_week_year(end_date) -cat(sprintf(" Running week: %02d / %d\n", wwy$week, wwy$year)) -``` - ---- - -### 6.2 Line 630 Debug Statement - -```r -cmd <- sprintf('conda run -n pytorch_gpu python python_app/31_harvest_imminent_weekly.py %s', project_dir) -cat("DEBUG: Running command:", cmd, "\n") # ❌ Prints full conda command -``` - -**Solution**: Use `smartcane_debug()` function: -```r -cmd <- sprintf('conda run -n pytorch_gpu python python_app/31_harvest_imminent_weekly.py %s', project_dir) -smartcane_debug(sprintf("Running Python 31: %s", cmd)) -``` - ---- - -### 6.3 Lines 719-723: Verbose Script 31 Verification - -```r -# Check for THIS WEEK's specific file -current_week <- as.numeric(format(end_date, "%V")) -current_year <- as.numeric(format(end_date, "%Y")) -expected_file <- file.path(...) -``` - -**Issue**: Calculates week twice (already done earlier). Also uses `%Y` (should be `%G`). - -**Solution**: Reuse earlier `wwy` calculation or create helper. - ---- - -## 7. 
REFACTORING ROADMAP - -### Phase 1: Foundation (1 hour) -- [ ] Consolidate `detect_mosaic_mode()` into single function in `parameters_project.R` -- [ ] Create `get_iso_week_year()` and `format_week_year()` utilities -- [ ] Create `get_project_storage_path()`, `get_mosaic_dir()`, `get_kpi_dir()` helpers -- [ ] Add logging functions (`smartcane_log()`, `smartcane_debug()`, `smartcane_warn()`) - -### Phase 2: Deduplication (1 hour) -- [ ] Replace all 13+ week_num/year_num calculations with `get_iso_week_year()` -- [ ] Replace all 3 `detect_mosaic_mode_*()` calls with single function -- [ ] Combine duplicate KPI checks into `check_kpi_completeness()` function -- [ ] Fix line 82 and 630 format bugs - -### Phase 3: Cleanup (1 hour) -- [ ] Remove all debug statements (40+), replace with `smartcane_debug()` -- [ ] Simplify nested conditions in data_source detection -- [ ] Combine missing weeks detection into single loop -- [ ] Extract Rscript path to constant - -### Phase 4: Documentation (30 min) -- [ ] Add comments explaining `merged_tif` vs `merged_tif_8b` trade-offs -- [ ] Document `single-file` vs `tiled` mosaic modes and when to use each -- [ ] Clarify client type mapping in `CLIENT_TYPE_MAP` -- [ ] Add inline comments for non-obvious logic - ---- - -## 8. 
ARCHITECTURE & WORKFLOW RECOMMENDATIONS - -### 8.1 Clear Data Flow Diagram - -Add to `r_app/system_architecture/system_architecture.md`: - -``` -INPUT SOURCES: - ├── Planet API 4-band or 8-band imagery - ├── Field boundaries (pivot.geojson) - └── Harvest data (harvest.xlsx, optional for cane_supply) - -STORAGE TIERS: - ├── Tier 1: Raw data (merged_tif/ or merged_tif_8b/) - ├── Tier 2: Daily tiles (daily_tiles_split/{grid_size}/{dates}/) - ├── Tier 3: Extracted CI (Data/extracted_ci/daily_vals/*.rds) - ├── Tier 4: Weekly mosaics (weekly_mosaic/ OR weekly_tile_max/5x5/) - └── Tier 5: KPI outputs (reports/kpis/{field_level|field_analysis}/) - -DECISION POINTS: - └─ Client type (cane_supply vs agronomic_support) - ├─ Drives script selection (Scripts 21, 22, 23, 31, 90/91) - ├─ Drives data source (merged_tif_8b for cane_supply, merged_tif for agronomic) - ├─ Drives mosaic mode (tiled for cane_supply, single-file for agronomic) - └─ Drives KPI subdirectory (field_analysis vs field_level) -``` - -### 8.2 .sh Scripts Alignment - -You mention `.sh` scripts in the online environment. If they're **not calling the R pipeline**, there's a **split responsibility** issue: - -**Question**: Are the `.sh` scripts: -- (A) Independent duplicates of the R pipeline logic? (BAD - maintenance nightmare) -- (B) Wrappers calling the R pipeline? (GOOD - single source of truth) -- (C) Different workflow for online vs local? (RED FLAG - they diverge) - -**Recommendation**: If using `.sh` for production, ensure they **call the same R scripts** (`run_full_pipeline.R`). Example: - -```bash -#!/bin/bash -# Wrapper that ensures R pipeline is called -cd /path/to/smartcane -& "C:\Program Files\R\R-4.4.3\bin\x64\Rscript.exe" r_app/run_full_pipeline.R -``` - ---- - -## 9. 
SUMMARY TABLE: Issues by Severity - -| Issue | Type | Impact | Effort | Priority | -|-------|------|--------|--------|----------| -| 3 mosaic detection functions | Duplication | HIGH | 30 min | P0 | -| 13+ week/year calculations | Duplication | HIGH | 1 hour | P0 | -| 40+ debug statements | Clutter | MEDIUM | 1 hour | P1 | -| KPI check run twice | Inefficiency | LOW | 30 min | P2 | -| Line 82: %Y should be %G | Bug | LOW | 5 min | P2 | -| Data source confusion | Documentation | MEDIUM | 30 min | P1 | -| Mosaic mode confusion | Documentation | MEDIUM | 30 min | P1 | -| Client type mapping | Documentation | MEDIUM | 30 min | P1 | -| Data source detection complexity | Code style | LOW | 15 min | P3 | - ---- - -## 10. RECOMMENDED NEXT STEPS - -1. **Review this report** with your team to align on priorities -2. **Create Linear issues** for each phase of refactoring -3. **Start with Phase 1** (foundation utilities) - builds confidence for Phase 2 -4. **Test thoroughly** after each phase - the pipeline is complex and easy to break -5. **Update `.sh` scripts** if they duplicate R logic -6. **Document data flow** in `system_architecture/system_architecture.md` - ---- - -## Questions for Clarification - -Before implementing, please clarify: - -1. **Data source split**: Why does angata use `merged_tif_8b` (8-band with cloud mask) while aura uses `merged_tif` (4-band)? Is this: - - A function of client need (cane_supply requires cloud masking)? - - Historical (legacy decision for angata)? - - Should new projects choose based on availability? - -2. **Mosaic mode split**: Why tiled for angata but single-file for aura? Should this be: - - Hard-coded per project? - - Based on field count/client type? - - Auto-detected from first run? - -3. **Production vs local**: Are the `.sh` scripts in the online environment: - - Calling this same R pipeline? - - Duplicating logic independently? - - A different workflow entirely? - -4. 
**Client type growth**: Are there other client types planned beyond `cane_supply` and `agronomic_support`? (e.g., extension_service?) - ---- - -**Report prepared**: January 29, 2026 -**Total code reviewed**: ~2,500 lines across 10 files -**Estimated refactoring time**: 3-4 hours -**Estimated maintenance savings**: 5-10 hours/month (fewer bugs, easier updates) - diff --git a/r_app/00_common_utils.R b/r_app/00_common_utils.R new file mode 100644 index 0000000..1d0125f --- /dev/null +++ b/r_app/00_common_utils.R @@ -0,0 +1,401 @@ +# ============================================================================== +# 00_COMMON_UTILS.R +# ============================================================================== +# GENERIC UTILITY FUNCTIONS FOR SMARTCANE PIPELINE +# +# PURPOSE: +# Centralized location for foundational utilities used across multiple scripts. +# These functions have NO project knowledge, NO client-type dependencies, +# NO domain-specific logic. +# +# USAGE: +# All scripts (10, 20, 21, 30, 40, 80, 90, 91) should source this file: +# +# source(here::here("r_app", "parameters_project.R")) # Config first +# source(here::here("r_app", "00_common_utils.R")) # Then common utilities +# +# FUNCTIONS: +# 1. safe_log() — Generic logging with [LEVEL] prefix +# 2. smartcane_debug() — Conditional debug logging +# 3. smartcane_warn() — Convenience wrapper for WARN-level messages +# 4. date_list() — Generate date sequences for processing windows +# 5. get_iso_week() — Extract ISO week number from date +# 6. get_iso_year() — Extract ISO year from date +# 7. get_iso_week_year() — Extract both ISO week and year as list +# 8. format_week_label() — Format date as week/year label (e.g., "week01_2025") +# 9. load_field_boundaries() — Load field geometries from GeoJSON +# 10. 
load_harvesting_data() — Load harvest schedule from Excel
#
# ==============================================================================

#' Safe Logging Function
#'
#' Generic logging with a [LEVEL] prefix, e.g. "[INFO] message". Works
#' standalone without any logging framework. Consistent with the SmartCane
#' logging standard.
#'
#' @param message The message to log
#' @param level The log level (default: "INFO"). Conventional values used in
#'   this file: "INFO", "WARN", "ERROR", "DEBUG"
#' @return NULL, invisibly (called for its side effect on stdout)
#'
#' @examples
#' safe_log("Processing started", "INFO")
#' safe_log("Check input file", "WARN")
#' safe_log("Failed to load data", "ERROR")
#'
safe_log <- function(message, level = "INFO") {
  cat(sprintf("[%s] %s\n", level, message))
  invisible(NULL)
}

#' SmartCane Debug Logging (Conditional)
#'
#' Logs DEBUG-level messages only if verbose = TRUE or the SMARTCANE_DEBUG
#' environment variable is "TRUE". Keeps troubleshooting detail out of normal
#' pipeline output.
#'
#' @param message The message to log
#' @param verbose Whether to output regardless of SMARTCANE_DEBUG (default: FALSE)
#' @return NULL, invisibly
#'
#' @examples
#' smartcane_debug("Processing field 1")                  # Only if SMARTCANE_DEBUG=TRUE
#' smartcane_debug("Detailed state info", verbose = TRUE) # Always outputs
#'
smartcane_debug <- function(message, verbose = FALSE) {
  if (verbose || Sys.getenv("SMARTCANE_DEBUG") == "TRUE") {
    safe_log(message, level = "DEBUG")
  }
  invisible(NULL)
}

#' SmartCane Warning Logging
#'
#' Logs WARN-level messages. Convenience wrapper around safe_log().
#'
#' @param message The message to log
#' @return NULL, invisibly
#'
#' @examples
#' smartcane_warn("Check data format before proceeding")
#'
smartcane_warn <- function(message) {
  safe_log(message, level = "WARN")
}

#' Extract ISO Week Number from Date
#'
#' Extracts the ISO-8601 week number (1-53) from a date via the %V format
#' code. ISO weeks start on Monday; week 1 contains the year's first Thursday.
#'
#' @param date A Date object or string convertible to Date
#' @return Numeric: ISO week number (1-53)
#'
#' @examples
#' get_iso_week(as.Date("2025-01-15")) # Returns: 3
#'
get_iso_week <- function(date) {
  as.numeric(format(date, "%V"))
}

#' Extract ISO Year from Date
#'
#' Extracts the ISO-8601 week-based year via the %G format code. The ISO year
#' can differ from the calendar year around year boundaries (e.g. 2024-12-30
#' belongs to ISO year 2025).
#'
#' @param date A Date object or string convertible to Date
#' @return Numeric: ISO year
#'
#' @examples
#' get_iso_year(as.Date("2025-01-01")) # Returns: 2025
#'
get_iso_year <- function(date) {
  as.numeric(format(date, "%G"))
}

#' Extract ISO Week and Year as List
#'
#' Combines get_iso_week() and get_iso_year() for convenience.
#'
#' @param date A Date object or string convertible to Date
#' @return List with elements: week (1-53), year
#'
#' @examples
#' wwy <- get_iso_week_year(as.Date("2025-01-15"))
#' # Returns: list(week = 3, year = 2025)
#'
get_iso_week_year <- function(date) {
  # Delegate to the single-source helpers so the %V/%G format codes live in
  # exactly one place each (avoids the duplicated-logic drift this file was
  # created to eliminate).
  list(
    week = get_iso_week(date),
    year = get_iso_year(date)
  )
}

#' Format Date as Week/Year Label
#'
#' Converts a date into a readable week label format.
#' Useful for filenames, directory names, and output identification.
#'
#' @param date A Date object or string convertible to Date
#' @param separator Separator between week number and year (default: "_")
#' @return String in format "week##_YYYY" (e.g., "week03_2025")
#'
#' @examples
#' format_week_label(as.Date("2025-01-15")) # "week03_2025"
#' format_week_label(as.Date("2025-01-15"), "-") # "week03-2025"
#'
format_week_label <- function(date, separator = "_") {
  # Build the label from the ISO week/year pair: zero-padded week number,
  # caller-chosen separator, then the ISO year.
  iso <- get_iso_week_year(date)
  paste0(sprintf("week%02d", iso$week), separator, iso$year)
}

#' Load Field Boundaries from GeoJSON
#'
#' Loads field polygon geometries from GeoJSON file (pivot.geojson or pivot_2.geojson).
#' Handles CRS validation and column standardization.
#'
#' @param data_dir Directory containing GeoJSON file
#' @return List with elements:
#'   - field_boundaries_sf: sf (Simple Features) object
#'   - field_boundaries: terra SpatVect object (if conversion successful, else sf fallback)
#'
#' @details
#' Automatically selects pivot_2.geojson for ESA project during CI extraction,
#' otherwise uses pivot.geojson. Handles both multi-polygon and simple polygon geometries.
#'
#' @examples
#' boundaries <- load_field_boundaries("laravel_app/storage/app/angata")
#' head(boundaries$field_boundaries_sf)
#'
# NOTE(review): relies on sf, terra, dplyr, here and %>% being attached by the
# sourcing script — this file does not load them itself; confirm load order.
load_field_boundaries <- function(data_dir) {
  # Choose field boundaries file based on project and script type
  # ESA project uses pivot_2.geojson ONLY for scripts 02-03 (CI extraction & growth model)
  # All other scripts (including 04-mosaic, 09-KPIs, 10-reports) use pivot.geojson
  # NOTE(review): reads globals `project_dir` / `ci_extraction_script` via
  # exists() — behavior depends on the caller's environment; confirm intended.
  use_pivot_2 <- exists("project_dir") && project_dir == "esa" &&
    exists("ci_extraction_script") # ci_extraction_script flag set by scripts 02-03

  if (use_pivot_2) {
    field_boundaries_path <- here(data_dir, "pivot_2.geojson")
  } else {
    field_boundaries_path <- here(data_dir, "pivot.geojson")
  }

  # Fail fast with an explicit path so the operator can see which file is missing
  if (!file.exists(field_boundaries_path)) {
    stop(paste("Field boundaries file not found at path:", field_boundaries_path))
  }

  tryCatch({
    # Read GeoJSON with explicit CRS handling
    field_boundaries_sf <- st_read(field_boundaries_path, quiet = TRUE)

    # Remove OBJECTID column immediately if it exists
    # (ArcGIS export artifact; not part of the SmartCane schema)
    if ("OBJECTID" %in% names(field_boundaries_sf)) {
      field_boundaries_sf <- field_boundaries_sf %>% select(-OBJECTID)
    }

    # Validate and fix CRS if needed
    tryCatch({
      # Simply assign WGS84 if not already set (safe approach)
      if (is.na(sf::st_crs(field_boundaries_sf)$epsg)) {
        st_crs(field_boundaries_sf) <- 4326
        warning("CRS was missing, assigned WGS84 (EPSG:4326)")
      }
    }, error = function(e) {
      # Fallback: retry the assignment from inside the handler.
      # NOTE(review): `<<-` rebinds field_boundaries_sf in the enclosing
      # function frame — confirm the updated CRS is actually what the code
      # below ends up using in this error path.
      tryCatch({
        st_crs(field_boundaries_sf) <<- 4326
      }, error = function(e2) {
        warning(paste("Could not set CRS:", e2$message))
      })
    })

    # Handle column names - accommodate optional sub_area column
    if ("sub_area" %in% names(field_boundaries_sf)) {
      field_boundaries_sf <- field_boundaries_sf %>%
        dplyr::select(field, sub_field, sub_area) %>%
        sf::st_set_geometry("geometry")
    } else {
      field_boundaries_sf <- field_boundaries_sf %>%
        dplyr::select(field, sub_field) %>%
        sf::st_set_geometry("geometry")
    }

    # Convert to terra vector if possible, otherwise use sf
    # (terra::vect can fail on some geometry types; callers get whichever
    # representation succeeded via the `field_boundaries` element)
    field_boundaries <- tryCatch({
      field_boundaries_terra <- terra::vect(field_boundaries_sf)
      crs_value <- tryCatch(terra::crs(field_boundaries_terra), error = function(e) NULL)
      crs_str <- if (!is.null(crs_value)) as.character(crs_value) else ""

      # An empty/missing CRS string on the terra object also gets WGS84
      if (is.null(crs_value) || length(crs_value) == 0 || nchar(crs_str) == 0) {
        terra::crs(field_boundaries_terra) <- "EPSG:4326"
        warning("Terra object CRS was empty, assigned WGS84 (EPSG:4326)")
      }
      field_boundaries_terra

    }, error = function(e) {
      warning(paste("Terra conversion failed, using sf object instead:", e$message))
      field_boundaries_sf
    })

    # Both representations returned so callers can pick the one they need
    return(list(
      field_boundaries_sf = field_boundaries_sf,
      field_boundaries = field_boundaries
    ))
  }, error = function(e) {
    # Surface the failing call before re-raising as a hard error
    cat("[DEBUG] Error in load_field_boundaries:\n")
    cat(" Message:", e$message, "\n")
    cat(" Call:", deparse(e$call), "\n")
    stop(paste("Error loading field boundaries:", e$message))
  })
}

#' Load Harvesting Data from Excel
#'
#' Loads crop harvest schedule from harvest.xlsx file.
#' Handles flexible date formats (numeric, YYYY-MM-DD, DD/MM/YYYY, etc.).
#'
#' @param data_dir Directory containing harvest.xlsx file
#' @return Data frame with columns: field, sub_field, year, season_start, season_end,
#'   age (weeks), sub_area, tonnage_ha. Returns NULL if file not found.
+#' +#' @examples +#' harvest <- load_harvesting_data("laravel_app/storage/app/angata") +#' head(harvest) +#' +load_harvesting_data <- function(data_dir) { + harvest_file <- here(data_dir, "harvest.xlsx") + + if (!file.exists(harvest_file)) { + warning(paste("Harvest data file not found at path:", harvest_file)) + return(NULL) + } + + # Helper function to parse dates with multiple format detection + parse_flexible_date <- function(x) { + if (is.null(x) || is.na(x)) return(NA_real_) + if (inherits(x, "Date")) return(x) + if (inherits(x, "POSIXct")) return(as.Date(x)) + + # If it's numeric (Excel date serial), convert directly + if (is.numeric(x)) { + return(as.Date(x, origin = "1899-12-30")) + } + + # Try character conversion with multiple formats + x_char <- as.character(x) + formats <- c("%Y-%m-%d", "%d/%m/%Y", "%m/%d/%Y", "%Y-%m-%d %H:%M:%S") + + for (fmt in formats) { + result <- suppressWarnings(as.Date(x_char, format = fmt)) + if (!is.na(result)) return(result) + } + + return(NA) + } + + tryCatch({ + harvesting_data <- read_excel(harvest_file) %>% + dplyr::select( + c( + "field", + "sub_field", + "year", + "season_start", + "season_end", + "age", + "sub_area", + "tonnage_ha" + ) + ) %>% + mutate( + field = as.character(field), + sub_field = as.character(sub_field), + year = as.numeric(year), + season_start = sapply(season_start, parse_flexible_date), + season_end = sapply(season_end, parse_flexible_date), + season_start = as.Date(season_start, origin = "1970-01-01"), + season_end = as.Date(season_end, origin = "1970-01-01"), + age = as.numeric(age), + sub_area = as.character(sub_area), + tonnage_ha = as.numeric(tonnage_ha) + ) %>% + mutate( + season_end = case_when( + season_end > Sys.Date() ~ Sys.Date(), + is.na(season_end) ~ Sys.Date(), + TRUE ~ season_end + ), + age = round(as.numeric(season_end - season_start) / 7, 0) + ) + + return(harvesting_data) + }, error = function(e) { + warning(paste("Error loading harvesting data:", e$message)) + return(NULL) + }) 
+} + +#' Generate a Sequence of Dates for Processing +#' +#' Creates a date range from start_date to end_date and extracts week/year info. +#' Used by Scripts 20, 30, 40 to determine data processing windows. +#' +#' @param end_date The end date for the sequence (Date object or "YYYY-MM-DD" string) +#' @param offset Number of days to look back from end_date (e.g., 7 for one week) +#' @return A list containing: +#' - week: ISO week number of end_date (the reporting period) +#' - year: ISO year of end_date +#' - days_filter: Vector of dates in "YYYY-MM-DD" format +#' - start_date: Start date as Date object +#' - end_date: End date as Date object +#' +#' @details +#' Uses `lubridate::isoweek()` and `lubridate::isoyear()` on end_date, so the +#' week/year pair follows ISO-8601 (week 1 starts on Monday; dates around +#' 1 January may belong to the previous or next ISO year). +#' +#' @examples +#' dates <- date_list(as.Date("2025-01-15"), offset = 7) +#' # Returns: week=3, year=2025, days_filter = c("2025-01-09", ..., "2025-01-15") +#' +#' dates <- date_list("2025-12-31", offset = 14) +#' # Handles string input and returns 14 days of data +#' +date_list <- function(end_date, offset) { + # Input validation + if (!lubridate::is.Date(end_date)) { + end_date <- as.Date(end_date) + if (is.na(end_date)) { + stop("Invalid end_date provided. Expected a Date object or a string convertible to Date.") + } + } + + offset <- as.numeric(offset) + if (is.na(offset) || offset < 1) { + stop("Invalid offset provided. 
Expected a positive number.") + } + + # Calculate date range + offset <- offset - 1 # Adjust offset to include end_date + start_date <- end_date - lubridate::days(offset) + + # Extract ISO week and year information (from END date for reporting period) + week <- lubridate::isoweek(end_date) + year <- lubridate::isoyear(end_date) + + # Generate sequence of dates + days_filter <- seq(from = start_date, to = end_date, by = "day") + days_filter <- format(days_filter, "%Y-%m-%d") # Format for consistent filtering + + # Log the date range + safe_log(paste("Date range generated from", start_date, "to", end_date)) + + return(list( + "week" = week, + "year" = year, + "days_filter" = days_filter, + "start_date" = start_date, + "end_date" = end_date + )) +} + +# ============================================================================== +# END 00_COMMON_UTILS.R +# ============================================================================== diff --git a/r_app/10_create_per_field_tiffs.R b/r_app/10_create_per_field_tiffs.R index c82a15f..808967e 100644 --- a/r_app/10_create_per_field_tiffs.R +++ b/r_app/10_create_per_field_tiffs.R @@ -46,6 +46,7 @@ library(sf) # LOAD CENTRALIZED PARAMETERS & PATHS # ============================================================================== source(here::here("r_app", "parameters_project.R")) +source(here::here("r_app", "00_common_utils.R")) # Get project parameter from command line args <- commandArgs(trailingOnly = TRUE) @@ -58,9 +59,9 @@ if (length(args) == 0) { # Load centralized path structure (creates all directories automatically) paths <- setup_project_directories(PROJECT) -smartcane_log(paste("Project:", PROJECT)) -smartcane_log(paste("Base path:", paths$laravel_storage_dir)) -smartcane_log(paste("Data dir:", paths$data_dir)) +safe_log(paste("Project:", PROJECT)) +safe_log(paste("Base path:", paths$laravel_storage_dir)) +safe_log(paste("Data dir:", paths$data_dir)) # Unified function to crop TIFF to field boundaries # Called by 
both migration and processing phases @@ -72,14 +73,14 @@ crop_tiff_to_fields <- function(tif_path, tif_date, fields, output_base_dir) { # Load raster if (!file.exists(tif_path)) { - smartcane_log(paste("ERROR: TIFF not found:", tif_path)) + safe_log(paste("ERROR: TIFF not found:", tif_path)) return(list(created = 0, skipped = 0, errors = 1)) } rast <- tryCatch({ rast(tif_path) }, error = function(e) { - smartcane_log(paste("ERROR loading raster:", e$message)) + safe_log(paste("ERROR loading raster:", e$message)) return(NULL) }) @@ -99,7 +100,7 @@ crop_tiff_to_fields <- function(tif_path, tif_date, fields, output_base_dir) { overlapping_indices <- unique(unlist(overlapping_indices)) if (length(overlapping_indices) == 0) { - smartcane_log(paste("No fields intersect TIFF:", basename(tif_path))) + safe_log(paste("No fields intersect TIFF:", basename(tif_path))) return(list(created = 0, skipped = 0, errors = 0)) } @@ -129,7 +130,7 @@ crop_tiff_to_fields <- function(tif_path, tif_date, fields, output_base_dir) { writeRaster(field_rast, output_path, overwrite = TRUE) created <- created + 1 }, error = function(e) { - smartcane_log(paste("ERROR cropping field", field_name, ":", e$message)) + safe_log(paste("ERROR cropping field", field_name, ":", e$message)) errors <<- errors + 1 }) } @@ -142,13 +143,13 @@ crop_tiff_to_fields <- function(tif_path, tif_date, fields, output_base_dir) { # NORMAL MODE: Otherwise, process merged_tif/ → field_tiles/ process_new_merged_tif <- function(merged_tif_dir, field_tiles_dir, fields, field_tiles_ci_dir = NULL) { - smartcane_log("\n========================================") - smartcane_log("PHASE 2: PROCESSING NEW DOWNLOADS") - smartcane_log("========================================") + safe_log("\n========================================") + safe_log("PHASE 2: PROCESSING NEW DOWNLOADS") + safe_log("========================================") # Check if download directory exists if (!dir.exists(merged_tif_dir)) { - smartcane_log("No 
merged_tif/ directory found - no new data to process") + safe_log("No merged_tif/ directory found - no new data to process") return(list(total_created = 0, total_skipped = 0, total_errors = 0)) } @@ -164,10 +165,10 @@ process_new_merged_tif <- function(merged_tif_dir, field_tiles_dir, fields, fiel full.names = TRUE ) - smartcane_log(paste("Found", length(tiff_files), "TIFF(s) to process")) + safe_log(paste("Found", length(tiff_files), "TIFF(s) to process")) if (length(tiff_files) == 0) { - smartcane_log("No new TIFFs found - nothing to process") + safe_log("No new TIFFs found - nothing to process") return(list(total_created = 0, total_skipped = 0, total_errors = 0)) } @@ -196,13 +197,13 @@ process_new_merged_tif <- function(merged_tif_dir, field_tiles_dir, fields, fiel } if (date_migrated) { - smartcane_log(paste("Skipping:", tif_date, "(already migrated and processed by Script 20)")) + safe_log(paste("Skipping:", tif_date, "(already migrated and processed by Script 20)")) total_skipped <- total_skipped + 1 next } } - smartcane_log(paste("Processing:", tif_date)) + safe_log(paste("Processing:", tif_date)) result <- crop_tiff_to_fields(tif_path, tif_date, fields, field_tiles_dir) total_created <- total_created + result$created @@ -210,7 +211,7 @@ process_new_merged_tif <- function(merged_tif_dir, field_tiles_dir, fields, fiel total_errors <- total_errors + result$errors } - smartcane_log(paste("Processing complete: created =", total_created, + safe_log(paste("Processing complete: created =", total_created, ", skipped =", total_skipped, ", errors =", total_errors)) return(list(total_created = total_created, total_skipped = total_skipped, @@ -222,9 +223,9 @@ process_new_merged_tif <- function(merged_tif_dir, field_tiles_dir, fields, fiel # MAIN EXECUTION # ============================================================================== -smartcane_log("========================================") -smartcane_log(paste("Script 10: Per-Field TIFF Creation for", PROJECT)) 
-smartcane_log("========================================") +safe_log("========================================") +safe_log(paste("Script 10: Per-Field TIFF Creation for", PROJECT)) +safe_log("========================================") # Load field boundaries using centralized path (no dir.create needed - already created by setup_project_directories) fields <- load_field_boundaries(paths$field_boundaries_path) @@ -238,11 +239,11 @@ field_tiles_ci_dir <- paths$field_tiles_ci_dir # Pass field_tiles_ci_dir so it can skip dates already migrated process_result <- process_new_merged_tif(merged_tif_dir, field_tiles_dir, fields, field_tiles_ci_dir) -smartcane_log("\n========================================") -smartcane_log("FINAL SUMMARY") -smartcane_log("========================================") -smartcane_log(paste("Processing: created =", process_result$total_created, +safe_log("\n========================================") +safe_log("FINAL SUMMARY") +safe_log("========================================") +safe_log(paste("Processing: created =", process_result$total_created, ", skipped =", process_result$total_skipped, ", errors =", process_result$total_errors)) -smartcane_log("Script 10 complete") -smartcane_log("========================================\n") +safe_log("Script 10 complete") +safe_log("========================================\n") diff --git a/r_app/20_ci_extraction.R b/r_app/20_ci_extraction.R index 79f7f86..cedb9c6 100644 --- a/r_app/20_ci_extraction.R +++ b/r_app/20_ci_extraction.R @@ -114,6 +114,15 @@ main <- function() { # Load centralized path structure (creates all directories automatically) paths <- setup_project_directories(project_dir) + cat("[DEBUG] Attempting to source r_app/00_common_utils.R\n") + tryCatch({ + source("r_app/00_common_utils.R") + cat("[DEBUG] Successfully sourced r_app/00_common_utils.R\n") + }, error = function(e) { + cat("[ERROR] Failed to source r_app/00_common_utils.R:\n", e$message, "\n") + stop(e) + }) + cat("[DEBUG] 
Attempting to source r_app/20_ci_extraction_utils.R\n") tryCatch({ source("r_app/20_ci_extraction_utils.R") diff --git a/r_app/20_ci_extraction_utils.R b/r_app/20_ci_extraction_utils.R index 156a148..08f56b8 100644 --- a/r_app/20_ci_extraction_utils.R +++ b/r_app/20_ci_extraction_utils.R @@ -11,24 +11,6 @@ # - calc_ci_from_raster(): Calculate CI from 4-band raster (Chlorophyll Index formula: NIR/Green - 1) # - extract_ci_by_subfield(): Extract per-sub_field CI statistics from raster -#' Safe logging function that works whether log_message exists or not -#' -#' @param message The message to log -#' @param level The log level (default: "INFO") -#' @return NULL (used for side effects) -#' -safe_log <- function(message, level = "INFO") { - if (exists("log_message")) { - log_message(message, level) - } else { - if (level %in% c("ERROR", "WARNING")) { - warning(message) - } else { - message(message) - } - } -} - #' Generate a sequence of dates for processing #' #' @param end_date The end date for the sequence (Date object) diff --git a/r_app/30_growth_model_utils.R b/r_app/30_growth_model_utils.R index 32f0c34..7de7f47 100644 --- a/r_app/30_growth_model_utils.R +++ b/r_app/30_growth_model_utils.R @@ -5,24 +5,6 @@ # Utility functions for growth model interpolation and manipulation. # These functions support the creation of continuous growth models from point measurements. 
-#' Safe logging function that works whether log_message exists or not -#' -#' @param message The message to log -#' @param level The log level (default: "INFO") -#' @return NULL (used for side effects) -#' -safe_log <- function(message, level = "INFO") { - if (exists("log_message")) { - log_message(message, level) - } else { - if (level %in% c("ERROR", "WARNING")) { - warning(message) - } else { - message(message) - } - } -} - #' Load and prepare the combined CI data (Per-Field Architecture) #' #' @param daily_vals_dir Directory containing per-field daily RDS files (Data/extracted_ci/daily_vals) diff --git a/r_app/30_interpolate_growth_model.R b/r_app/30_interpolate_growth_model.R index 922a040..633617a 100644 --- a/r_app/30_interpolate_growth_model.R +++ b/r_app/30_interpolate_growth_model.R @@ -20,9 +20,11 @@ suppressPackageStartupMessages({ }) # ============================================================================= -# Load utility functions from 30_growth_model_utils.R +# Load configuration and utility functions # ============================================================================= -source("r_app/30_growth_model_utils.R") +source(here::here("r_app", "parameters_project.R")) +source(here::here("r_app", "00_common_utils.R")) +source(here::here("r_app", "30_growth_model_utils.R")) # ============================================================================= # Main Processing diff --git a/r_app/40_mosaic_creation.R b/r_app/40_mosaic_creation.R index 14f5b05..cb91f48 100644 --- a/r_app/40_mosaic_creation.R +++ b/r_app/40_mosaic_creation.R @@ -124,6 +124,7 @@ main <- function() { tryCatch({ source("r_app/parameters_project.R") + source("r_app/00_common_utils.R") source("r_app/40_mosaic_creation_utils.R") safe_log(paste("Successfully sourced files from 'r_app' directory.")) }, error = function(e) { diff --git a/r_app/40_mosaic_creation_per_field_utils.R b/r_app/40_mosaic_creation_per_field_utils.R index bf49773..821b02f 100644 --- 
a/r_app/40_mosaic_creation_per_field_utils.R +++ b/r_app/40_mosaic_creation_per_field_utils.R @@ -17,23 +17,6 @@ # ↓ # Scripts 90/91: Read weekly_mosaic/{FIELD}/week_WW_YYYY.tif (unchanged interface) -#' Safe logging function -#' @param message The message to log -#' @param level The log level (default: "INFO") -#' @return NULL (used for side effects) -#' -safe_log <- function(message, level = "INFO") { - if (exists("log_message")) { - log_message(message, level) - } else { - if (level %in% c("ERROR", "WARNING")) { - warning(message) - } else { - message(message) - } - } -} - #' Generate date range for processing (ISO week-based) #' #' @param end_date The end date (Date object or YYYY-MM-DD string) diff --git a/r_app/40_mosaic_creation_utils.R b/r_app/40_mosaic_creation_utils.R index a602aec..43bb3a9 100644 --- a/r_app/40_mosaic_creation_utils.R +++ b/r_app/40_mosaic_creation_utils.R @@ -35,66 +35,14 @@ detect_tile_structure_from_files <- function(merged_final_tif_dir) { )) } -#' Safe logging function -#' @param message The message to log -#' @param level The log level (default: "INFO") -#' @return NULL (used for side effects) -#' -safe_log <- function(message, level = "INFO") { - if (exists("log_message")) { - log_message(message, level) - } else { - if (level %in% c("ERROR", "WARNING")) { - warning(message) - } else { - message(message) - } - } -} - #' Generate a sequence of dates for processing #' #' @param end_date The end date for the sequence (Date object) #' @param offset Number of days to look back from end_date #' @return A list containing week number, year, and a sequence of dates for filtering #' -date_list <- function(end_date, offset) { - # Input validation - if (!lubridate::is.Date(end_date)) { - end_date <- as.Date(end_date) - if (is.na(end_date)) { - stop("Invalid end_date provided. Expected a Date object or a string convertible to Date.") - } - } - - offset <- as.numeric(offset) - if (is.na(offset) || offset < 1) { - stop("Invalid offset provided. 
Expected a positive number.") - } - - # Calculate date range - offset <- offset - 1 # Adjust offset to include end_date - start_date <- end_date - lubridate::days(offset) - - # Extract week and year information - week <- lubridate::isoweek(end_date) - year <- lubridate::isoyear(end_date) - - # Generate sequence of dates - days_filter <- seq(from = start_date, to = end_date, by = "day") - days_filter <- format(days_filter, "%Y-%m-%d") # Format for consistent filtering - - # Log the date range - safe_log(paste("Date range generated from", start_date, "to", end_date)) - - return(list( - "week" = week, - "year" = year, - "days_filter" = days_filter, - "start_date" = start_date, - "end_date" = end_date - )) -} +# NOTE: date_list() is now in 00_common_utils.R - import from there +# This function was duplicated and has been consolidated #' Create a weekly mosaic from available VRT files #' diff --git a/r_app/80_calculate_kpis.R b/r_app/80_calculate_kpis.R index 5df29de..c47fede 100644 --- a/r_app/80_calculate_kpis.R +++ b/r_app/80_calculate_kpis.R @@ -122,6 +122,18 @@ suppressPackageStartupMessages({ # LOAD UTILITY FUNCTIONS FROM SEPARATED MODULES # ============================================================================ +tryCatch({ + source(here("r_app", "parameters_project.R")) +}, error = function(e) { + stop("Error loading parameters_project.R: ", e$message) +}) + +tryCatch({ + source(here("r_app", "00_common_utils.R")) +}, error = function(e) { + stop("Error loading 00_common_utils.R: ", e$message) +}) + tryCatch({ source(here("r_app", "80_weekly_stats_utils.R")) }, error = function(e) { diff --git a/r_app/80_kpi_utils.R b/r_app/80_kpi_utils.R index 9a2fdea..702ada3 100644 --- a/r_app/80_kpi_utils.R +++ b/r_app/80_kpi_utils.R @@ -18,22 +18,6 @@ MORAN_THRESHOLD_HIGH <- 0.95 # Above this = very strong clustering (problemati MORAN_THRESHOLD_MODERATE <- 0.85 # Above this = moderate clustering MORAN_THRESHOLD_LOW <- 0.7 # Above this = normal field continuity -#' 
Logging utility for consistent message handling -#' @param message The message to log -#' @param level The log level (default: "INFO") -#' @return NULL (used for side effects) -safe_log <- function(message, level = "INFO") { - if (exists("log_message")) { - log_message(message, level) - } else { - if (level %in% c("ERROR", "WARNING")) { - warning(message) - } else { - message(message) - } - } -} - #' Calculate coefficient of variation for uniformity assessment #' @param values Numeric vector of CI values #' @return Coefficient of variation (CV) as decimal diff --git a/r_app/parameters_project.R b/r_app/parameters_project.R index daf20c3..9caa6ec 100644 --- a/r_app/parameters_project.R +++ b/r_app/parameters_project.R @@ -711,24 +711,11 @@ get_kpi_dir <- function(project_dir, client_type) { get_project_storage_path(project_dir, file.path("reports", "kpis", subdir)) } -# Logging functions for clean output -smartcane_log <- function(message, level = "INFO", verbose = TRUE) { - if (!verbose) return(invisible(NULL)) - timestamp <- format(Sys.time(), "%Y-%m-%d %H:%M:%S") - prefix <- sprintf("[%s]", level) - cat(sprintf("%s %s\n", prefix, message)) -} - -smartcane_debug <- function(message, verbose = FALSE) { - if (!verbose && Sys.getenv("SMARTCANE_DEBUG") != "TRUE") { - return(invisible(NULL)) - } - smartcane_log(message, level = "DEBUG", verbose = TRUE) -} - -smartcane_warn <- function(message) { - smartcane_log(message, level = "WARN", verbose = TRUE) -} +# Logging functions moved to 00_common_utils.R +# - smartcane_log() — Main logging function with level prefix +# - smartcane_debug() — Conditional debug logging +# - smartcane_warn() — Warning wrapper +# Import with: source("r_app/00_common_utils.R") # ============================================================================ # PHASE 3 & 4: OPTIMIZATION & DOCUMENTATION diff --git a/r_app/report_utils.R b/r_app/report_utils.R index 822293f..15b0c95 100644 --- a/r_app/report_utils.R +++ b/r_app/report_utils.R @@ -4,24 
+4,6 @@ # These functions support the creation of maps, charts and report elements # for the CI_report_dashboard_planet.Rmd document. -#' Safe logging function that works whether log_message exists or not -#' -#' @param message The message to log -#' @param level The log level (default: "INFO") -#' @return NULL (used for side effects) -#' -safe_log <- function(message, level = "INFO") { - if (exists("log_message")) { - log_message(message, level) - } else { - if (level %in% c("ERROR", "WARNING")) { - warning(message) - } else { - message(message) - } - } -} - #' Creates a sub-chunk for use within RMarkdown documents #' #' @param g A ggplot object to render in the sub-chunk diff --git a/r_app/run_full_pipeline.R b/r_app/run_full_pipeline.R index dac54d0..0336898 100644 --- a/r_app/run_full_pipeline.R +++ b/r_app/run_full_pipeline.R @@ -41,6 +41,7 @@ RSCRIPT_PATH <- file.path("C:", "Program Files", "R", "R-4.4.3", "bin", "x64", " # Load client type mapping and centralized paths from parameters_project.R source("r_app/parameters_project.R") +source("r_app/00_common_utils.R") paths <- setup_project_directories(project_dir) client_type <- get_client_type(project_dir) cat(sprintf("\nProject: %s → Client Type: %s\n", project_dir, client_type))