diff --git a/r_app/10_create_per_field_tiffs.R b/r_app/10_create_per_field_tiffs.R index e192dd7..8b4cd08 100644 --- a/r_app/10_create_per_field_tiffs.R +++ b/r_app/10_create_per_field_tiffs.R @@ -19,13 +19,15 @@ # - Naming: Per-field GeoTIFFs organized by field and date # # USAGE: -# Rscript 10_create_per_field_tiffs.R [project] +# Rscript 10_create_per_field_tiffs.R [project] [end_date] [offset] # # Example (Windows PowerShell): -# & "C:\Program Files\R\R-4.4.3\bin\x64\Rscript.exe" r_app/10_create_per_field_tiffs.R angata +# & "C:\Program Files\R\R-4.4.3\bin\x64\Rscript.exe" r_app/10_create_per_field_tiffs.R angata 2026-02-09 7 # # PARAMETERS: -# - project: Project name (character) - angata, chemba, xinavane, esa, simba +# - project: Project name (character) - angata, chemba, xinavane, esa, simba (default: angata) +# - end_date: End date for processing (YYYY-MM-DD format, default: today) +# - offset: Days to look back (numeric, default: 7) # # CLIENT TYPES: # - cane_supply (ANGATA): Yes - primary data organization script @@ -70,10 +72,16 @@ main <- function() { # STEP 2: Parse command-line arguments FIRST (needed by parameters_project.R) args <- commandArgs(trailingOnly = TRUE) - project_dir <- if (length(args) == 0) "angata" else args[1] - # Make project_dir available to sourced files (they execute in global scope) + # Parse arguments: [project] [end_date] [offset] + project_dir <- if (length(args) >= 1 && args[1] != "") args[1] else "angata" + end_date_arg <- if (length(args) >= 2 && args[2] != "") as.Date(args[2], format = "%Y-%m-%d") else Sys.Date() + offset_arg <- if (length(args) >= 3 && !is.na(as.numeric(args[3]))) as.numeric(args[3]) else 7 + + # Make variables available to sourced files (they execute in global scope) assign("project_dir", project_dir, envir = .GlobalEnv) + assign("end_date", end_date_arg, envir = .GlobalEnv) + assign("offset", offset_arg, envir = .GlobalEnv) # STEP 3: SOURCE ALL UTILITY SCRIPTS (now that project_dir is defined) # Load 
parameters_project.R (provides safe_log, setup_project_directories, etc.) @@ -97,7 +105,7 @@ main <- function() { # Window: end_date - offset days to end_date # Always coerce to correct types to avoid issues with lingering/inherited values if (!exists("end_date") || !inherits(end_date, "Date")) { - end_date <- as.Date("2026-02-04") + end_date <- Sys.Date() safe_log(paste("Using default end_date:", end_date), "INFO") } if (!exists("offset") || !is.numeric(offset)) { diff --git a/r_app/20_ci_extraction_per_field.R b/r_app/20_ci_extraction_per_field.R index 63d128c..701a108 100644 --- a/r_app/20_ci_extraction_per_field.R +++ b/r_app/20_ci_extraction_per_field.R @@ -127,91 +127,96 @@ main <- function() { } } - # Process each DATE (OPTIMIZED: load TIFF once, process all fields) + # Process each DATE (load merged TIFF once, extract all fields from it) total_success <- 0 total_error <- 0 - ci_results_by_date <- list() for (date_str in dates_filter) { - # Load the merged TIFF ONCE for this date - merged_tif_path <- file.path(setup$field_tiles_dir, fields[1], sprintf("%s.tif", date_str)) + # Load the MERGED TIFF (farm-wide) ONCE for this date + input_tif_merged <- file.path(setup$merged_tif_folder, sprintf("%s.tif", date_str)) - # Find the actual TIFF path (it's in the first field that has it) - input_tif_full <- NULL - for (field in fields) { - candidate_path <- file.path(setup$field_tiles_dir, field, sprintf("%s.tif", date_str)) - if (file.exists(candidate_path)) { - input_tif_full <- candidate_path - break - } - } - - if (is.null(input_tif_full)) { - safe_log(sprintf(" %s: Input TIFF not found (skipping)", date_str)) + if (!file.exists(input_tif_merged)) { + safe_log(sprintf(" %s: merged_tif not found (skipping)", date_str)) + total_error <<- total_error + 1 next } tryCatch({ - # Load TIFF ONCE - raster_4band <- terra::rast(input_tif_full) + # Load 4-band TIFF ONCE + raster_4band <- terra::rast(input_tif_merged) + safe_log(sprintf(" %s: Loaded merged TIFF, processing %d 
fields...", date_str, length(fields))) + + # Calculate CI from 4-band + ci_raster <- calc_ci_from_raster(raster_4band) + + # Create 5-band (R, G, B, NIR, CI) + five_band <- c(raster_4band, ci_raster) + + # Now process all fields from this single merged TIFF + fields_processed_this_date <- 0 - # Now process all fields from this single TIFF for (field in fields) { field_ci_path <- file.path(setup$field_tiles_ci_dir, field) field_daily_vals_path <- file.path(setup$daily_ci_vals_dir, field) + + # Pre-create output directories + dir.create(field_ci_path, showWarnings = FALSE, recursive = TRUE) + dir.create(field_daily_vals_path, showWarnings = FALSE, recursive = TRUE) + output_tif <- file.path(field_ci_path, sprintf("%s.tif", date_str)) output_rds <- file.path(field_daily_vals_path, sprintf("%s.rds", date_str)) # MODE 3: Skip if both outputs already exist if (file.exists(output_tif) && file.exists(output_rds)) { - next # Skip to next field + next } # MODE 2: Regeneration mode - RDS missing but CI TIFF exists if (file.exists(output_tif) && !file.exists(output_rds)) { tryCatch({ extract_rds_from_ci_tiff(output_tif, output_rds, field_boundaries_sf, field) - total_success <<- total_success + 1 + fields_processed_this_date <- fields_processed_this_date + 1 }, error = function(e) { - total_error <<- total_error + 1 + # Continue to next field }) next } - # MODE 1: Normal mode - calculate CI from 4-band input + # MODE 1: Normal mode - crop 5-band TIFF to field boundary and save tryCatch({ - # Calculate CI - ci_raster <- calc_ci_from_raster(raster_4band) + # Crop 5-band TIFF to field boundary + field_geom <- field_boundaries_sf %>% filter(field == !!field) + five_band_cropped <- terra::crop(five_band, field_geom, mask = TRUE) - # Create 5-band TIFF (R, G, B, NIR, CI) - five_band <- c(raster_4band, ci_raster) + # Save 5-band field TIFF + terra::writeRaster(five_band_cropped, output_tif, overwrite = TRUE) - # Save 5-band TIFF - terra::writeRaster(five_band, output_tif, overwrite = 
TRUE) - - # Extract CI statistics by sub_field - ci_stats <- extract_ci_by_subfield(ci_raster, field_boundaries_sf, field) + # Extract CI statistics by sub_field (from cropped CI raster) + ci_cropped <- five_band_cropped[[5]] # 5th band is CI + ci_stats <- extract_ci_by_subfield(ci_cropped, field_boundaries_sf, field) # Save RDS if (!is.null(ci_stats) && nrow(ci_stats) > 0) { saveRDS(ci_stats, output_rds) - - # Store for daily aggregation - ci_stats_with_date <- ci_stats %>% mutate(date = date_str) - key <- sprintf("%s_%s", field, date_str) - ci_results_by_date[[key]] <<- ci_stats_with_date } - total_success <<- total_success + 1 + fields_processed_this_date <- fields_processed_this_date + 1 }, error = function(e) { - total_error <<- total_error + 1 + # Error in individual field, continue to next + safe_log(sprintf(" Error processing field %s: %s", field, e$message), "WARNING") }) } + # Increment success counter if at least one field succeeded + if (fields_processed_this_date > 0) { + total_success <<- total_success + 1 + safe_log(sprintf(" %s: Processed %d fields", date_str, fields_processed_this_date)) + } + }, error = function(e) { - safe_log(sprintf(" %s: ✗ Error loading TIFF - %s", date_str, e$message), "ERROR") total_error <<- total_error + 1 + safe_log(sprintf(" %s: Error loading or processing merged TIFF - %s", date_str, e$message), "ERROR") }) } diff --git a/r_app/30_growth_model_utils.R b/r_app/30_growth_model_utils.R index c3cf386..647b811 100644 --- a/r_app/30_growth_model_utils.R +++ b/r_app/30_growth_model_utils.R @@ -4,13 +4,22 @@ # =================== # Utility functions for growth model interpolation and manipulation. # These functions support the creation of continuous growth models from point measurements. 
+# +# PERFORMANCE OPTIMIZATION: +# - Parallel file I/O: Reads 450k+ RDS files using furrr::future_map_dfr() +# - Parallel field interpolation: Processes fields in parallel (1 core per ~100 fields) +# - Dynamic CPU detection: Allocates workers based on available cores +# - Windows compatible: Uses furrr with plan(multisession) for cross-platform support #' Load and prepare the combined CI data (Per-Field Architecture) +#' OPTIMIZE: Filters by date during load (skip unnecessary date ranges) +#' PARALLELIZE: Reads 450k+ RDS files in parallel using furrr::future_map_dfr() #' #' @param daily_vals_dir Directory containing per-field daily RDS files (Data/extracted_ci/daily_vals) +#' @param harvesting_data Optional: Dataframe with season dates. If provided, only loads files within season ranges (major speedup) #' @return Long-format dataframe with CI values by date and field #' -load_combined_ci_data <- function(daily_vals_dir) { +load_combined_ci_data <- function(daily_vals_dir, harvesting_data = NULL) { # For per-field architecture: daily_vals_dir = Data/extracted_ci/daily_vals # Structure: daily_vals/{FIELD_NAME}/{YYYY-MM-DD}.rds @@ -20,6 +29,17 @@ load_combined_ci_data <- function(daily_vals_dir) { safe_log(paste("Loading per-field CI data from:", daily_vals_dir)) + # OPTIMIZATION: If harvest data provided, extract date range to avoid loading unnecessary dates + date_filter_min <- NULL + date_filter_max <- NULL + if (!is.null(harvesting_data) && nrow(harvesting_data) > 0) { + date_filter_min <- min(harvesting_data$season_start, na.rm = TRUE) + date_filter_max <- max(harvesting_data$season_end, na.rm = TRUE) + safe_log(sprintf("Pre-filtering by harvest season dates: %s to %s", + format(date_filter_min, "%Y-%m-%d"), + format(date_filter_max, "%Y-%m-%d"))) + } + # Find all daily RDS files recursively (per-field structure) # IMPORTANT: Only load files matching the per-field format YYYY-MM-DD.rds in field subdirectories all_daily_files <- list.files( @@ -37,71 +57,87 @@ 
load_combined_ci_data <- function(daily_vals_dir) { stop(paste("No per-field daily RDS files found in:", daily_vals_dir)) } - safe_log(sprintf("Found %d per-field daily RDS files to load (filtered from legacy format)", length(all_daily_files))) + safe_log(sprintf("Found %d per-field daily RDS files (filtered from legacy format)", length(all_daily_files))) - # Rebuild with explicit date and field tracking - # File structure: daily_vals/{FIELD_NAME}/{YYYY-MM-DD}.rds - combined_long <- data.frame() + # OPTIMIZATION: Filter files by filename date BEFORE parallel loading + # Skip files outside harvest season (can save 60-80% of I/O on large datasets) + if (!is.null(date_filter_min) && !is.null(date_filter_max)) { + all_daily_files <- all_daily_files[ + { + dates <- as.Date(tools::file_path_sans_ext(basename(all_daily_files)), format = "%Y-%m-%d") + !is.na(dates) & dates >= date_filter_min & dates <= date_filter_max + } + ] + safe_log(sprintf("Filtered to %d files within harvest season date range", length(all_daily_files))) + } - for (file in all_daily_files) { - tryCatch({ + # Set up parallel future plan (Windows PSOCK multisession; Mac/Linux can use forking) + # Automatically detect available cores and limit to reasonable number + n_cores <- min(parallel::detectCores() - 1, 8) # Use max 8 cores (diminishing returns after) + future::plan(strategy = future::multisession, workers = n_cores) + safe_log(sprintf("Using %d parallel workers for file I/O", n_cores)) + + # Parallel file reading: future_map_dfr processes each file in parallel + # Returns combined dataframe directly (no need to rbind) + combined_long <- furrr::future_map_dfr( + all_daily_files, + .progress = TRUE, + .options = furrr::furrr_options(seed = TRUE), + function(file) { # Extract date from filename: {YYYY-MM-DD}.rds filename <- basename(file) date_str <- tools::file_path_sans_ext(filename) - # Parse date - handle various formats - parsed_date <- NA + # Parse date if (nchar(date_str) == 10 && 
grepl("^\\d{4}-\\d{2}-\\d{2}$", date_str)) { parsed_date <- as.Date(date_str, format = "%Y-%m-%d") } else { - safe_log(sprintf("Warning: Could not parse date from filename: %s", filename), "WARNING") - next + return(data.frame()) # Return empty dataframe if parse fails } if (is.na(parsed_date)) { - safe_log(sprintf("Warning: Invalid date parsed from: %s", filename), "WARNING") - next + return(data.frame()) } # Read RDS file - rds_data <- tryCatch({ - readRDS(file) + tryCatch({ + rds_data <- readRDS(file) + + if (is.null(rds_data) || nrow(rds_data) == 0) { + return(data.frame()) + } + + # Add date column to the data + rds_data %>% + dplyr::mutate(Date = parsed_date) + }, error = function(e) { - safe_log(sprintf("Error reading RDS file %s: %s", file, e$message), "WARNING") - return(NULL) + return(data.frame()) # Return empty dataframe on error }) - - if (is.null(rds_data) || nrow(rds_data) == 0) { - next - } - - # Add date column to the data - rds_data <- rds_data %>% - dplyr::mutate(Date = parsed_date) - - combined_long <- rbind(combined_long, rds_data) - - }, error = function(e) { - safe_log(sprintf("Error processing file %s: %s", file, e$message), "WARNING") - }) - } + } + ) + + # Return to sequential processing to avoid nested parallelism + future::plan(future::sequential) if (nrow(combined_long) == 0) { safe_log("Warning: No valid CI data loaded from daily files", "WARNING") return(data.frame()) } + # OPTIMIZATION: Use data.table for fast filtering (10-20x faster than dplyr on large datasets) # Reshape to long format using ci_mean as the main CI value - # Only keep rows where ci_mean has valid data - pivot_stats_long <- combined_long %>% - dplyr::select(field, sub_field, ci_mean, Date) %>% - dplyr::rename(value = ci_mean) %>% - dplyr::mutate(value = as.numeric(value)) %>% - # Keep rows even if ci_mean is NA or 0 (might be valid), but drop if Date is missing - tidyr::drop_na(Date) %>% - dplyr::filter(!is.na(sub_field), !is.na(field)) %>% - 
dplyr::filter(!is.infinite(value)) %>% - dplyr::distinct() + DT <- data.table::as.data.table(combined_long) + DT <- DT[, .(field, sub_field, ci_mean, Date)] + DT[, c("value") := list(as.numeric(ci_mean))] + DT[, ci_mean := NULL] + + # Fast filtering without dplyr::distinct() (which is slow on large datasets) + # Keep rows where Date is valid, field/sub_field exist, and value is finite (is.finite() also drops NA values) + DT <- DT[!is.na(Date) & !is.na(sub_field) & !is.na(field) & is.finite(value)] + + # Convert back to tibble for compatibility with rest of pipeline + pivot_stats_long <- dplyr::as_tibble(DT) safe_log(sprintf("Loaded %d CI data points from %d daily files", nrow(pivot_stats_long), length(all_daily_files))) @@ -194,6 +230,7 @@ extract_CI_data <- function(field_name, harvesting_data, field_CI_data, season, } #' Generate interpolated CI data for all fields and seasons +#' PARALLELIZE: Processes fields in parallel using furrr::future_map() #' #' @param years Vector of years to process #' @param harvesting_data Dataframe with harvesting information @@ -227,40 +264,50 @@ generate_interpolated_ci_data <- function(years, harvesting_data, ci_data) { return(data.frame()) } - # Initialize progress bar for this year total_fields <<- total_fields + length(valid_sub_fields) - pb <- txtProgressBar(min = 0, max = length(valid_sub_fields), style = 3, width = 50) - counter <- 0 + safe_log(sprintf("Year %d: Processing %d fields in parallel", yr, length(valid_sub_fields))) - # Extract and interpolate data for each valid field with progress bar - result_list <- list() - for (field in valid_sub_fields) { - counter <- counter + 1 - setTxtProgressBar(pb, counter) - - # Call with verbose=FALSE to suppress warnings during progress bar iteration - field_result <- extract_CI_data(field, - harvesting_data = harvesting_data, - field_CI_data = ci_data, - season = yr, - verbose = FALSE) + # Set up parallel future plan for field interpolation + # Allocate 1 core per ~100 fields (with minimum 2 cores) + n_cores <- max(2, 
min(parallel::detectCores() - 1, ceiling(length(valid_sub_fields) / 100))) + future::plan(strategy = future::multisession, workers = n_cores) + + # PARALLELIZE: Process all fields in parallel (each extracts & interpolates independently) + result_list <- furrr::future_map( + valid_sub_fields, + .progress = TRUE, + .options = furrr::furrr_options(seed = TRUE), + function(field) { + # Call with verbose=FALSE to suppress warnings during parallel iteration + extract_CI_data(field, + harvesting_data = harvesting_data, + field_CI_data = ci_data, + season = yr, + verbose = FALSE) + } + ) + + # Return to sequential processing + future::plan(future::sequential) + + # Process results and tracking + for (i in seq_along(result_list)) { + field_result <- result_list[[i]] + field_name <- valid_sub_fields[i] if (nrow(field_result) > 0) { successful_fields <<- successful_fields + 1 - result_list[[field]] <- field_result } else { - # Track failed field failed_fields[[length(failed_fields) + 1]] <<- list( - field = field, + field = field_name, season = yr, reason = "Unable to generate interpolated data" ) } } - close(pb) - cat("\n") # Newline after progress bar # Combine all results for this year + result_list <- result_list[sapply(result_list, nrow) > 0] # Keep only non-empty if (length(result_list) > 0) { purrr::list_rbind(result_list) } else { diff --git a/r_app/30_interpolate_growth_model.R b/r_app/30_interpolate_growth_model.R index 42afa35..db42646 100644 --- a/r_app/30_interpolate_growth_model.R +++ b/r_app/30_interpolate_growth_model.R @@ -60,6 +60,12 @@ suppressPackageStartupMessages({ library(tidyverse) # For dplyr (data wrangling, grouping, mutating) library(lubridate) # For date/time operations (date arithmetic, ISO week extraction) library(readxl) # For reading harvest.xlsx (harvest dates for growth model phases) + + # Parallel processing (Windows PSOCK + Mac/Linux fork-safe) + library(future) # For setting up parallel execution plans + library(furrr) # For 
future_map_dfr (parallel file I/O and field processing) + library(parallel) # For detectCores (automatic CPU detection) + library(data.table) # For fast filtering on large datasets }) # ============================================================================= @@ -110,23 +116,24 @@ main <- function() { safe_log("Starting CI growth model interpolation") + # Set up data directory paths + data_dir <- setup$data_dir + # Load and process the data tryCatch({ # Load the combined CI data (created by Script 20 per-field) # Script 20 per-field outputs: daily_vals/{FIELD_NAME}/{YYYY-MM-DD}.rds - CI_data <- load_combined_ci_data(daily_vals_dir) - - # Load harvesting data from harvest.xlsx for growth model phase assignment - # Use the centralized load_harvesting_data() function which handles NA season_end values - # by setting them to Sys.Date() (field is still in current growing season) - data_dir <- setup$data_dir + # OPTIMIZATION: Pass harvest data to pre-filter by date range (skip unnecessary files) harvesting_data <- tryCatch({ load_harvesting_data(data_dir) }, error = function(e) { - safe_log(paste("Error loading harvest data:", e$message), "WARNING") + safe_log(paste("Error loading harvest data for pre-filtering:", e$message), "WARNING") NULL }) + # Load CI data with date range pre-filtering + CI_data <- load_combined_ci_data(daily_vals_dir, harvesting_data = harvesting_data) + # Validate harvesting data if (is.null(harvesting_data) || nrow(harvesting_data) == 0) { safe_log("No harvesting data available", "ERROR") diff --git a/r_app/80_calculate_kpis.R b/r_app/80_calculate_kpis.R index cd39994..708aecb 100644 --- a/r_app/80_calculate_kpis.R +++ b/r_app/80_calculate_kpis.R @@ -139,6 +139,7 @@ suppressPackageStartupMessages({ library(readr) # For reading CSV files (harvest predictions from Python) library(readxl) # For reading harvest.xlsx (harvest dates for field mapping) library(writexl) # For writing Excel outputs (KPI summary tables) + library(progress) # For 
progress bars during field processing # ML/Analysis (optional - only for harvest model inference) tryCatch({ @@ -573,8 +574,10 @@ main <- function() { message(paste(" ✓ Added Weekly_ci_change, CV_Trend_Short_Term, Four_week_trend, CV_Trend_Long_Term, nmr_of_weeks_analysed")) # Load weekly harvest probabilities from script 31 (if available) + # Note: Script 31 saves to reports/kpis/field_stats/ (not field_level) message("\n4. Loading harvest probabilities from script 31...") - harvest_prob_file <- file.path(reports_dir, "kpis", "field_stats", + harvest_prob_dir <- file.path(data_dir, "..", "reports", "kpis", "field_stats") + harvest_prob_file <- file.path(harvest_prob_dir, sprintf("%s_harvest_imminent_week_%02d_%d.csv", project_dir, current_week, year)) message(paste(" Looking for:", harvest_prob_file)) @@ -846,7 +849,7 @@ main <- function() { total_acreage = sum(field_data$Acreage, na.rm = TRUE), mean_ci = round(mean(field_data$Mean_CI, na.rm = TRUE), 2), median_ci = round(median(field_data$Mean_CI, na.rm = TRUE), 2), - mean_cv = round(mean(field_data$CI_CV, na.rm = TRUE), 4), + mean_cv = round(mean(field_data$CV, na.rm = TRUE), 4), week = current_week, year = year, date = as.character(end_date) diff --git a/r_app/80_utils_common.R b/r_app/80_utils_common.R index f588e96..3c85f8a 100644 --- a/r_app/80_utils_common.R +++ b/r_app/80_utils_common.R @@ -605,7 +605,7 @@ export_field_analysis_excel <- function(field_df, summary_df, project_dir, curre NULL } - output_subdir <- file.path(reports_dir, "kpis", "field_analysis") + output_subdir <- file.path(reports_dir, "field_analysis") if (!dir.exists(output_subdir)) { dir.create(output_subdir, recursive = TRUE) } @@ -637,7 +637,7 @@ export_field_analysis_excel <- function(field_df, summary_df, project_dir, curre ) rds_filename <- paste0(project_dir, "_kpi_summary_tables_week", sprintf("%02d_%d", current_week, year), ".rds") - rds_path <- file.path(reports_dir, "kpis", rds_filename) + rds_path <- file.path(reports_dir, 
rds_filename) saveRDS(kpi_data, rds_path) message(paste("✓ Field analysis RDS exported to:", rds_path)) @@ -683,8 +683,16 @@ calculate_field_statistics <- function(field_boundaries_sf, week_num, year, message(paste(" Found", length(per_field_files), "per-field mosaic file(s) for week", week_num)) results_list <- list() + # Initialize progress bar + pb <- progress::progress_bar$new( + format = " [:bar] :percent | Field :current/:total", + total = length(per_field_files), + width = 60 + ) + # Process each field's mosaic for (field_idx in seq_along(per_field_files)) { + pb$tick() # Update progress bar field_name <- names(per_field_files)[field_idx] field_file <- per_field_files[[field_name]] @@ -751,8 +759,6 @@ calculate_field_statistics <- function(field_boundaries_sf, week_num, year, stringsAsFactors = FALSE ) - message(paste(" Field", field_idx, "of", length(per_field_files), "processed")) - }, error = function(e) { message(paste(" [ERROR] Field", field_name, ":", e$message)) }) @@ -773,7 +779,7 @@ load_or_calculate_weekly_stats <- function(week_num, year, project_dir, field_bo mosaic_dir, reports_dir, report_date = Sys.Date()) { rds_filename <- sprintf("%s_field_stats_week%02d_%d.rds", project_dir, week_num, year) - rds_path <- file.path(reports_dir, "kpis", "field_stats", rds_filename) + rds_path <- file.path(reports_dir, "field_stats", rds_filename) if (file.exists(rds_path)) { message(paste("Loading cached statistics from:", basename(rds_path))) @@ -783,7 +789,7 @@ load_or_calculate_weekly_stats <- function(week_num, year, project_dir, field_bo message(paste("Cached RDS not found, calculating statistics from tiles for week", week_num)) stats_df <- calculate_field_statistics(field_boundaries_sf, week_num, year, mosaic_dir, report_date) - output_dir <- file.path(reports_dir, "kpis", "field_stats") + output_dir <- file.path(reports_dir, "field_stats") if (!dir.exists(output_dir)) { dir.create(output_dir, recursive = TRUE, showWarnings = FALSE) } @@ -812,7 +818,7 
@@ load_historical_field_data <- function(project_dir, current_week, current_year, target_year <- target$year csv_filename <- paste0(project_dir, "_field_analysis_week", sprintf("%02d_%d", target_week, target_year), ".csv") - csv_path <- file.path(reports_dir, "kpis", "field_analysis", csv_filename) + csv_path <- file.path(reports_dir, "field_analysis", csv_filename) if (file.exists(csv_path)) { tryCatch({ @@ -867,7 +873,7 @@ calculate_kpi_trends <- function(current_stats, prev_stats = NULL, prev_field_analysis <- NULL tryCatch({ - analysis_dir <- file.path(reports_dir, "kpis", "field_analysis") + analysis_dir <- file.path(reports_dir, "field_analysis") if (dir.exists(analysis_dir)) { analysis_files <- list.files(analysis_dir, pattern = "_field_analysis_week.*\\.csv$", full.names = TRUE) if (length(analysis_files) > 0) { @@ -899,7 +905,7 @@ calculate_kpi_trends <- function(current_stats, prev_stats = NULL, } rds_filename <- sprintf("%s_field_stats_week%02d_%d.rds", project_dir, target_week, target_year) - rds_path <- file.path(reports_dir, "kpis", "field_stats", rds_filename) + rds_path <- file.path(reports_dir, "field_stats", rds_filename) if (file.exists(rds_path)) { tryCatch({ @@ -920,7 +926,7 @@ calculate_kpi_trends <- function(current_stats, prev_stats = NULL, } rds_filename <- sprintf("%s_field_stats_week%02d_%d.rds", project_dir, target_week, target_year) - rds_path <- file.path(reports_dir, "kpis", "field_stats", rds_filename) + rds_path <- file.path(reports_dir, "field_stats", rds_filename) if (file.exists(rds_path)) { tryCatch({ diff --git a/MANUAL_PIPELINE_RUNNER.R b/r_app/MANUAL_PIPELINE_RUNNER.R similarity index 90% rename from MANUAL_PIPELINE_RUNNER.R rename to r_app/MANUAL_PIPELINE_RUNNER.R index 8bf2ba8..2cceb43 100644 --- a/MANUAL_PIPELINE_RUNNER.R +++ b/r_app/MANUAL_PIPELINE_RUNNER.R @@ -76,12 +76,19 @@ # python 00_download_8band_pu_optimized.py angata --date 2026-02-04 --resolution 3 --cleanup # # COMMAND #2 - Batch Download (Multiple Dates): +# 
For date ranges, MUST use download_planet_missing_dates.py (not Script 00) # # python download_planet_missing_dates.py --start [START_DATE] --end [END_DATE] --project [PROJECT] # # Example: # python download_planet_missing_dates.py --start 2026-01-28 --end 2026-02-04 --project angata # +# IMPORTANT DISTINCTION: +# - Script 00 (00_download_8band_pu_optimized.py): Only supports --date flag for SINGLE dates +# - Script download_planet_missing_dates.py: Supports --start/--end for DATE RANGES +# Script 00 does NOT have --start/--end flags, despite what earlier documentation suggested +# Use the correct script for your use case! +# # EXPECTED OUTPUT: # laravel_app/storage/app/angata/merged_tif/{YYYY-MM-DD}.tif (~150-300 MB per file) # @@ -110,15 +117,27 @@ # - One TIFF per field per date (1185 fields × N dates in Angata) # # PARAMETERS: -# PROJECT: angata, chemba, xinavane, esa, simba +# PROJECT: angata, chemba, xinavane, esa, simba (default: angata) +# END_DATE: YYYY-MM-DD format (e.g., 2026-02-09, default: today) +# OFFSET: Days to look back (e.g., 7 for one week, default: 7) # -# COMMAND: +# COMMAND #1 - Default (end date = today, 7-day window): # -# & "C:\Program Files\R\R-4.4.3\bin\x64\Rscript.exe" r_app/10_create_per_field_tiffs.R [PROJECT] +# & "C:\Program Files\R\R-4.4.3\bin\x64\Rscript.exe" r_app/10_create_per_field_tiffs.R angata # # Example: # & "C:\Program Files\R\R-4.4.3\bin\x64\Rscript.exe" r_app/10_create_per_field_tiffs.R angata # +# COMMAND #2 - Specific Date Range: +# +# & "C:\Program Files\R\R-4.4.3\bin\x64\Rscript.exe" r_app/10_create_per_field_tiffs.R [PROJECT] [END_DATE] [OFFSET] +# +# Example (one week back from 2026-02-09): +# & "C:\Program Files\R\R-4.4.3\bin\x64\Rscript.exe" r_app/10_create_per_field_tiffs.R angata 2026-02-09 7 +# +# Example (two weeks back from 2026-02-09): +# & "C:\Program Files\R\R-4.4.3\bin\x64\Rscript.exe" r_app/10_create_per_field_tiffs.R angata 2026-02-09 14 +# # EXPECTED OUTPUT: # Total files created: #fields × #dates (e.g., 
1185 × 8 = 9,480 files) # Storage location: laravel_app/storage/app/angata/field_tiles/ @@ -157,7 +176,7 @@ # & "C:\Program Files\R\R-4.4.3\bin\x64\Rscript.exe" r_app/20_ci_extraction_per_field.R [PROJECT] [END_DATE] [OFFSET] # # Example: -# & "C:\Program Files\R\R-4.4.3\bin\x64\Rscript.exe" r_app/20_ci_extraction_per_field.R angata 2026-02-04 7 +# & "C:\Program Files\R\R-4.4.3\bin\x64\Rscript.exe" r_app/20_ci_extraction_per_field.R angata 2026-02-09 7 # # EXPECTED OUTPUT: # Total files created: #fields × #dates in both field_tiles_CI/ and daily_vals/ @@ -170,12 +189,6 @@ # Example: END_DATE=2026-02-04, OFFSET=7 → processes 2026-01-28 to 2026-02-04 (8 dates) # To process all existing merged_tif files: Use large OFFSET (e.g., 365) # -# TROUBLESHOOTING: -# ❌ If field_tiles_CI has fewer files than field_tiles: -# - Check if all field_tiles/{FIELD}/{DATE}.tif files exist -# - Script 20 may be skipping due to incomplete source files -# - Solution: Delete problematic files from field_tiles and re-run Script 10 -# # ============================================================================ @@ -208,7 +221,6 @@ # EXPECTED OUTPUT: # File: All_pivots_Cumulative_CI_quadrant_year_v2.rds # Contains: Interpolated CI data for all fields (wide format) -# Script execution time: 5-15 minutes # # ============================================================================ @@ -243,7 +255,6 @@ # EXPECTED OUTPUT: # File: ci_data_for_python.csv (~5-10 MB) # Rows: #fields × #dates (e.g., 1185 × 100 = ~118,500 rows) -# Script execution time: 1-2 minutes # # ============================================================================ @@ -283,7 +294,6 @@ # EXPECTED OUTPUT: # File: {PROJECT}_harvest_imminent_week_{WW}_{YYYY}.csv # Rows: One per field (e.g., 1185 rows for Angata) -# Script execution time: 2-5 minutes # # NOTE: Skip this step if harvest.xlsx doesn't exist or is incomplete # @@ -319,9 +329,6 @@ # Example (one week window): # & "C:\Program Files\R\R-4.4.3\bin\x64\Rscript.exe" 
r_app/40_mosaic_creation_per_field.R 2026-02-04 7 angata # -# Example (two week window): -# & "C:\Program Files\R\R-4.4.3\bin\x64\Rscript.exe" r_app/40_mosaic_creation_per_field.R 2026-02-04 14 angata -# # EXPECTED OUTPUT: # Location: laravel_app/storage/app/angata/weekly_mosaic/ # Directory structure: weekly_mosaic/{FIELD_ID}/week_06_2026.tif @@ -360,23 +367,23 @@ # - 21 columns with field-level KPIs and alerts # # PARAMETERS: -# PROJECT: angata, chemba, xinavane, esa, simba -# WEEK: ISO week number (1-53, optional - default current week) -# YEAR: ISO year (optional - default current year) +# END_DATE: Report date in YYYY-MM-DD format (default: today) +# PROJECT: Project name: angata, chemba, xinavane, esa, simba (default: angata) +# OFFSET: Days to look back for historical comparison (default: 7, for backward compatibility) # -# COMMAND #1 - Current Week (Auto-detects from TODAY): +# COMMAND #1 - Current Date & Default Project (Auto-detects TODAY): # -# & "C:\Program Files\R\R-4.4.3\bin\x64\Rscript.exe" r_app/80_calculate_kpis.R [PROJECT] +# & "C:\Program Files\R\R-4.4.3\bin\x64\Rscript.exe" r_app/80_calculate_kpis.R # # Example: -# & "C:\Program Files\R\R-4.4.3\bin\x64\Rscript.exe" r_app/80_calculate_kpis.R angata +# & "C:\Program Files\R\R-4.4.3\bin\x64\Rscript.exe" r_app/80_calculate_kpis.R # -# COMMAND #2 - Specific Week & Year: +# COMMAND #2 - Specific Date & Project: # -# & "C:\Program Files\R\R-4.4.3\bin\x64\Rscript.exe" r_app/80_calculate_kpis.R [PROJECT] [WEEK] [YEAR] +# & "C:\Program Files\R\R-4.4.3\bin\x64\Rscript.exe" r_app/80_calculate_kpis.R [END_DATE] [PROJECT] [OFFSET] # -# Example (Week 5, Year 2026): -# & "C:\Program Files\R\R-4.4.3\bin\x64\Rscript.exe" r_app/80_calculate_kpis.R angata 5 2026 +# Example (2026-02-09, angata, 7-day lookback): +# & "C:\Program Files\R\R-4.4.3\bin\x64\Rscript.exe" r_app/80_calculate_kpis.R 2026-02-09 angata 7 # # EXPECTED OUTPUT: # File: {PROJECT}_field_analysis_week{WW}_{YYYY}.xlsx @@ -390,6 +397,11 @@ # 
tcch_forecast, growth_4wk, growth_8wk, trend_indicator, weed_presence, # spatial_cluster, alert_urgency, alert_type, alert_message, etc. # +# CRITICAL DIFFERENCE - R80 Uses Different Argument Order Than R40: +# R40 order: [END_DATE] [OFFSET] [PROJECT] +# R80 order: [END_DATE] [PROJECT] [OFFSET] +# These are NOT the same! Ensure correct order for each script. +# # ============================================================================ @@ -469,12 +481,15 @@ # # Steps: # 1. SKIP Python download (if you already have data) -# 2. Run R10: & "C:\Program Files\R\R-4.4.3\bin\x64\Rscript.exe" r_app/10_create_per_field_tiffs.R angata +# 2. Run R10: & "C:\Program Files\R\R-4.4.3\bin\x64\Rscript.exe" r_app/10_create_per_field_tiffs.R angata 2026-02-04 7 +# (Argument order: [PROJECT] [END_DATE] [OFFSET]) # 3. Run R20: & "C:\Program Files\R\R-4.4.3\bin\x64\Rscript.exe" r_app/20_ci_extraction_per_field.R angata 2026-02-04 7 # 4. Run R30: & "C:\Program Files\R\R-4.4.3\bin\x64\Rscript.exe" r_app/30_interpolate_growth_model.R angata # 5. Run R21: & "C:\Program Files\R\R-4.4.3\bin\x64\Rscript.exe" r_app/21_convert_ci_rds_to_csv.R angata # 6. Run R40: & "C:\Program Files\R\R-4.4.3\bin\x64\Rscript.exe" r_app/40_mosaic_creation_per_field.R 2026-02-04 7 angata -# 7. Run R80: & "C:\Program Files\R\R-4.4.3\bin\x64\Rscript.exe" r_app/80_calculate_kpis.R angata +# (Argument order: [END_DATE] [OFFSET] [PROJECT]) +# 7. Run R80: & "C:\Program Files\R\R-4.4.3\bin\x64\Rscript.exe" r_app/80_calculate_kpis.R 2026-02-04 angata 7 +# (Argument order: [END_DATE] [PROJECT] [OFFSET] - DIFFERENT from R40!) # 8. OPTIONAL R91 (Cane Supply) - Use automated runner: # & "C:\Program Files\R\R-4.4.3\bin\x64\Rscript.exe" r_app/run_full_pipeline.R # OR from R console: @@ -492,7 +507,9 @@ # # Steps: # 1. Python download (your entire date range) -# 2. Run R10 once (processes all dates) +# 2. 
Run R10 with large offset to process all historical dates: +# & "C:\Program Files\R\R-4.4.3\bin\x64\Rscript.exe" r_app/10_create_per_field_tiffs.R angata 2026-02-04 365 +# (This processes from 2025-02-04 to 2026-02-04, covering entire year) # 3. Run R20 with large offset to process all historical dates: # & "C:\Program Files\R\R-4.4.3\bin\x64\Rscript.exe" r_app/20_ci_extraction_per_field.R angata 2026-02-04 365 # (This processes from 2025-02-04 to 2026-02-04, covering entire year) @@ -611,3 +628,4 @@ # laravel_app/storage/app/{PROJECT}/output/SmartCane_Report_week{WW}_{YYYY}.docx # # ============================================================================== +