# ============================================================================
# SCRIPT 21: Convert CI RDS to CSV (Python Compatibility Format)
# ============================================================================
# PURPOSE:
#   Convert consolidated CI data from R's RDS format (wide or long) to a
#   Python-compatible long-format CSV. Prepares per-field CI time series for
#   harvest detection models and Python ML workflows. Interpolated values from
#   the growth model are exported as-is when present; when they are absent,
#   this script builds daily sequences and interpolates them linearly itself.
#
# INPUT DATA:
#   - Source: laravel_app/storage/app/{project}/Data/extracted_ci/cumulative_vals/All_pivots_Cumulative_CI_quadrant_year_v2.rds
#   - Format: RDS (interpolated growth model data from Script 30)
#   - Requirement: Script 30 must have completed growth model interpolation
#
# OUTPUT DATA:
#   - Destination: laravel_app/storage/app/{project}/Data/extracted_ci/cumulative_vals/
#   - File: ci_data_for_python.csv (written to paths$ci_for_python_dir)
#   - Format: CSV (long format)
#   - Columns: field, sub_field, Date, FitData, DOY, value
#
# USAGE:
#   Rscript 21_convert_ci_rds_to_csv.R [project]
#
#   Example (Windows PowerShell):
#   & "C:\Program Files\R\R-4.4.3\bin\x64\Rscript.exe" r_app/21_convert_ci_rds_to_csv.R angata
#
# PARAMETERS:
#   - project: Project name (character) - angata, chemba, xinavane, esa, simba
#     (defaults to "angata" when neither an argument nor a global project_dir
#     is available)
#
# CLIENT TYPES:
#   - cane_supply (ANGATA): Yes - data export
#   - agronomic_support (AURA): Yes - Python ML integration
#
# DEPENDENCIES:
#   - Packages: here, tidyverse, lubridate, zoo
#   - Utils files: parameters_project.R
#   - Input data: All_pivots_Cumulative_CI_quadrant_year_v2.rds from Script 30
#   - Data directories: extracted_ci/cumulative_vals/
#
# NOTES:
#   - Data source: Uses interpolated CI data from Script 30 (growth model output)
#   - Handles both wide format and long format inputs from growth model
#   - DOY: When this script builds the daily sequences itself, DOY is a
#     continuous day counter (1, 2, 3, ...) starting at each field/sub_field's
#     first date, not the calendar day of year. When the growth model output
#     already contains a `value` column, the data is exported unchanged.
#   - Python integration: CSV format compatible with pandas/scikit-learn workflows
#   - Used by: Python harvest detection models (harvest_date_prediction.py)
#   - Exports complete growth curves with interpolated values for ML training
#
# RELATED ISSUES:
#   SC-112: Utilities restructuring
#   SC-108: Core pipeline improvements
#
# ============================================================================

suppressPackageStartupMessages({
  # File path handling
  library(here)       # For relative path resolution (platform-independent file paths)

  # Data manipulation
  library(tidyverse)  # For dplyr, tidyr, purrr, readr (data wrangling and CSV I/O)
  library(lubridate)  # For date/time operations
  library(zoo)        # For na.approx (linear gap filling)
})

# ============================================================================
# HELPER FUNCTIONS
# ============================================================================

#' Convert wide format RDS to long format
#'
#' @param ci_data_wide Tibble with columns: field, sub_field, and one column
#'   per date (the column names are passed to `as.Date()`).
#' @return Long format tibble: field, sub_field, Date, FitData. Rows whose
#'   FitData is missing (or becomes NA after numeric conversion) are dropped.
wide_to_long_ci_data <- function(ci_data_wide) {
  ci_data_wide %>%
    pivot_longer(
      cols = -c(field, sub_field),
      names_to = "Date",
      values_to = "FitData",
      values_drop_na = TRUE
    ) %>%
    mutate(
      Date = as.Date(Date),
      FitData = as.numeric(FitData)
    ) %>%
    # as.numeric() may introduce new NAs (e.g. from character cells)
    filter(!is.na(FitData))
}

#' Create daily interpolated sequences with DOY for each field
#'
#' For each field/sub_field combination, creates a complete daily sequence from
#' the first to the last measurement date, fills in the measurements, and
#' linearly interpolates the days in between.
#'
#' @param ci_data_long Long format tibble: field, sub_field, Date, FitData
#' @return Ungrouped tibble with: field, sub_field, Date, FitData, DOY, value.
#'   `value` holds the raw measurement (NA on days without one), `FitData` the
#'   linearly interpolated series, and `DOY` a continuous day counter
#'   (1, 2, 3, ...) starting at the group's first date. An input without any
#'   dated rows yields a zero-row tibble with the same columns.
create_interpolated_daily_sequences <- function(ci_data_long) {
  required_cols <- c("field", "sub_field", "Date", "FitData")
  missing_cols <- setdiff(required_cols, names(ci_data_long))
  if (length(missing_cols) > 0) {
    stop(
      "ci_data_long is missing required column(s): ",
      paste(missing_cols, collapse = ", "),
      call. = FALSE
    )
  }

  ci_data_long <- ci_data_long %>%
    ungroup() %>%
    mutate(Date = as.Date(Date))

  # Rows without a date cannot be placed on the daily grid (and would make
  # seq() fail), so they are dropped with a warning.
  n_undated <- sum(is.na(ci_data_long$Date))
  if (n_undated > 0) {
    warning(
      sprintf("Dropping %d row(s) with missing Date before interpolation.", n_undated),
      call. = FALSE
    )
    ci_data_long <- ci_data_long %>% filter(!is.na(Date))
  }

  # Nothing to expand: return an empty frame with the documented columns
  if (nrow(ci_data_long) == 0) {
    return(
      ci_data_long %>%
        transmute(
          field,
          sub_field,
          Date,
          FitData = as.numeric(FitData),
          DOY = integer(0),
          value = numeric(0)
        )
    )
  }

  # Expand one group's measurements onto a gap-free daily grid
  build_daily_series <- function(df) {
    df <- df %>% arrange(Date)
    date_seq <- seq(min(df$Date), max(df$Date), by = "day")

    # Place measurements on the grid in one vectorised step. When a date
    # occurs more than once, the later row overwrites the earlier one.
    observed <- rep(NA_real_, length(date_seq))
    grid_pos <- match(df$Date, date_seq)
    on_grid <- !is.na(grid_pos)
    observed[grid_pos[on_grid]] <- as.numeric(df$FitData[on_grid])

    tibble(
      Date = date_seq,
      value = observed,
      # Linear interpolation between measurements
      FitData = zoo::na.approx(observed, na.rm = FALSE),
      # Continuous day counter: 1, 2, 3, ...
      DOY = seq_along(date_seq)
    )
  }

  ci_data_long %>%
    group_by(field, sub_field) %>%
    nest() %>%
    mutate(data = map(data, build_daily_series)) %>%
    unnest(data) %>%
    ungroup() %>%
    select(field, sub_field, Date, FitData, DOY, value) %>%
    arrange(field, Date)
}

#' Validate conversion output
#'
#' Prints summary statistics (field count, row count, date range, FitData
#' range, raw vs. interpolated counts) to the console.
#'
#' @param ci_data_python Tibble with converted CI data; reads the columns
#'   field, Date, FitData and value.
#' @return Invisibly returns the tibble (for piping)
validate_conversion_output <- function(ci_data_python) {
  cat(sprintf("\nValidation:\n"))
  cat(sprintf(" Unique fields: %d\n", n_distinct(ci_data_python$field)))
  cat(sprintf(" Total daily rows: %d\n", nrow(ci_data_python)))
  cat(sprintf(
    " Date range: %s to %s\n",
    min(ci_data_python$Date, na.rm = TRUE),
    max(ci_data_python$Date, na.rm = TRUE)
  ))
  cat(sprintf(
    " FitData range: %.2f to %.2f\n",
    min(ci_data_python$FitData, na.rm = TRUE),
    max(ci_data_python$FitData, na.rm = TRUE)
  ))
  cat(sprintf(" Raw measurements: %d\n", sum(!is.na(ci_data_python$value))))
  cat(sprintf(
    " Interpolated values: %d\n",
    sum(is.na(ci_data_python$value) & !is.na(ci_data_python$FitData))
  ))

  invisible(ci_data_python)
}

#' Print next steps message
print_next_steps <- function() {
  cat("\nNext steps for Python harvest detection:\n")
  cat(" 1. Read this CSV file in Python\n")
  cat(" 2. Group by field to identify seasons\n")
  cat(" 3. Run LSTM model to detect harvest dates\n")
  cat(" 4. Save predicted harvest dates to Excel\n")
  cat(" 5. Use output in script 03 for interpolation\n")
}

# ============================================================================
# MAIN FUNCTION
# ============================================================================

#' Run the RDS -> CSV conversion
#'
#' Resolves the project (first command line argument, then a global
#' `project_dir`, then "angata"), sources parameters_project.R, loads the
#' growth model RDS, converts/interpolates if needed, and writes
#' ci_data_for_python.csv.
main <- function() {
  # Process command line arguments
  args <- commandArgs(trailingOnly = TRUE)

  # Get project directory
  if (length(args) >= 1 && !is.na(args[1])) {
    project_dir <- as.character(args[1])
  } else if (exists("project_dir", envir = .GlobalEnv)) {
    project_dir <- get("project_dir", envir = .GlobalEnv)
  } else {
    project_dir <- "angata"
  }

  # Make available globally
  assign("project_dir", project_dir, envir = .GlobalEnv)

  cat(sprintf("Converting CI RDS to CSV: project=%s\n", project_dir))

  # Initialize project configuration and centralized paths.
  # The underlying error is reported so that a failure inside
  # parameters_project.R is not mistaken for a missing file.
  tryCatch({
    source("parameters_project.R")
  }, error = function(e) {
    warning(
      "Could not source parameters_project.R from the working directory (",
      conditionMessage(e),
      "). Attempting from 'r_app' directory.",
      call. = FALSE
    )
    tryCatch({
      source(here::here("r_app", "parameters_project.R"))
    }, error = function(e2) {
      stop(
        "Failed to source parameters_project.R from both default and 'r_app' directories: ",
        conditionMessage(e2),
        call. = FALSE
      )
    })
  })

  # Load centralized path structure (creates all directories automatically)
  paths <- setup_project_directories(project_dir)

  # Use centralized paths (no need for dir.create - already handled)
  ci_data_source_dir <- paths$cumulative_ci_vals_dir
  ci_data_output_dir <- paths$ci_for_python_dir
  if (is.null(ci_data_source_dir) || is.null(ci_data_output_dir)) {
    stop(
      "setup_project_directories() did not return 'cumulative_ci_vals_dir' and 'ci_for_python_dir'.",
      call. = FALSE
    )
  }

  # Try to load interpolated growth model data from Script 30
  input_file <- file.path(ci_data_source_dir, "All_pivots_Cumulative_CI_quadrant_year_v2.rds")
  output_file <- file.path(ci_data_output_dir, "ci_data_for_python.csv")

  # Check if input file exists
  if (!file.exists(input_file)) {
    stop(paste(
      "Input file not found:", input_file,
      "\nScript 30 (growth model) must be run before Script 21."
    ))
  }

  cat(sprintf("Loading: %s\n", input_file))

  # Load RDS file (from Script 30 - expected in long format with interpolated values)
  ci_data <- readRDS(input_file) %>%
    as_tibble()

  cat(sprintf(" Loaded %d rows\n", nrow(ci_data)))
  cat(sprintf(" Columns: %s\n", paste(names(ci_data), collapse = ", ")))

  # Check format and prepare for export
  # If it's already in long format (from Script 30), use as-is
  # Otherwise, convert from wide to long
  col_names <- names(ci_data)
  if ("Date" %in% col_names || "date" %in% col_names) {
    cat(" Detected: LONG format (from growth model)\n")
    ci_data_long <- ci_data
    # Normalise a lowercase `date` column so the steps below can rely on `Date`
    if (!("Date" %in% col_names)) {
      ci_data_long <- ci_data_long %>% rename(Date = "date")
    }
  } else {
    cat(" Detected: WIDE format - converting to long...\n")
    ci_data_long <- wide_to_long_ci_data(ci_data)
  }

  # Step 1: Ensure Date column exists and is properly formatted
  ci_data_long <- ci_data_long %>%
    mutate(Date = as.Date(Date))

  # Step 2: If interpolated values already present, use them; otherwise create
  # interpolated sequences
  if ("value" %in% names(ci_data_long)) {
    # Already has interpolated values from Script 30
    cat("\nStep 2: Using interpolated values from growth model...\n")
    ci_data_python <- ci_data_long
  } else {
    # Create interpolated daily sequences
    cat("\nStep 2: Creating complete daily sequences with interpolation...\n")
    ci_data_python <- create_interpolated_daily_sequences(ci_data_long)
  }

  # Step 3: Print validation summary (only when the columns it reads exist)
  if (all(c("field", "Date", "FitData", "value") %in% names(ci_data_python))) {
    validate_conversion_output(ci_data_python)
  }

  # Step 4: Save to CSV
  cat(sprintf("\nStep 4: Saving to CSV...\n"))
  cat(sprintf(" Output: %s\n", output_file))
  write_csv(ci_data_python, output_file)

  cat(sprintf("\n✓ Successfully created CSV with %d rows\n", nrow(ci_data_python)))
  print_next_steps()
}

if (sys.nframe() == 0) {
  main()
}