# 02b_CONVERT_CI_RDS_TO_CSV.R
# ============================
# Convert combined_CI_data.rds (output of script 02) to CSV format for Python.
# This script runs AFTER script 02 (CI extraction) and creates a CSV that Python
# can use for harvest date detection WITHOUT requiring the 'model' column (which
# comes from script 03 after interpolation and harvest dates are known).
#
# Usage: Rscript 02b_convert_ci_rds_to_csv.R [project_dir]
#   - project_dir: Project directory name (e.g., "esa", "chemba", "angata")
#
# Input:  laravel_app/storage/app/{project_dir}/Data/extracted_ci/
#           cumulative_vals/combined_CI_data.rds
# Output: laravel_app/storage/app/{project_dir}/Data/extracted_ci/
#           ci_data_for_python/ci_data_for_python.csv
#   Columns: field, sub_field, Date, FitData, DOY, value
#     - value:   raw measurement (NA on days without a measurement)
#     - FitData: value with gaps filled by linear interpolation
#     - DOY:     running day counter per field/sub_field (1 = first date),
#                not the calendar day of year
#

suppressPackageStartupMessages({
  library(tidyverse)
  library(lubridate)
  library(zoo)
  library(here)
})

# ============================================================================
# HELPER FUNCTIONS
# ============================================================================

#' Convert wide format RDS to long format
#'
#' Every column other than `field` and `sub_field` is treated as a date column
#' whose name is parsed with `as.Date()`. Rows with a missing value or an
#' unparseable date are dropped.
#'
#' @param ci_data_wide Tibble with columns: field, sub_field, and dates as
#'   columns
#' @return Long format tibble: field, sub_field, Date, FitData
wide_to_long_ci_data <- function(ci_data_wide) {
  missing_cols <- setdiff(c("field", "sub_field"), names(ci_data_wide))
  if (length(missing_cols) > 0) {
    stop(
      "Input is missing required column(s): ",
      paste(missing_cols, collapse = ", "),
      call. = FALSE
    )
  }

  ci_data_wide %>%
    pivot_longer(
      cols = -c(field, sub_field),
      names_to = "Date",
      values_to = "FitData",
      values_drop_na = TRUE
    ) %>%
    mutate(
      Date = as.Date(Date),
      FitData = as.numeric(FitData)
    ) %>%
    # NA dates would break the min/max date sequence built downstream
    filter(!is.na(Date), !is.na(FitData))
}

#' Build the daily, interpolated series for one field/sub_field group
#'
#' @param df Tibble with columns Date and FitData for a single group; must
#'   contain at least one row and no missing dates
#' @return Tibble with one row per day between the first and last date:
#'   Date, value (raw measurement or NA), FitData (interpolated), DOY
#' @noRd
build_daily_sequence <- function(df) {
  df <- df %>% arrange(Date)

  date_seq <- seq(min(df$Date), max(df$Date), by = "day")
  daily_df <- tibble(
    Date = date_seq,
    value = NA_real_,
    FitData = NA_real_,
    DOY = seq_along(date_seq) # Continuous day counter: 1, 2, 3, ...
  )

  # Vectorized fill. Every measurement date lies inside date_seq, and when a
  # date occurs more than once the last measurement wins.
  daily_df$value[match(df$Date, daily_df$Date)] <- df$FitData

  # Linear interpolation needs two anchor points; with fewer there is nothing
  # to interpolate, so the raw values are used as they are.
  if (sum(!is.na(daily_df$value)) >= 2L) {
    daily_df$FitData <- zoo::na.approx(daily_df$value, na.rm = FALSE)
  } else {
    daily_df$FitData <- daily_df$value
  }

  daily_df
}

#' Create daily interpolated sequences with DOY for each field
#'
#' For each field/sub_field combination, creates a complete daily sequence from
#' the first to the last date, fills in measurements, and linearly interpolates
#' the missing dates.
#'
#' @param ci_data_long Long format tibble: field, sub_field, Date, FitData
#' @return Ungrouped tibble with: field, sub_field, Date, FitData, DOY, value,
#'   ordered by field and Date
create_interpolated_daily_sequences <- function(ci_data_long) {
  ci_data_long %>%
    filter(!is.na(Date), !is.na(FitData)) %>%
    group_by(field, sub_field) %>%
    nest() %>%
    mutate(data = map(data, build_daily_sequence)) %>%
    unnest(data) %>%
    # Drop the grouping so callers receive a plain tibble
    ungroup() %>%
    select(field, sub_field, Date, FitData, DOY, value) %>%
    arrange(field, Date)
}

#' Validate conversion output
#'
#' Prints summary statistics of the converted data to the console.
#'
#' @param ci_data_python Tibble with converted CI data
#' @return Invisibly returns the tibble (for piping)
validate_conversion_output <- function(ci_data_python) {
  is_raw <- !is.na(ci_data_python$value)
  is_fitted <- !is.na(ci_data_python$FitData)

  cat(sprintf("\nValidation:\n"))
  cat(sprintf("  Unique fields: %d\n", n_distinct(ci_data_python$field)))
  cat(sprintf("  Total daily rows: %d\n", nrow(ci_data_python)))
  cat(sprintf(
    "  Date range: %s to %s\n",
    min(ci_data_python$Date, na.rm = TRUE),
    max(ci_data_python$Date, na.rm = TRUE)
  ))
  cat(sprintf(
    "  FitData range: %.2f to %.2f\n",
    min(ci_data_python$FitData, na.rm = TRUE),
    max(ci_data_python$FitData, na.rm = TRUE)
  ))
  cat(sprintf("  Raw measurements: %d\n", sum(is_raw)))
  cat(sprintf("  Interpolated values: %d\n", sum(!is_raw & is_fitted)))

  invisible(ci_data_python)
}

#' Print next steps message
print_next_steps <- function() {
  cat("\nNext steps for Python harvest detection:\n")
  cat("  1. Read this CSV file in Python\n")
  cat("  2. Group by field to identify seasons\n")
  cat("  3. Run LSTM model to detect harvest dates\n")
  cat("  4. Save predicted harvest dates to Excel\n")
  cat("  5. Use output in script 03 for interpolation\n")
}

#' Source parameters_project.R, falling back to the 'r_app' directory
#'
#' The underlying error message is included in the warning/error so that a
#' failure inside parameters_project.R is not mistaken for a missing file.
#'
#' @return Called for its side effect of sourcing the project parameters.
#' @noRd
source_project_parameters <- function() {
  tryCatch({
    source("parameters_project.R")
  }, error = function(e) {
    warning(
      "Default parameters_project.R not found. ",
      "Attempting from 'r_app' directory. (", conditionMessage(e), ")",
      call. = FALSE
    )
    tryCatch({
      source(here::here("r_app", "parameters_project.R"))
    }, error = function(e2) {
      stop(
        "Failed to source parameters_project.R from both default and ",
        "'r_app' directories. (", conditionMessage(e2), ")",
        call. = FALSE
      )
    })
  })
}

# ============================================================================
# MAIN FUNCTION
# ============================================================================

#' Convert the project's combined CI RDS file to the CSV consumed by Python
#'
#' The project directory is taken from the first command line argument, then
#' from a global `project_dir` variable, and finally defaults to "angata".
main <- function() {
  # Process command line arguments
  args <- commandArgs(trailingOnly = TRUE)

  # Get project directory
  if (length(args) >= 1 && !is.na(args[1])) {
    project_dir <- as.character(args[1])
  } else if (exists("project_dir", envir = .GlobalEnv)) {
    project_dir <- get("project_dir", envir = .GlobalEnv)
  } else {
    project_dir <- "angata"
  }

  # Make available globally
  # NOTE(review): presumably read by parameters_project.R - confirm
  assign("project_dir", project_dir, envir = .GlobalEnv)

  cat(sprintf("Converting CI RDS to CSV: project=%s\n", project_dir))

  # Initialize project configuration
  source_project_parameters()

  # Define paths
  ci_data_source_dir <- here::here(
    "laravel_app", "storage", "app", project_dir,
    "Data", "extracted_ci", "cumulative_vals"
  )
  ci_data_output_dir <- here::here(
    "laravel_app", "storage", "app", project_dir,
    "Data", "extracted_ci", "ci_data_for_python"
  )

  # Create output directory if it doesn't exist (for new projects)
  if (!dir.exists(ci_data_output_dir)) {
    dir.create(ci_data_output_dir, recursive = TRUE, showWarnings = FALSE)
    cat(sprintf("✓ Created output directory: %s\n", ci_data_output_dir))
  }

  input_file <- file.path(ci_data_source_dir, "combined_CI_data.rds")
  output_file <- file.path(ci_data_output_dir, "ci_data_for_python.csv")

  # Check if input file exists
  if (!file.exists(input_file)) {
    stop(paste("Input file not found:", input_file))
  }

  cat(sprintf("Loading: %s\n", input_file))

  # Load RDS file
  ci_data_wide <- readRDS(input_file) %>%
    as_tibble()

  cat(sprintf("  Loaded %d rows\n", nrow(ci_data_wide)))
  cat(sprintf("  Format: WIDE (field, sub_field, then dates as columns)\n"))
  # head() avoids printing NA when the file has fewer than six columns
  cat(sprintf(
    "  Sample columns: %s\n",
    paste(head(names(ci_data_wide), 6), collapse = ", ")
  ))

  # Step 1: Convert from WIDE to LONG format
  cat("\nStep 1: Converting from wide to long format...\n")
  ci_data_long <- wide_to_long_ci_data(ci_data_wide)

  if (nrow(ci_data_long) == 0) {
    stop(paste("No non-missing CI values found in:", input_file))
  }

  # Step 2: Create complete daily sequences with interpolation
  cat("\nStep 2: Creating complete daily sequences with interpolation...\n")
  ci_data_python <- create_interpolated_daily_sequences(ci_data_long)

  # Step 3: Validate output
  cat("\nStep 3: Validating output...")
  validate_conversion_output(ci_data_python)

  # Step 4: Save to CSV
  cat(sprintf("\nStep 4: Saving to CSV...\n"))
  cat(sprintf("  Output: %s\n", output_file))
  write_csv(ci_data_python, output_file)

  cat(sprintf(
    "\n✓ Successfully created CSV with %d rows\n",
    nrow(ci_data_python)
  ))
  print_next_steps()
}

if (sys.nframe() == 0) {
  main()
}