# SmartCane/r_app/02b_convert_ci_rds_to_csv.R

# 02b_CONVERT_CI_RDS_TO_CSV.R
# ============================
# Convert combined_CI_data.rds (output of script 02) to CSV format for Python
# This script runs AFTER script 02 (CI extraction) and creates a CSV that Python
# can use for harvest date detection WITHOUT requiring the 'model' column (which
# comes from script 03 after interpolation and harvest dates are known).
#
# Usage: Rscript 02b_convert_ci_rds_to_csv.R [project_dir]
#   - project_dir: Project directory name (e.g., "esa", "chemba", "angata")
#
# Output: CSV file at laravel_app/storage/app/{project_dir}/Data/extracted_ci/cumulative_vals/ci_data_for_python.csv
#   Columns: field, sub_field, Date, FitData, DOY, value (alias for FitData)
#

suppressPackageStartupMessages({
  library(tidyverse)
  library(lubridate)
  library(here)
})

# Convert combined_CI_data.rds to ci_data_for_python.csv for one project.
#
# Steps:
#   1. Resolve the project directory (first CLI argument, else an existing
#      global `project_dir`, else "esa") and publish it to .GlobalEnv.
#   2. Source parameters_project.R (working directory first, then r_app/).
#   3. Read the RDS, check that the required columns exist, add `value` as a
#      copy of FitData, keep the six output columns, sort by field and Date.
#   4. Print a short validation summary and write the CSV.
#
# Returns: the output CSV path, invisibly.
# Errors:  if parameters_project.R cannot be sourced, if the input RDS does
#          not exist, or if a required column is missing from it.
main <- function() {
  # Process command line arguments
  args <- commandArgs(trailingOnly = TRUE)

  # Get project directory: CLI argument > existing global > default
  if (length(args) >= 1 && !is.na(args[1])) {
    project_dir <- as.character(args[1])
  } else if (exists("project_dir", envir = .GlobalEnv)) {
    project_dir <- get("project_dir", envir = .GlobalEnv)
  } else {
    project_dir <- "esa"
  }

  # Make available globally.
  # NOTE(review): presumably parameters_project.R reads `project_dir` from the
  # global environment -- confirm against that script.
  assign("project_dir", project_dir, envir = .GlobalEnv)

  cat(sprintf("Converting CI RDS to CSV: project=%s\n", project_dir))

  # Initialize project configuration. The handlers fire on ANY error raised
  # while sourcing (not only a missing file), so the underlying message is
  # surfaced instead of being replaced by a generic "not found".
  tryCatch({
    source("parameters_project.R")
  }, error = function(e) {
    warning(
      "Could not source parameters_project.R from the working directory (",
      conditionMessage(e),
      "). Attempting from 'r_app' directory.",
      call. = FALSE
    )
    tryCatch({
      source(here::here("r_app", "parameters_project.R"))
    }, error = function(e2) {
      stop(
        "Failed to source parameters_project.R from both default and ",
        "'r_app' directories: ", conditionMessage(e2),
        call. = FALSE
      )
    })
  })

  # Define paths
  ci_data_dir <- here::here(
    "laravel_app", "storage", "app", project_dir,
    "Data", "extracted_ci", "cumulative_vals"
  )
  input_file <- file.path(ci_data_dir, "combined_CI_data.rds")
  output_file <- file.path(ci_data_dir, "ci_data_for_python.csv")

  # Check if input file exists
  if (!file.exists(input_file)) {
    stop(paste("Input file not found:", input_file))
  }

  cat(sprintf("Loading: %s\n", input_file))

  # Load RDS file
  ci_data <- readRDS(input_file) %>%
    as_tibble()

  cat(sprintf("  Loaded %d rows\n", nrow(ci_data)))
  cat(sprintf("  Columns: %s\n", paste(names(ci_data), collapse = ", ")))

  # Fail early, naming exactly which expected columns are absent, rather than
  # relying on a downstream dplyr error.
  required_cols <- c("field", "sub_field", "Date", "FitData", "DOY")
  missing_cols <- setdiff(required_cols, names(ci_data))
  if (length(missing_cols) > 0) {
    stop(
      sprintf(
        "Input file %s is missing required column(s): %s",
        input_file,
        paste(missing_cols, collapse = ", ")
      ),
      call. = FALSE
    )
  }

  # Prepare data for Python
  ci_data_python <- ci_data %>%
    # Add 'value' as an alias for FitData (sometimes needed)
    mutate(value = FitData) %>%
    # Keep only necessary columns
    select(field, sub_field, Date, FitData, DOY, value) %>%
    # Sort by field and date
    arrange(field, Date)

  if (nrow(ci_data_python) == 0) {
    warning(
      "Input file contains no rows; the CSV will contain only a header.",
      call. = FALSE
    )
  }

  # Validate data
  cat(sprintf("\nValidation:\n"))
  cat(sprintf("  Unique fields: %d\n", n_distinct(ci_data_python$field)))

  # min()/max() on zero non-missing values warn and yield Inf/-Inf, so the
  # empty and all-NA cases are reported explicitly instead.
  valid_dates <- ci_data_python$Date[!is.na(ci_data_python$Date)]
  if (length(valid_dates) > 0) {
    cat(sprintf("  Date range: %s to %s\n",
                min(valid_dates),
                max(valid_dates)))
  } else {
    cat("  Date range: no valid dates\n")
  }

  fit_values <- ci_data_python$FitData
  if (any(!is.na(fit_values))) {
    cat(sprintf("  FitData range: %.2f to %.2f\n",
                min(fit_values, na.rm = TRUE),
                max(fit_values, na.rm = TRUE)))
  } else {
    cat("  FitData range: no non-missing values\n")
  }
  cat(sprintf("  Missing FitData: %d rows\n", sum(is.na(fit_values))))

  # Save to CSV
  cat(sprintf("\nSaving to: %s\n", output_file))

  write_csv(ci_data_python, output_file)

  cat(sprintf("✓ Successfully created CSV with %d rows\n", nrow(ci_data_python)))
  cat("\nNext steps for Python harvest detection:\n")
  cat("  1. Read this CSV file in Python\n")
  cat("  2. Group by field to identify seasons\n")
  cat("  3. Run LSTM model to detect harvest dates\n")
  cat("  4. Save predicted harvest dates to Excel\n")
  cat("  5. Use output in script 03 for interpolation\n")

  # Hand the written path back to callers without printing it.
  invisible(output_file)
}

# Entry point: invoke main() only when this code is evaluated with no
# enclosing function frame (sys.nframe() == 0).
# NOTE(review): presumably this distinguishes a direct `Rscript` run from the
# file being loaded inside another call -- confirm for the loaders in use.
if (sys.nframe() == 0) {
  main()
}