244 lines
8.8 KiB
R
244 lines
8.8 KiB
R
# ============================================================================
|
||
# SCRIPT 21: Convert CI RDS to CSV (Python Compatibility Format)
|
||
# ============================================================================
|
||
# PURPOSE:
|
||
# Convert consolidated CI data from R's wide format (RDS) to Python-compatible
|
||
# long format (CSV). Prepares per-field CI time series for harvest detection
|
||
# models and Python ML workflows without requiring interpolated/modeled values.
|
||
#
|
||
# INPUT DATA:
|
||
# - Source: laravel_app/storage/app/{project}/combined_CI/combined_CI_data.rds
|
||
# - Format: RDS (wide format: fields × dates with CI values)
|
||
# - Requirement: Script 20 must have completed CI extraction
|
||
#
|
||
# OUTPUT DATA:
|
||
# - Destination: laravel_app/storage/app/{project}/Data/extracted_ci/cumulative_vals/
|
||
# - Format: CSV (long format)
|
||
# - Columns: field, sub_field, Date, FitData, DOY, value
|
||
#
|
||
# USAGE:
|
||
# Rscript 21_convert_ci_rds_to_csv.R [project]
|
||
#
|
||
# Example (Windows PowerShell):
|
||
# & "C:\Program Files\R\R-4.4.3\bin\x64\Rscript.exe" r_app/21_convert_ci_rds_to_csv.R angata
|
||
#
|
||
# PARAMETERS:
|
||
# - project: Project name (character) - angata, chemba, xinavane, esa, simba
|
||
#
|
||
# CLIENT TYPES:
|
||
# - cane_supply (ANGATA): Yes - data export
|
||
# - agronomic_support (AURA): Yes - Python ML integration
|
||
#
|
||
# DEPENDENCIES:
|
||
# - Packages: here, tidyverse, lubridate, zoo
|
||
# - Utils files: parameters_project.R
|
||
# - Input data: combined_CI_data.rds from Script 20
|
||
# - Data directories: extracted_ci/cumulative_vals/
|
||
#
|
||
# NOTES:
|
||
# - Transformation: Wide format (fields as rows, dates as columns) → Long format
|
||
# - Time series: Raw CI values are preserved in `value`; `FitData` holds the daily series with gaps filled by linear interpolation
|
||
# - DOY: Continuous day counter (1, 2, 3, ...) starting at each field/sub_field's first date, not the calendar day of year
|
||
# - Python integration: CSV format compatible with pandas/scikit-learn workflows
|
||
# - Used by: Python harvest detection models (harvest_date_prediction.py)
|
||
# - Optional: Run only when exporting to Python for ML model training
|
||
#
|
||
# RELATED ISSUES:
|
||
# SC-112: Utilities restructuring
|
||
# SC-108: Core pipeline improvements
|
||
#
|
||
# ============================================================================
|
||
|
||
suppressPackageStartupMessages({
|
||
# File path handling
|
||
library(here) # For relative path resolution (platform-independent file paths)
|
||
|
||
# Data manipulation
|
||
library(tidyverse) # For dplyr, readr (data wrangling and CSV I/O)
|
||
library(lubridate) # For date/time operations (DOY calculation)
|
||
library(zoo) # For zoo objects (gap filling, rolling operations)
|
||
})
|
||
|
||
# ============================================================================
|
||
# HELPER FUNCTIONS
|
||
# ============================================================================
|
||
|
||
#' Reshape wide CI data into one row per field, sub-field and date
#'
#' Every column other than `field` and `sub_field` is treated as a date
#' column: its name becomes the `Date` value and its cell the `FitData` value.
#' Cells that are missing, or that are NA after numeric coercion, are dropped.
#'
#' @param ci_data_wide Tibble with columns: field, sub_field, and dates as columns
#' @return Long format tibble: field, sub_field, Date, FitData
wide_to_long_ci_data <- function(ci_data_wide) {
  # Stack the date columns; missing cells are discarded during the pivot
  stacked <- pivot_longer(
    ci_data_wide,
    cols = -c(field, sub_field),
    names_to = "Date",
    values_to = "FitData",
    values_drop_na = TRUE
  )

  # Column names arrive as text, so convert them (and the values) to real types
  typed <- mutate(
    stacked,
    Date = as.Date(Date),
    FitData = as.numeric(FitData)
  )

  # as.numeric() can introduce new NAs, so filter once more after coercion
  filter(typed, !is.na(FitData))
}
|
||
|
||
#' Create daily interpolated sequences with a day counter for each field
#'
#' For each field/sub_field combination, builds a complete daily sequence from
#' the first to the last observed date, places the observed measurements on
#' their dates, and linearly interpolates the days in between.
#'
#' @param ci_data_long Long format tibble: field, sub_field, Date, FitData
#' @return Ungrouped tibble with: field, sub_field, Date, FitData (daily
#'   series after linear interpolation), DOY (continuous day counter starting
#'   at 1 on each group's first date, not the calendar day of year) and value
#'   (raw measurement, NA on days without an observation)
create_interpolated_daily_sequences <- function(ci_data_long) {
  # Nothing to expand: return an empty, correctly-typed table instead of
  # running the per-group pipeline on zero groups
  if (nrow(ci_data_long) == 0) {
    return(tibble(
      field = ci_data_long$field,
      sub_field = ci_data_long$sub_field,
      Date = as.Date(character()),
      FitData = numeric(),
      DOY = integer(),
      value = numeric()
    ))
  }

  # Expand one group's measurements to a gap-free daily table
  # (field/sub_field stay in the outer nested frame, not here)
  expand_to_daily <- function(df) {
    # Sort measurements by date
    df <- arrange(df, Date)

    # Complete daily sequence from first to last observed date
    date_seq <- seq(min(df$Date), max(df$Date), by = "day")

    # Place every measurement on its date in one vectorised assignment; when
    # a date occurs more than once the last measurement wins, as it did with
    # the row-by-row fill this replaces
    value <- rep(NA_real_, length(date_seq))
    slot <- match(df$Date, date_seq)
    found <- !is.na(slot)
    value[slot[found]] <- df$FitData[found]

    tibble(
      Date = date_seq,
      value = value,
      # Interpolate missing dates linearly; na.rm = FALSE keeps one row per day
      FitData = zoo::na.approx(value, na.rm = FALSE),
      DOY = seq_along(date_seq)  # Continuous day counter: 1, 2, 3, ...
    )
  }

  ci_data_long %>%
    group_by(field, sub_field) %>%
    nest() %>%
    # Drop the grouping so callers receive a plain tibble
    ungroup() %>%
    mutate(data = map(data, expand_to_daily)) %>%
    unnest(data) %>%
    select(field, sub_field, Date, FitData, DOY, value) %>%
    arrange(field, Date)
}
|
||
|
||
#' Print a validation summary of the converted CI data
#'
#' Reports the number of distinct fields, the row count, the date range, the
#' FitData range, and how many rows hold raw measurements versus values that
#' exist only in FitData (i.e. were interpolated).
#'
#' @param ci_data_python Tibble with converted CI data
#'   (columns read: field, Date, FitData, value)
#' @return Invisibly returns the tibble (for piping)
validate_conversion_output <- function(ci_data_python) {
  # Summary figures, computed in the order they are reported
  field_count <- n_distinct(ci_data_python$field)
  row_count <- nrow(ci_data_python)
  first_date <- min(ci_data_python$Date, na.rm = TRUE)
  last_date <- max(ci_data_python$Date, na.rm = TRUE)
  fit_low <- min(ci_data_python$FitData, na.rm = TRUE)
  fit_high <- max(ci_data_python$FitData, na.rm = TRUE)

  # A row is "raw" when it carries a measurement, and "interpolated" when it
  # has a FitData value but no measurement behind it
  is_raw <- !is.na(ci_data_python$value)
  is_filled <- !is_raw & !is.na(ci_data_python$FitData)

  cat("\nValidation:\n")
  cat(sprintf(" Unique fields: %d\n", field_count))
  cat(sprintf(" Total daily rows: %d\n", row_count))
  cat(sprintf(" Date range: %s to %s\n", first_date, last_date))
  cat(sprintf(" FitData range: %.2f to %.2f\n", fit_low, fit_high))
  cat(sprintf(" Raw measurements: %d\n", sum(is_raw)))
  cat(sprintf(" Interpolated values: %d\n", sum(is_filled)))

  invisible(ci_data_python)
}
|
||
|
||
#' Print the follow-up steps for the Python harvest detection workflow
#'
#' @return NULL, invisibly; called only for its console output
print_next_steps <- function() {
  # Each entry already carries its own line break, so emit them back to back
  guidance <- c(
    "\nNext steps for Python harvest detection:\n",
    " 1. Read this CSV file in Python\n",
    " 2. Group by field to identify seasons\n",
    " 3. Run LSTM model to detect harvest dates\n",
    " 4. Save predicted harvest dates to Excel\n",
    " 5. Use output in script 03 for interpolation\n"
  )
  cat(guidance, sep = "")
}
|
||
|
||
# ============================================================================
|
||
# MAIN FUNCTION
|
||
# ============================================================================
|
||
|
||
#' Convert a project's combined CI RDS file into a CSV for Python
#'
#' Resolves the project name (first command line argument, else an existing
#' global `project_dir`, else "angata"), sources parameters_project.R, reads
#' combined_CI_data.rds, reshapes it to long format, expands it to daily
#' interpolated sequences, prints a validation summary and writes
#' ci_data_for_python.csv.
#'
#' Side effects: assigns `project_dir` in the global environment, sources
#' parameters_project.R, prints progress to the console and writes the CSV.
#'
#' @return NULL, invisibly (the result of print_next_steps())
main <- function() {
  # Process command line arguments
  args <- commandArgs(trailingOnly = TRUE)

  # Get project directory
  if (length(args) >= 1 && !is.na(args[1])) {
    project_dir <- as.character(args[1])
  } else if (exists("project_dir", envir = .GlobalEnv)) {
    project_dir <- get("project_dir", envir = .GlobalEnv)
  } else {
    project_dir <- "angata"
  }

  # Make available globally
  assign("project_dir", project_dir, envir = .GlobalEnv)

  cat(sprintf("Converting CI RDS to CSV: project=%s\n", project_dir))

  # Initialize project configuration and centralized paths.
  # Try the working directory first, then the 'r_app' directory.
  tryCatch(
    {
      source("parameters_project.R")
    },
    error = function(first_error) {
      warning("Default parameters_project.R not found. Attempting from 'r_app' directory.")
      tryCatch(
        {
          source(here::here("r_app", "parameters_project.R"))
        },
        error = function(second_error) {
          # Keep the underlying cause so a syntax/runtime error inside the
          # sourced file is not mistaken for a missing file
          stop(
            "Failed to source parameters_project.R from both default and 'r_app' directories. ",
            "Last error: ", conditionMessage(second_error)
          )
        }
      )
    }
  )

  # Load centralized path structure (creates all directories automatically).
  # NOTE(review): setup_project_directories() is presumably defined by
  # parameters_project.R - confirm.
  paths <- setup_project_directories(project_dir)

  # Use centralized paths (no need for dir.create - already handled)
  ci_data_source_dir <- paths$cumulative_ci_vals_dir
  ci_data_output_dir <- paths$ci_for_python_dir

  input_file <- file.path(ci_data_source_dir, "combined_CI_data.rds")
  output_file <- file.path(ci_data_output_dir, "ci_data_for_python.csv")

  # Check if input file exists
  if (!file.exists(input_file)) {
    stop(paste("Input file not found:", input_file))
  }

  cat(sprintf("Loading: %s\n", input_file))

  # Load RDS file
  ci_data_wide <- readRDS(input_file) %>%
    as_tibble()

  cat(sprintf(" Loaded %d rows\n", nrow(ci_data_wide)))
  cat(sprintf(" Format: WIDE (field, sub_field, then dates as columns)\n"))
  # head() shows at most six names without padding with NA when the table
  # has fewer than six columns
  cat(sprintf(" Sample columns: %s\n", paste(head(names(ci_data_wide), 6), collapse = ", ")))

  # Step 1: Convert from WIDE to LONG format
  cat("\nStep 1: Converting from wide to long format...\n")
  ci_data_long <- wide_to_long_ci_data(ci_data_wide)

  # Without any measurement there is no date range to expand; fail clearly
  if (nrow(ci_data_long) == 0) {
    stop("No non-missing CI values found in: ", input_file)
  }

  # Step 2: Create complete daily sequences with interpolation
  cat("\nStep 2: Creating complete daily sequences with interpolation...\n")
  ci_data_python <- create_interpolated_daily_sequences(ci_data_long)

  # Step 3: Validate output
  cat("\nStep 3: Validating output...")
  validate_conversion_output(ci_data_python)

  # Step 4: Save to CSV
  cat(sprintf("\nStep 4: Saving to CSV...\n"))
  cat(sprintf(" Output: %s\n", output_file))
  write_csv(ci_data_python, output_file)

  cat(sprintf("\n✓ Successfully created CSV with %d rows\n", nrow(ci_data_python)))
  print_next_steps()
}
|
||
|
||
# Entry point: call main() only when this code is evaluated at top level
# (frame number 0), e.g. when the file is run with Rscript. When the code is
# evaluated from inside a function call the frame number is non-zero and
# main() is not invoked automatically.
if (sys.nframe() == 0L) {
  main()
}
|