261 lines
9.6 KiB
R
261 lines
9.6 KiB
R
# ============================================================================
|
|
# SCRIPT 21: Convert CI RDS to CSV (Python Compatibility Format)
|
|
# ============================================================================
|
|
# PURPOSE:
|
|
# Convert consolidated CI data from R's wide format (RDS) to Python-compatible
|
|
# long format (CSV). Prepares per-field CI time series for harvest detection
|
|
# models and Python ML workflows without requiring interpolated/modeled values.
|
|
#
|
|
# INPUT DATA:
|
|
# - Source: laravel_app/storage/app/{project}/Data/extracted_ci/cumulative_vals/All_pivots_Cumulative_CI_quadrant_year_v2.rds
|
|
# - Format: RDS (interpolated growth model data from Script 30)
|
|
# - Requirement: Script 30 must have completed growth model interpolation
|
|
#
|
|
# OUTPUT DATA:
|
|
# - Destination: ci_data_for_python.csv in paths$ci_for_python_dir (resolved by setup_project_directories())
|
|
# - Format: CSV (long format)
|
|
# - Columns: field, sub_field, Date, FitData, DAH, value
|
|
#
|
|
# USAGE:
|
|
# Rscript 21_convert_ci_rds_to_csv.R [project]
|
|
#
|
|
# Example (Windows PowerShell):
|
|
# & "C:\Program Files\R\R-4.4.3\bin\x64\Rscript.exe" r_app/21_convert_ci_rds_to_csv.R angata
|
|
#
|
|
# PARAMETERS:
|
|
# - project: Project name (character) - angata, chemba, xinavane, esa, simba
|
|
#
|
|
# CLIENT TYPES:
|
|
# - cane_supply (ANGATA): Yes - data export
|
|
# - agronomic_support (AURA): Yes - Python ML integration
|
|
#
|
|
# DEPENDENCIES:
|
|
# - Packages: tidyverse, lubridate, zoo
|
|
# - Utils files: parameters_project.R
|
|
# - Input data: All_pivots_Cumulative_CI_quadrant_year_v2.rds from Script 30 (growth model)
|
|
# - Data directories: extracted_ci/cumulative_vals/
|
|
#
|
|
# NOTES:
|
|
# - Data source: Uses interpolated CI data from Script 30 (growth model output)
|
|
# - Handles both wide format and long format inputs from growth model
|
|
# - DAH (Days After Harvest): When computed here, a day counter (1, 2, 3, ...) starting at each field/sub_field's first date; represents crop age in days
|
|
# - Python integration: CSV format compatible with pandas/scikit-learn workflows
|
|
# - Used by: Python harvest detection models (harvest_date_prediction.py)
|
|
# - Exports complete growth curves with interpolated values for ML training
|
|
#
|
|
# RELATED ISSUES:
|
|
# SC-112: Utilities restructuring
|
|
# SC-108: Core pipeline improvements
|
|
#
|
|
# ============================================================================
|
|
|
|
suppressPackageStartupMessages({
|
|
# File path handling
|
|
library(here) # For relative path resolution (platform-independent file paths)
|
|
|
|
# Data manipulation
|
|
library(tidyverse) # For dplyr, readr (data wrangling and CSV I/O)
|
|
library(lubridate) # For date/time operations (DOY calculation)
|
|
library(zoo) # For zoo objects (gap filling, rolling operations)
|
|
})
|
|
|
|
# ============================================================================
|
|
# HELPER FUNCTIONS
|
|
# ============================================================================
|
|
|
|
#' Reshape wide CI data (one column per date) into long format
#'
#' Every column other than `field` and `sub_field` is treated as a date column:
#' its name becomes the `Date` value (parsed with `as.Date()`) and its cells
#' become `FitData` (coerced with `as.numeric()`). Missing cells are dropped.
#'
#' @param ci_data_wide Tibble with columns `field`, `sub_field`, plus one
#'   column per date.
#' @return Long format tibble with columns: field, sub_field, Date, FitData.
wide_to_long_ci_data <- function(ci_data_wide) {
  # Stack all date columns; cells that are already NA are discarded here.
  stacked <- pivot_longer(
    ci_data_wide,
    cols = -c(field, sub_field),
    names_to = "Date",
    values_to = "FitData",
    values_drop_na = TRUE
  )

  # Column names arrive as character, values possibly as character too.
  typed <- mutate(
    stacked,
    Date = as.Date(Date),
    FitData = as.numeric(FitData)
  )

  # as.numeric() can introduce fresh NAs (non-numeric text), so filter again.
  filter(typed, !is.na(FitData))
}
|
|
|
|
#' Create daily interpolated sequences with DAH for each field
#'
#' For each field/sub_field combination, builds a complete daily date grid from
#' the first to the last observed date, places the observed `FitData` values on
#' that grid (column `value`), and linearly interpolates the gaps with
#' `zoo::na.approx()` (column `FitData`). `DAH` is a 1-based day counter along
#' the grid of each field/sub_field.
#'
#' Rows whose `Date` is `NA` cannot be placed on a date grid; they are dropped
#' with a warning instead of aborting the whole conversion. If several rows of
#' one field/sub_field share a date, the last one (after sorting by date) wins.
#'
#' @param ci_data_long Long format tibble: field, sub_field, Date, FitData
#' @return Ungrouped tibble with: field, sub_field, Date, FitData, DAH, value,
#'   sorted by field and Date. Zero input rows give a zero-row tibble with the
#'   same columns.
create_interpolated_daily_sequences <- function(ci_data_long) {
  # seq(min, max) fails on NA endpoints, so remove undated rows up front.
  undated <- is.na(ci_data_long$Date)
  if (any(undated)) {
    warning(
      sprintf(
        "Dropping %d row(s) with missing Date before interpolation.",
        sum(undated)
      ),
      call. = FALSE
    )
    ci_data_long <- filter(ci_data_long, !is.na(Date))
  }

  # With no rows, nest()/unnest() would produce a frame without the daily
  # columns and the final select() would fail; return a typed empty result.
  if (nrow(ci_data_long) == 0L) {
    return(tibble(
      field = ci_data_long$field[0],
      sub_field = ci_data_long$sub_field[0],
      Date = as.Date(character(0)),
      FitData = numeric(0),
      DAH = integer(0),
      value = numeric(0)
    ))
  }

  # Build the daily series for one field/sub_field.
  build_daily_series <- function(df) {
    df <- arrange(df, Date)

    # Complete daily grid from first to last observed date
    date_seq <- seq(min(df$Date), max(df$Date), by = "day")

    # Place observations on the grid in one vectorised step. Matching on the
    # underlying numeric day values mirrors an exact `Date == Date` test;
    # observations that do not fall exactly on a grid day are skipped.
    observed <- rep(NA_real_, length(date_seq))
    grid_pos <- match(as.numeric(df$Date), as.numeric(date_seq))
    on_grid <- !is.na(grid_pos)
    observed[grid_pos[on_grid]] <- df$FitData[on_grid]

    tibble(
      Date = date_seq,
      value = observed,
      # Linear interpolation between observations; na.rm = FALSE keeps the
      # vector the same length as the grid.
      FitData = zoo::na.approx(observed, na.rm = FALSE),
      DAH = seq_along(date_seq) # Continuous day counter: 1, 2, 3, ...
    )
  }

  ci_data_long %>%
    group_by(field, sub_field) %>%
    nest() %>%
    mutate(data = map(data, build_daily_series)) %>%
    unnest(data) %>%
    # Drop the grouping so callers receive a plain tibble.
    ungroup() %>%
    select(field, sub_field, Date, FitData, DAH, value) %>%
    arrange(field, Date)
}
|
|
|
|
#' Print a summary report of the converted CI data
#'
#' Writes to the console: number of distinct fields, row count, date range,
#' FitData range, the number of rows with a non-missing `value`, and the number
#' of rows where `value` is missing but `FitData` is not.
#'
#' @param ci_data_python Tibble with converted CI data (reads columns `field`,
#'   `Date`, `FitData`, `value`).
#' @return Invisibly returns the input tibble unchanged (for piping).
validate_conversion_output <- function(ci_data_python) {
  # Format-and-print helper; arguments are evaluated only when the line is
  # emitted, so the report is produced strictly top to bottom.
  emit <- function(fmt, ...) cat(sprintf(fmt, ...))

  emit("\nValidation:\n")
  emit(" Unique fields: %d\n", n_distinct(ci_data_python$field))
  emit(" Total daily rows: %d\n", nrow(ci_data_python))

  emit(
    " Date range: %s to %s\n",
    min(ci_data_python$Date, na.rm = TRUE),
    max(ci_data_python$Date, na.rm = TRUE)
  )

  emit(
    " FitData range: %.2f to %.2f\n",
    min(ci_data_python$FitData, na.rm = TRUE),
    max(ci_data_python$FitData, na.rm = TRUE)
  )

  # Rows carrying an original measurement
  emit(" Raw measurements: %d\n", sum(!is.na(ci_data_python$value)))

  # Rows with no measurement but a filled-in FitData
  emit(
    " Interpolated values: %d\n",
    sum(is.na(ci_data_python$value) & !is.na(ci_data_python$FitData))
  )

  invisible(ci_data_python)
}
|
|
|
|
#' Print the follow-up instructions for the Python harvest detection workflow
#'
#' Console output only; takes no arguments.
#'
#' @return `NULL`, invisibly (the value of `cat()`).
print_next_steps <- function() {
  guidance <- c(
    "\nNext steps for Python harvest detection:\n",
    " 1. Read this CSV file in Python\n",
    " 2. Group by field to identify seasons\n",
    " 3. Run LSTM model to detect harvest dates\n",
    " 4. Save predicted harvest dates to Excel\n",
    " 5. Use output in script 03 for interpolation\n"
  )

  # sep = "" because every entry already carries its own line break.
  cat(guidance, sep = "")
}
|
|
|
|
# ============================================================================
|
|
# MAIN FUNCTION
|
|
# ============================================================================
|
|
|
|
#' Convert the consolidated CI RDS file of one project into a CSV for Python
#'
#' Steps:
#'   1. Resolve the project name (first CLI argument, else an existing global
#'      `project_dir`, else "angata") and publish it as global `project_dir`.
#'   2. Source parameters_project.R (working directory first, then r_app/).
#'   3. Load All_pivots_Cumulative_CI_quadrant_year_v2.rds from
#'      `paths$cumulative_ci_vals_dir`.
#'   4. Bring the data into long format with a `Date` column of class Date.
#'   5. Use the data as-is when it already has a `value` column, otherwise
#'      build interpolated daily sequences.
#'   6. Write ci_data_for_python.csv into `paths$ci_for_python_dir`.
#'
#' @return `NULL`, invisibly (the value of `print_next_steps()`); called for
#'   its side effects (console output, global `project_dir`, CSV file).
main <- function() {
  # Process command line arguments
  args <- commandArgs(trailingOnly = TRUE)

  # Get project directory
  if (length(args) >= 1 && !is.na(args[1])) {
    project_dir <- as.character(args[1])
  } else if (exists("project_dir", envir = .GlobalEnv)) {
    project_dir <- get("project_dir", envir = .GlobalEnv)
  } else {
    project_dir <- "angata"
  }

  # Make available globally
  # NOTE(review): presumably parameters_project.R reads this global - confirm.
  assign("project_dir", project_dir, envir = .GlobalEnv)

  cat(sprintf("Converting CI RDS to CSV: project=%s\n", project_dir))

  # Initialize project configuration and centralized paths.
  # The first failure is kept so that, if the fallback fails too, the final
  # error reports both underlying causes instead of hiding them.
  primary_error <- tryCatch({
    source("parameters_project.R")
    NULL
  }, error = function(e) e)

  if (!is.null(primary_error)) {
    warning("Default parameters_project.R not found. Attempting from 'r_app' directory.")
    tryCatch({
      source(here::here("r_app", "parameters_project.R"))
    }, error = function(e) {
      stop(
        "Failed to source parameters_project.R from both default and 'r_app' directories.",
        "\n  default: ", conditionMessage(primary_error),
        "\n  r_app:   ", conditionMessage(e),
        call. = FALSE
      )
    })
  }

  # Load centralized path structure
  # NOTE(review): setup_project_directories() is not defined in this file;
  # it is expected to come from parameters_project.R and, per the original
  # comment, to create the directories itself - confirm.
  paths <- setup_project_directories(project_dir)

  ci_data_source_dir <- paths$cumulative_ci_vals_dir
  ci_data_output_dir <- paths$ci_for_python_dir

  # A missing entry would make file.path() return character(0) and fail
  # later with an unrelated-looking error, so check explicitly.
  if (is.null(ci_data_source_dir) || is.null(ci_data_output_dir)) {
    stop(
      "setup_project_directories() did not provide 'cumulative_ci_vals_dir' ",
      "and 'ci_for_python_dir'.",
      call. = FALSE
    )
  }

  # Input: interpolated growth model data from Script 30
  input_file <- file.path(ci_data_source_dir, "All_pivots_Cumulative_CI_quadrant_year_v2.rds")
  output_file <- file.path(ci_data_output_dir, "ci_data_for_python.csv")

  # Check if input file exists
  if (!file.exists(input_file)) {
    stop(paste("Input file not found:", input_file,
               "\nScript 30 (growth model) must be run before Script 21."))
  }

  cat(sprintf("Loading: %s\n", input_file))

  # Load RDS file (may be long or wide format; detected below)
  ci_data <- readRDS(input_file) %>%
    as_tibble()

  cat(sprintf(" Loaded %d rows\n", nrow(ci_data)))
  cat(sprintf(" Columns: %s\n", paste(names(ci_data), collapse = ", ")))

  # Check format and prepare for export:
  # a Date/date column means long format, otherwise convert from wide.
  if ("Date" %in% names(ci_data) || "date" %in% names(ci_data)) {
    cat(" Detected: LONG format (from growth model)\n")
    ci_data_long <- ci_data

    # A lowercase `date` column is accepted by the detection above, so
    # normalise it to `Date`, which every later step relies on.
    if (!"Date" %in% names(ci_data_long)) {
      ci_data_long <- rename(ci_data_long, Date = "date")
    }
  } else {
    cat(" Detected: WIDE format - converting to long...\n")
    ci_data_long <- wide_to_long_ci_data(ci_data)
  }

  # Step 1: Ensure the Date column is of class Date
  ci_data_long <- ci_data_long %>%
    mutate(
      Date = as.Date(Date)
    )

  # Step 2: If interpolated values already present, use them; otherwise
  # create interpolated sequences
  if ("value" %in% names(ci_data_long)) {
    cat("\nStep 2: Using interpolated values from growth model...\n")
    ci_data_python <- ci_data_long
  } else {
    cat("\nStep 2: Creating complete daily sequences with interpolation...\n")
    ci_data_python <- create_interpolated_daily_sequences(ci_data_long)
  }

  # NOTE(review): there is no "Step 3", and validate_conversion_output()
  # defined in this file is never called - possibly a dropped validation
  # step; confirm whether it should run here.

  # Step 4: Save to CSV
  cat(sprintf("\nStep 4: Saving to CSV...\n"))
  cat(sprintf(" Output: %s\n", output_file))
  write_csv(ci_data_python, output_file)

  cat(sprintf("\n✓ Successfully created CSV with %d rows\n", nrow(ci_data_python)))
  print_next_steps()
}
|
|
|
|
# Entry point: sys.nframe() is 0 only at the top level (e.g. under Rscript),
# so main() does not run when this file is loaded via source().
if (sys.nframe() == 0L) main()
|