204 lines
7.1 KiB
R
204 lines
7.1 KiB
R
# 02b_CONVERT_CI_RDS_TO_CSV.R
# ============================
# Convert combined_CI_data.rds (output of script 02) to CSV format for Python.
#
# This script runs AFTER script 02 (CI extraction) and creates a CSV that Python
# can use for harvest date detection WITHOUT requiring the 'model' column (which
# comes from script 03, after interpolation and once harvest dates are known).
#
# Usage: Rscript 02b_convert_ci_rds_to_csv.R [project_dir]
#   - project_dir: Project directory name (e.g., "esa", "chemba", "angata").
#     Falls back to a global `project_dir` if one exists, otherwise "angata".
#
# Output: CSV file at laravel_app/storage/app/{project_dir}/Data/extracted_ci/cumulative_vals/ci_data_for_python.csv
#   Columns: field, sub_field, Date, FitData, DOY, value
#     - FitData: daily CI, linearly interpolated between measurements
#     - DOY:     running day counter per field/sub_field (1 on the first date),
#                not the calendar day of year
#     - value:   raw measurement (NA on days that were interpolated)
#
|
|
|
|
suppressPackageStartupMessages({
|
|
library(tidyverse)
|
|
library(lubridate)
|
|
library(zoo)
|
|
library(here)
|
|
})
|
|
|
|
# ============================================================================
|
|
# HELPER FUNCTIONS
|
|
# ============================================================================
|
|
|
|
#' Reshape the wide CI table into one row per field / sub_field / date
#'
#' Every column other than `field` and `sub_field` is treated as a date
#' column: its name becomes `Date` and its cell becomes `FitData`. Cells that
#' are `NA`, or that turn into `NA` when coerced to numeric, are dropped.
#'
#' @param ci_data_wide Tibble with columns: field, sub_field, and one column
#'   per date (the column name is parsed with `as.Date()`).
#' @return Long format tibble: field, sub_field, Date, FitData
wide_to_long_ci_data <- function(ci_data_wide) {
  # Stack all date columns; missing cells are discarded during the pivot.
  stacked <- tidyr::pivot_longer(
    ci_data_wide,
    cols = -c(field, sub_field),
    names_to = "Date",
    values_to = "FitData",
    values_drop_na = TRUE
  )

  # Column names arrive as text, so convert them to real dates.
  typed <- dplyr::mutate(
    stacked,
    Date = as.Date(Date),
    FitData = as.numeric(FitData)
  )

  # Numeric coercion can introduce new NAs; remove those rows as well.
  dplyr::filter(typed, !is.na(FitData))
}
|
|
|
|
#' Create daily interpolated sequences with a day counter for each field
#'
#' For each field/sub_field combination, builds a complete daily sequence from
#' the first to the last measurement date, places the measurements on their
#' dates, and linearly interpolates the days in between.
#'
#' @param ci_data_long Long format tibble: field, sub_field, Date, FitData
#' @return Ungrouped tibble with: field, sub_field, Date, FitData, DOY, value.
#'   `value` holds the raw measurement (NA on days without one), `FitData` the
#'   interpolated series, and `DOY` a running day counter (1, 2, 3, ...) that
#'   restarts for every field/sub_field. Rows are sorted by field, then Date.
create_interpolated_daily_sequences <- function(ci_data_long) {
  # Nothing to interpolate: return an empty table that still has the expected
  # columns, rather than relying on how unnest() treats an empty list-column.
  if (nrow(ci_data_long) == 0) {
    return(tibble::tibble(
      field = ci_data_long[["field"]],
      sub_field = ci_data_long[["sub_field"]],
      Date = as.Date(character(0)),
      FitData = numeric(0),
      DOY = integer(0),
      value = numeric(0)
    ))
  }

  # Expand one group's measurements (columns Date, FitData) to daily rows.
  build_daily <- function(df) {
    date_seq <- seq(min(df$Date), max(df$Date), by = "day")

    # Drop each measurement onto its day with a single vectorised lookup.
    # If a date occurs more than once, the last occurrence wins.
    raw <- rep(NA_real_, length(date_seq))
    raw[match(df$Date, date_seq)] <- df$FitData

    tibble::tibble(
      Date = date_seq,
      value = raw,
      # na.rm = FALSE keeps the output the same length as the daily sequence.
      FitData = zoo::na.approx(raw, na.rm = FALSE),
      DOY = seq_along(date_seq) # Continuous day counter: 1, 2, 3, ...
    )
  }

  # field/sub_field stay in the outer table; only Date/FitData are nested.
  nested <- tidyr::nest(dplyr::group_by(ci_data_long, field, sub_field))
  nested <- dplyr::mutate(nested, data = purrr::map(data, build_daily))

  daily <- tidyr::unnest(nested, cols = data)
  # Drop the grouping so callers do not inherit grouped semantics by accident.
  daily <- dplyr::ungroup(daily)
  daily <- dplyr::select(daily, field, sub_field, Date, FitData, DOY, value)

  dplyr::arrange(daily, field, Date)
}
|
|
|
|
#' Print a summary of the converted CI table
#'
#' Writes to stdout the number of distinct fields, the row count, the Date and
#' FitData ranges, and how many rows are raw measurements (`value` present)
#' versus interpolated (`value` missing but `FitData` present).
#'
#' @param ci_data_python Tibble with converted CI data
#'   (columns used: field, Date, FitData, value)
#' @return Invisibly returns the tibble (for piping)
validate_conversion_output <- function(ci_data_python) {
  # Emit one formatted line at a time so output order matches evaluation order.
  report <- function(template, ...) cat(sprintf(template, ...))

  measured <- !is.na(ci_data_python$value)
  filled_in <- !measured & !is.na(ci_data_python$FitData)

  report("\nValidation:\n")
  report(" Unique fields: %d\n", length(unique(ci_data_python$field)))
  report(" Total daily rows: %d\n", nrow(ci_data_python))
  report(
    " Date range: %s to %s\n",
    min(ci_data_python$Date, na.rm = TRUE),
    max(ci_data_python$Date, na.rm = TRUE)
  )
  report(
    " FitData range: %.2f to %.2f\n",
    min(ci_data_python$FitData, na.rm = TRUE),
    max(ci_data_python$FitData, na.rm = TRUE)
  )
  report(" Raw measurements: %d\n", sum(measured))
  report(" Interpolated values: %d\n", sum(filled_in))

  invisible(ci_data_python)
}
|
|
|
|
#' Print the follow-up instructions for the Python harvest-detection stage
#'
#' Writes a fixed, numbered checklist to stdout. Takes no arguments.
print_next_steps <- function() {
  checklist <- c(
    " 1. Read this CSV file in Python",
    " 2. Group by field to identify seasons",
    " 3. Run LSTM model to detect harvest dates",
    " 4. Save predicted harvest dates to Excel",
    " 5. Use output in script 03 for interpolation"
  )

  cat("\nNext steps for Python harvest detection:\n")
  cat(paste0(checklist, "\n"), sep = "")
}
|
|
|
|
# ============================================================================
|
|
# MAIN FUNCTION
|
|
# ============================================================================
|
|
|
|
#' Script entry point: convert combined_CI_data.rds to ci_data_for_python.csv
#'
#' Resolves the project directory (first command-line argument, else a global
#' `project_dir`, else "angata"), sources parameters_project.R, loads the wide
#' CI table, reshapes and interpolates it, prints a validation summary and
#' writes the CSV.
#'
#' Side effects: assigns `project_dir` in the global environment, sources
#' parameters_project.R (into the global environment, the `source()` default),
#' prints progress to stdout and writes the output CSV. Stops with an error if
#' parameters_project.R cannot be sourced, the expected paths are missing, or
#' the input RDS does not exist.
main <- function() {
  # Process command line arguments
  args <- commandArgs(trailingOnly = TRUE)

  # Get project directory: CLI argument > existing global > default
  if (length(args) >= 1 && !is.na(args[1])) {
    project_dir <- as.character(args[1])
  } else if (exists("project_dir", envir = .GlobalEnv)) {
    project_dir <- get("project_dir", envir = .GlobalEnv)
  } else {
    project_dir <- "angata"
  }

  # Make available globally
  # NOTE(review): presumably parameters_project.R reads this global - confirm.
  assign("project_dir", project_dir, envir = .GlobalEnv)

  cat(sprintf("Converting CI RDS to CSV: project=%s\n", project_dir))

  # Initialize project configuration and centralized paths. Try the working
  # directory first, then the 'r_app' directory. The underlying error text is
  # kept in both messages so a broken (rather than missing) file is visible.
  tryCatch(
    source("parameters_project.R"),
    error = function(e) {
      warning(
        "Could not source default parameters_project.R (",
        conditionMessage(e),
        "). Attempting from 'r_app' directory.",
        call. = FALSE
      )
      tryCatch(
        source(here::here("r_app", "parameters_project.R")),
        error = function(e2) {
          stop(
            "Failed to source parameters_project.R from both default and ",
            "'r_app' directories: ", conditionMessage(e2),
            call. = FALSE
          )
        }
      )
    }
  )

  # Load centralized path structure (creates all directories automatically)
  paths <- setup_project_directories(project_dir)

  # Use centralized paths (no need for dir.create - already handled)
  ci_data_source_dir <- paths$cumulative_ci_vals_dir
  ci_data_output_dir <- paths$ci_for_python_dir

  # A missing entry would otherwise surface later as an obscure zero-length
  # error from file.path()/file.exists().
  if (is.null(ci_data_source_dir) || is.null(ci_data_output_dir)) {
    stop(
      "setup_project_directories() did not return 'cumulative_ci_vals_dir' ",
      "and 'ci_for_python_dir'.",
      call. = FALSE
    )
  }

  input_file <- file.path(ci_data_source_dir, "combined_CI_data.rds")
  output_file <- file.path(ci_data_output_dir, "ci_data_for_python.csv")

  # Check if input file exists
  if (!file.exists(input_file)) {
    stop(paste("Input file not found:", input_file))
  }

  cat(sprintf("Loading: %s\n", input_file))

  # Load RDS file
  ci_data_wide <- tibble::as_tibble(readRDS(input_file))

  cat(sprintf(" Loaded %d rows\n", nrow(ci_data_wide)))
  cat(" Format: WIDE (field, sub_field, then dates as columns)\n")
  # head() avoids printing "NA" placeholders when there are fewer than 6 columns.
  cat(sprintf(
    " Sample columns: %s\n",
    paste(head(names(ci_data_wide), 6), collapse = ", ")
  ))

  # Step 1: Convert from WIDE to LONG format
  cat("\nStep 1: Converting from wide to long format...\n")
  ci_data_long <- wide_to_long_ci_data(ci_data_wide)

  # Step 2: Create complete daily sequences with interpolation
  cat("\nStep 2: Creating complete daily sequences with interpolation...\n")
  ci_data_python <- create_interpolated_daily_sequences(ci_data_long)

  # Step 3: Validate output (the validator starts its own output with a newline)
  cat("\nStep 3: Validating output...")
  validate_conversion_output(ci_data_python)

  # Step 4: Save to CSV
  cat("\nStep 4: Saving to CSV...\n")
  cat(sprintf(" Output: %s\n", output_file))
  readr::write_csv(ci_data_python, output_file)

  cat(sprintf("\n✓ Successfully created CSV with %d rows\n", nrow(ci_data_python)))
  print_next_steps()
}
|
|
|
|
# Run main() only when this file is executed at top level (e.g. via Rscript).
# Inside source() or any other call the frame count is non-zero, so the
# helper functions can be loaded without triggering the conversion.
if (sys.nframe() == 0L) {
  main()
}
|