SmartCane/r_app/experiments/combine_esa_yield_data.R
Timon d5fd4bb463 Add KPI reporting system and deployment documentation
Major Changes:
- NEW: Scripts 09 & 10 for KPI calculation and enhanced reporting
- NEW: Shell script wrappers (01-10) for easier execution
- NEW: R packages flextable and officer for enhanced Word reports
- NEW: DEPLOYMENT_README.md with complete deployment guide
- RENAMED: Numbered R scripts (02, 03, 04) for clarity
- REMOVED: Old package management scripts (using renv only)
- UPDATED: Workflow now uses scripts 09->10 instead of 05

Files Changed: 90+ files
New Packages: flextable, officer
New Scripts: 09_run_calculate_kpis.sh, 10_run_kpi_report.sh
Documentation: DEPLOYMENT_README.md, EMAIL_TO_ADMIN.txt

See DEPLOYMENT_README.md for full deployment instructions.
2025-10-14 11:49:30 +02:00

240 lines
8.4 KiB
R

# Combine ESA Yield Data from 5 tabs into Aura harvest format
# Script to create harvest.xlsx in ESA directory matching Aura structure
# Load required libraries
library(readxl)
library(writexl)
library(dplyr)
library(lubridate)
# Define file paths using absolute paths
base_path <- "C:/Users/timon/Resilience BV/4020 SCane ESA DEMO - Documenten/General/4020 SCDEMO Team/4020 TechnicalData/WP3/smartcane_v2/smartcane"
esa_file_path <- file.path(base_path, "laravel_app", "storage", "app", "esa", "Data", "esa_yield_data.xlsx")
output_file_path <- file.path(base_path, "laravel_app", "storage", "app", "esa", "Data", "harvest.xlsx")
# Check if ESA file exists
if (!file.exists(esa_file_path)) {
stop("ESA yield data file not found: ", esa_file_path)
}
# Get sheet names (should be: 2019-20, 2020-21, 2021-22, 2022-2023, 2023-24, 2024-25, etc.)
sheet_names <- excel_sheets(esa_file_path)
cat("Found sheets:", paste(sheet_names, collapse = ", "), "\n")
# Function to extract harvest year from sheet name
extract_year <- function(sheet_name) {
# Extract the second year from patterns like "2019-20" -> 2020
if (grepl("^\\d{4}-\\d{2}$", sheet_name)) {
# Format: 2019-20
year_part <- as.numeric(substr(sheet_name, 1, 4)) + 1
} else if (grepl("^\\d{4}-\\d{4}$", sheet_name)) {
# Format: 2022-2023
year_part <- as.numeric(substr(sheet_name, 6, 9))
} else {
# Fallback: try to extract first 4-digit number
year_match <- regmatches(sheet_name, regexpr("\\d{4}", sheet_name))
year_part <- if (length(year_match) > 0) as.numeric(year_match[1]) else NA
}
return(year_part)
}
# Initialize empty list to store data from all sheets
all_data <- list()
# Read data from each sheet
for (sheet in sheet_names) {
cat("Processing sheet:", sheet, "\n")
# Read the data
tryCatch({
data <- read_excel(esa_file_path, sheet = sheet)
# Add year column based on sheet name
data$harvest_year <- extract_year(sheet)
data$sheet_name <- sheet
# Store in list
all_data[[sheet]] <- data
cat(" - Loaded", nrow(data), "rows from sheet", sheet, "\n")
}, error = function(e) {
cat(" - Error reading sheet", sheet, ":", e$message, "\n")
})
}
# Combine all data
if (length(all_data) > 0) {
combined_data <- bind_rows(all_data)
cat("Combined data: ", nrow(combined_data), "total rows\n")
# Display column names to understand the structure
cat("Available columns:\n")
print(colnames(combined_data))
# Transform to SmartCane format
# Map ESA columns to SmartCane columns based on the sample data provided
harvest_data <- combined_data %>%
mutate(
# Convert dates using lubridate (original format is YYYY-MM-DD = ymd)
grow_start_date = ymd(Grow_Start),
harvest_date_date = ymd(Harvest_Date),
# Calculate age in weeks using lubridate
age = round(as.numeric(harvest_date_date - grow_start_date) / 7, 0),
# Format fields for output
field = Field,
sub_field = Field,
year = harvest_year,
season_start = grow_start_date, # Keep as Date object
season_end = harvest_date_date, # Keep as Date object
sub_area = NA, # Leave empty as requested - not actual area but section names
tonnage_ha = TCH
) %>%
select(field, sub_field, year, season_start, season_end, age, sub_area, tonnage_ha) %>%
arrange(field, year)
# Clean up incomplete future seasons that shouldn't exist
cat("\nCleaning up incomplete future seasons...\n")
before_cleanup <- nrow(harvest_data)
# For each field, find the last season with actual data (either completed or ongoing)
# Remove any future seasons beyond that
harvest_data <- harvest_data %>%
group_by(field, sub_field) %>%
arrange(year) %>%
mutate(
# Mark rows with actual data (has start date)
has_data = !is.na(season_start),
# Mark completely empty rows (both start and end are NA)
is_empty = is.na(season_start) & is.na(season_end)
) %>%
# For each field, find the maximum year with actual data
mutate(
max_data_year = ifelse(any(has_data), max(year[has_data], na.rm = TRUE), NA)
) %>%
# Keep only rows that:
# 1. Have actual data, OR
# 2. Are empty but within 1 year of the last data year (future season placeholder)
filter(
has_data |
(is_empty & !is.na(max_data_year) & year <= max_data_year + 1)
) %>%
# Clean up helper columns
select(-has_data, -is_empty, -max_data_year) %>%
ungroup() %>%
arrange(field, year)
after_cleanup <- nrow(harvest_data)
if (before_cleanup != after_cleanup) {
cat("Removed", before_cleanup - after_cleanup, "incomplete future season rows\n")
}
# Create next season rows for fields that have completed seasons
cat("\nCreating next season rows for completed fields...\n")
# For each field, find the latest completed season (has both start and end dates)
completed_seasons <- harvest_data %>%
filter(!is.na(season_start) & !is.na(season_end)) %>%
group_by(field, sub_field) %>%
arrange(desc(year)) %>%
slice(1) %>% # Get the most recent completed season for each field
ungroup() %>%
select(field, sub_field, year, season_end)
cat("Found", nrow(completed_seasons), "fields with completed seasons\n")
# For each completed season, check if there's already a next season row
next_season_rows <- list()
for (i in 1:nrow(completed_seasons)) {
field_name <- completed_seasons$field[i]
sub_field_name <- completed_seasons$sub_field[i]
last_completed_year <- completed_seasons$year[i]
last_harvest_date <- completed_seasons$season_end[i]
next_year <- last_completed_year + 1
# Check if next season already exists for this field
next_season_exists <- harvest_data %>%
filter(field == field_name, sub_field == sub_field_name, year == next_year) %>%
nrow() > 0
if (!next_season_exists) {
# Create next season row
next_season_row <- data.frame(
field = field_name,
sub_field = sub_field_name,
year = next_year,
season_start = as.Date(last_harvest_date) + 1, # Previous harvest + 1 day
season_end = as.Date(NA), # Not harvested yet
age = NA,
sub_area = NA,
tonnage_ha = NA,
stringsAsFactors = FALSE
)
next_season_rows[[paste(field_name, sub_field_name, next_year, sep = "_")]] <- next_season_row
cat("Creating", next_year, "season for field", field_name, "starting", format(as.Date(last_harvest_date) + 1, "%Y-%m-%d"), "\n")
} else {
cat("Next season", next_year, "already exists for field", field_name, "\n")
}
}
# Combine all next season rows and add to harvest_data
if (length(next_season_rows) > 0) {
next_season_data <- bind_rows(next_season_rows)
harvest_data <- bind_rows(harvest_data, next_season_data) %>%
arrange(field, year)
cat("Added", nrow(next_season_data), "new season rows\n")
} else {
cat("No new season rows needed\n")
}
# Display preview of final transformed data
cat("\nPreview of final transformed data (including next season):\n")
print(head(harvest_data, 15)) # Show more rows to see next season data
# Remove duplicates based on field, sub_field, year combination
cat("\nRemoving duplicate entries...\n")
before_dedup <- nrow(harvest_data)
harvest_data <- harvest_data %>%
distinct(field, sub_field, year, .keep_all = TRUE)
after_dedup <- nrow(harvest_data)
duplicates_removed <- before_dedup - after_dedup
cat("Removed", duplicates_removed, "duplicate entries\n")
cat("Final data has", after_dedup, "unique records\n")
# Remove rows with NA season_start to prevent age calculation issues in reports
cat("\nRemoving rows with NA season_start...\n")
before_na_removal <- nrow(harvest_data)
harvest_data <- harvest_data %>%
filter(!is.na(season_start))
after_na_removal <- nrow(harvest_data)
na_removed <- before_na_removal - after_na_removal
cat("Removed", na_removed, "rows with NA season_start\n")
cat("Final data has", after_na_removal, "valid records\n")
# Save to Excel file
tryCatch({
write_xlsx(harvest_data, output_file_path)
cat("\nSuccessfully saved harvest data to:", output_file_path, "\n")
cat("Total rows saved:", nrow(harvest_data), "\n")
}, error = function(e) {
cat("Error saving file:", e$message, "\n")
})
} else {
cat("No data was successfully loaded from any sheet.\n")
}
cat("\nScript completed.\n")