Major Changes: - NEW: Scripts 09 & 10 for KPI calculation and enhanced reporting - NEW: Shell script wrappers (01-10) for easier execution - NEW: R packages flextable and officer for enhanced Word reports - NEW: DEPLOYMENT_README.md with complete deployment guide - RENAMED: Numbered R scripts (02, 03, 04) for clarity - REMOVED: Old package management scripts (using renv only) - UPDATED: Workflow now uses scripts 09->10 instead of 05 Files Changed: 90+ files New Packages: flextable, officer New Scripts: 09_run_calculate_kpis.sh, 10_run_kpi_report.sh Documentation: DEPLOYMENT_README.md, EMAIL_TO_ADMIN.txt See DEPLOYMENT_README.md for full deployment instructions.
240 lines
8.4 KiB
R
240 lines
8.4 KiB
R
# Combine ESA Yield Data from 5 tabs into Aura harvest format
|
|
# Script to create harvest.xlsx in ESA directory matching Aura structure
|
|
|
|
# Load required libraries
|
|
library(readxl)
|
|
library(writexl)
|
|
library(dplyr)
|
|
library(lubridate)
|
|
|
|
# Define file paths using absolute paths
|
|
base_path <- "C:/Users/timon/Resilience BV/4020 SCane ESA DEMO - Documenten/General/4020 SCDEMO Team/4020 TechnicalData/WP3/smartcane_v2/smartcane"
|
|
esa_file_path <- file.path(base_path, "laravel_app", "storage", "app", "esa", "Data", "esa_yield_data.xlsx")
|
|
output_file_path <- file.path(base_path, "laravel_app", "storage", "app", "esa", "Data", "harvest.xlsx")
|
|
|
|
# Check if ESA file exists
|
|
if (!file.exists(esa_file_path)) {
|
|
stop("ESA yield data file not found: ", esa_file_path)
|
|
}
|
|
|
|
# Get sheet names (should be: 2019-20, 2020-21, 2021-22, 2022-2023, 2023-24, 2024-25, etc.)
|
|
sheet_names <- excel_sheets(esa_file_path)
|
|
cat("Found sheets:", paste(sheet_names, collapse = ", "), "\n")
|
|
|
|
# Function to extract harvest year from sheet name
|
|
extract_year <- function(sheet_name) {
|
|
# Extract the second year from patterns like "2019-20" -> 2020
|
|
if (grepl("^\\d{4}-\\d{2}$", sheet_name)) {
|
|
# Format: 2019-20
|
|
year_part <- as.numeric(substr(sheet_name, 1, 4)) + 1
|
|
} else if (grepl("^\\d{4}-\\d{4}$", sheet_name)) {
|
|
# Format: 2022-2023
|
|
year_part <- as.numeric(substr(sheet_name, 6, 9))
|
|
} else {
|
|
# Fallback: try to extract first 4-digit number
|
|
year_match <- regmatches(sheet_name, regexpr("\\d{4}", sheet_name))
|
|
year_part <- if (length(year_match) > 0) as.numeric(year_match[1]) else NA
|
|
}
|
|
return(year_part)
|
|
}
|
|
|
|
# Initialize empty list to store data from all sheets
|
|
all_data <- list()
|
|
|
|
# Read data from each sheet
|
|
for (sheet in sheet_names) {
|
|
cat("Processing sheet:", sheet, "\n")
|
|
|
|
# Read the data
|
|
tryCatch({
|
|
data <- read_excel(esa_file_path, sheet = sheet)
|
|
|
|
# Add year column based on sheet name
|
|
data$harvest_year <- extract_year(sheet)
|
|
data$sheet_name <- sheet
|
|
|
|
# Store in list
|
|
all_data[[sheet]] <- data
|
|
|
|
cat(" - Loaded", nrow(data), "rows from sheet", sheet, "\n")
|
|
}, error = function(e) {
|
|
cat(" - Error reading sheet", sheet, ":", e$message, "\n")
|
|
})
|
|
}
|
|
|
|
# Combine all data
|
|
if (length(all_data) > 0) {
|
|
combined_data <- bind_rows(all_data)
|
|
cat("Combined data: ", nrow(combined_data), "total rows\n")
|
|
|
|
# Display column names to understand the structure
|
|
cat("Available columns:\n")
|
|
print(colnames(combined_data))
|
|
|
|
# Transform to SmartCane format
|
|
# Map ESA columns to SmartCane columns based on the sample data provided
|
|
harvest_data <- combined_data %>%
|
|
mutate(
|
|
# Convert dates using lubridate (original format is YYYY-MM-DD = ymd)
|
|
grow_start_date = ymd(Grow_Start),
|
|
harvest_date_date = ymd(Harvest_Date),
|
|
|
|
# Calculate age in weeks using lubridate
|
|
age = round(as.numeric(harvest_date_date - grow_start_date) / 7, 0),
|
|
|
|
# Format fields for output
|
|
field = Field,
|
|
sub_field = Field,
|
|
year = harvest_year,
|
|
season_start = grow_start_date, # Keep as Date object
|
|
season_end = harvest_date_date, # Keep as Date object
|
|
sub_area = NA, # Leave empty as requested - not actual area but section names
|
|
tonnage_ha = TCH
|
|
) %>%
|
|
select(field, sub_field, year, season_start, season_end, age, sub_area, tonnage_ha) %>%
|
|
arrange(field, year)
|
|
|
|
# Clean up incomplete future seasons that shouldn't exist
|
|
cat("\nCleaning up incomplete future seasons...\n")
|
|
|
|
before_cleanup <- nrow(harvest_data)
|
|
|
|
# For each field, find the last season with actual data (either completed or ongoing)
|
|
# Remove any future seasons beyond that
|
|
harvest_data <- harvest_data %>%
|
|
group_by(field, sub_field) %>%
|
|
arrange(year) %>%
|
|
mutate(
|
|
# Mark rows with actual data (has start date)
|
|
has_data = !is.na(season_start),
|
|
# Mark completely empty rows (both start and end are NA)
|
|
is_empty = is.na(season_start) & is.na(season_end)
|
|
) %>%
|
|
# For each field, find the maximum year with actual data
|
|
mutate(
|
|
max_data_year = ifelse(any(has_data), max(year[has_data], na.rm = TRUE), NA)
|
|
) %>%
|
|
# Keep only rows that:
|
|
# 1. Have actual data, OR
|
|
# 2. Are empty but within 1 year of the last data year (future season placeholder)
|
|
filter(
|
|
has_data |
|
|
(is_empty & !is.na(max_data_year) & year <= max_data_year + 1)
|
|
) %>%
|
|
# Clean up helper columns
|
|
select(-has_data, -is_empty, -max_data_year) %>%
|
|
ungroup() %>%
|
|
arrange(field, year)
|
|
|
|
after_cleanup <- nrow(harvest_data)
|
|
|
|
if (before_cleanup != after_cleanup) {
|
|
cat("Removed", before_cleanup - after_cleanup, "incomplete future season rows\n")
|
|
}
|
|
|
|
# Create next season rows for fields that have completed seasons
|
|
cat("\nCreating next season rows for completed fields...\n")
|
|
|
|
# For each field, find the latest completed season (has both start and end dates)
|
|
completed_seasons <- harvest_data %>%
|
|
filter(!is.na(season_start) & !is.na(season_end)) %>%
|
|
group_by(field, sub_field) %>%
|
|
arrange(desc(year)) %>%
|
|
slice(1) %>% # Get the most recent completed season for each field
|
|
ungroup() %>%
|
|
select(field, sub_field, year, season_end)
|
|
|
|
cat("Found", nrow(completed_seasons), "fields with completed seasons\n")
|
|
|
|
# For each completed season, check if there's already a next season row
|
|
next_season_rows <- list()
|
|
|
|
for (i in 1:nrow(completed_seasons)) {
|
|
field_name <- completed_seasons$field[i]
|
|
sub_field_name <- completed_seasons$sub_field[i]
|
|
last_completed_year <- completed_seasons$year[i]
|
|
last_harvest_date <- completed_seasons$season_end[i]
|
|
next_year <- last_completed_year + 1
|
|
|
|
# Check if next season already exists for this field
|
|
next_season_exists <- harvest_data %>%
|
|
filter(field == field_name, sub_field == sub_field_name, year == next_year) %>%
|
|
nrow() > 0
|
|
|
|
if (!next_season_exists) {
|
|
# Create next season row
|
|
next_season_row <- data.frame(
|
|
field = field_name,
|
|
sub_field = sub_field_name,
|
|
year = next_year,
|
|
season_start = as.Date(last_harvest_date) + 1, # Previous harvest + 1 day
|
|
season_end = as.Date(NA), # Not harvested yet
|
|
age = NA,
|
|
sub_area = NA,
|
|
tonnage_ha = NA,
|
|
stringsAsFactors = FALSE
|
|
)
|
|
|
|
next_season_rows[[paste(field_name, sub_field_name, next_year, sep = "_")]] <- next_season_row
|
|
cat("Creating", next_year, "season for field", field_name, "starting", format(as.Date(last_harvest_date) + 1, "%Y-%m-%d"), "\n")
|
|
} else {
|
|
cat("Next season", next_year, "already exists for field", field_name, "\n")
|
|
}
|
|
}
|
|
|
|
# Combine all next season rows and add to harvest_data
|
|
if (length(next_season_rows) > 0) {
|
|
next_season_data <- bind_rows(next_season_rows)
|
|
harvest_data <- bind_rows(harvest_data, next_season_data) %>%
|
|
arrange(field, year)
|
|
|
|
cat("Added", nrow(next_season_data), "new season rows\n")
|
|
} else {
|
|
cat("No new season rows needed\n")
|
|
}
|
|
|
|
# Display preview of final transformed data
|
|
cat("\nPreview of final transformed data (including next season):\n")
|
|
print(head(harvest_data, 15)) # Show more rows to see next season data
|
|
|
|
# Remove duplicates based on field, sub_field, year combination
|
|
cat("\nRemoving duplicate entries...\n")
|
|
before_dedup <- nrow(harvest_data)
|
|
|
|
harvest_data <- harvest_data %>%
|
|
distinct(field, sub_field, year, .keep_all = TRUE)
|
|
|
|
after_dedup <- nrow(harvest_data)
|
|
duplicates_removed <- before_dedup - after_dedup
|
|
|
|
cat("Removed", duplicates_removed, "duplicate entries\n")
|
|
cat("Final data has", after_dedup, "unique records\n")
|
|
|
|
# Remove rows with NA season_start to prevent age calculation issues in reports
|
|
cat("\nRemoving rows with NA season_start...\n")
|
|
before_na_removal <- nrow(harvest_data)
|
|
|
|
harvest_data <- harvest_data %>%
|
|
filter(!is.na(season_start))
|
|
|
|
after_na_removal <- nrow(harvest_data)
|
|
na_removed <- before_na_removal - after_na_removal
|
|
|
|
cat("Removed", na_removed, "rows with NA season_start\n")
|
|
cat("Final data has", after_na_removal, "valid records\n")
|
|
|
|
# Save to Excel file
|
|
tryCatch({
|
|
write_xlsx(harvest_data, output_file_path)
|
|
cat("\nSuccessfully saved harvest data to:", output_file_path, "\n")
|
|
cat("Total rows saved:", nrow(harvest_data), "\n")
|
|
}, error = function(e) {
|
|
cat("Error saving file:", e$message, "\n")
|
|
})
|
|
|
|
} else {
|
|
cat("No data was successfully loaded from any sheet.\n")
|
|
}
|
|
|
|
cat("\nScript completed.\n")
|