# Combine ESA Yield Data from 5 tabs into Aura harvest format
# Script to create harvest.xlsx in ESA directory matching Aura structure
#
# Steps:
#   1. Read every sheet of esa_yield_data.xlsx and tag rows with a harvest year
#      derived from the sheet name.
#   2. Map the ESA columns (Field, Grow_Start, Harvest_Date, TCH) onto the
#      output columns (field, sub_field, year, season_start, season_end, age,
#      sub_area, tonnage_ha).
#   3. Drop placeholder rows for seasons beyond the last season with data.
#   4. Append a "next season" row for each field whose latest completed season
#      has no successor yet.
#   5. De-duplicate, drop rows without a season start, and write harvest.xlsx.

# Load required libraries ----
library(readxl)
library(writexl)
library(dplyr)
library(lubridate)

# Define file paths using absolute paths ----
base_path <- "C:/Users/timon/Resilience BV/4020 SCane ESA DEMO - Documenten/General/4020 SCDEMO Team/4020 TechnicalData/WP3/smartcane_v2/smartcane"
esa_file_path <- file.path(
  base_path, "laravel_app", "storage", "app", "esa", "Data",
  "esa_yield_data.xlsx"
)
output_file_path <- file.path(
  base_path, "laravel_app", "storage", "app", "esa", "Data",
  "harvest.xlsx"
)

# Check if ESA file exists
if (!file.exists(esa_file_path)) {
  stop("ESA yield data file not found: ", esa_file_path)
}

# Get sheet names
# (should be: 2019-20, 2020-21, 2021-22, 2022-2023, 2023-24, 2024-25, etc.)
sheet_names <- excel_sheets(esa_file_path)
cat("Found sheets:", paste(sheet_names, collapse = ", "), "\n")

# Extract the harvest year from a sheet name.
#
# sheet_name: a single sheet name (character scalar).
# Returns a numeric year:
#   - "2019-20"   -> first year + 1 (2020)
#   - "2022-2023" -> the second four-digit year (2023)
#   - otherwise   -> the first four-digit number found, or NA if there is none
extract_year <- function(sheet_name) {
  if (grepl("^\\d{4}-\\d{2}$", sheet_name)) {
    # Format: 2019-20
    as.numeric(substr(sheet_name, 1, 4)) + 1
  } else if (grepl("^\\d{4}-\\d{4}$", sheet_name)) {
    # Format: 2022-2023
    as.numeric(substr(sheet_name, 6, 9))
  } else {
    # Fallback: try to extract first 4-digit number
    year_match <- regmatches(sheet_name, regexpr("\\d{4}", sheet_name))
    if (length(year_match) > 0) as.numeric(year_match[1]) else NA
  }
}

# Read data from each sheet ----
# A sheet that fails to load is reported and skipped so that the remaining
# sheets can still be processed.
all_data <- list()

for (sheet in sheet_names) {
  cat("Processing sheet:", sheet, "\n")

  tryCatch({
    sheet_data <- read_excel(esa_file_path, sheet = sheet)

    # Add year column based on sheet name
    sheet_data$harvest_year <- extract_year(sheet)
    sheet_data$sheet_name <- sheet

    all_data[[sheet]] <- sheet_data
    cat(" - Loaded", nrow(sheet_data), "rows from sheet", sheet, "\n")
  }, error = function(e) {
    cat(" - Error reading sheet", sheet, ":", conditionMessage(e), "\n")
  })
}

# Combine all data ----
if (length(all_data) > 0) {
  combined_data <- bind_rows(all_data)
  cat("Combined data: ", nrow(combined_data), "total rows\n")

  # Display column names to understand the structure
  cat("Available columns:\n")
  print(colnames(combined_data))

  # Transform to SmartCane format ----
  # Map ESA columns to SmartCane columns based on the sample data provided
  harvest_data <- combined_data %>%
    mutate(
      # Convert dates using lubridate (original format is YYYY-MM-DD = ymd)
      grow_start_date = ymd(Grow_Start),
      harvest_date_date = ymd(Harvest_Date),
      # Age in weeks: day difference between the two dates divided by 7
      age = round(as.numeric(harvest_date_date - grow_start_date) / 7, 0),
      # Format fields for output
      field = Field,
      sub_field = Field,
      year = harvest_year,
      season_start = grow_start_date, # Keep as Date object
      season_end = harvest_date_date, # Keep as Date object
      # Leave empty as requested - not actual area but section names
      sub_area = NA,
      tonnage_ha = TCH
    ) %>%
    select(
      field, sub_field, year, season_start, season_end, age, sub_area,
      tonnage_ha
    ) %>%
    arrange(field, year)

  # Clean up incomplete future seasons that shouldn't exist ----
  cat("\nCleaning up incomplete future seasons...\n")
  before_cleanup <- nrow(harvest_data)

  # For each field, find the last season with actual data (either completed
  # or ongoing) and remove any future seasons beyond that.
  harvest_data <- harvest_data %>%
    group_by(field, sub_field) %>%
    arrange(year) %>%
    mutate(
      # Mark rows with actual data (has start date)
      has_data = !is.na(season_start),
      # Mark completely empty rows (both start and end are NA)
      is_empty = is.na(season_start) & is.na(season_end)
    ) %>%
    mutate(
      # Maximum year with actual data in this group. Scalar if/else keeps the
      # column type numeric in every group and avoids calling max() on an
      # empty set when the group has no dated rows (or only NA years).
      max_data_year = if (any(has_data & !is.na(year))) {
        max(year[has_data], na.rm = TRUE)
      } else {
        NA_real_
      }
    ) %>%
    # Keep only rows that:
    #   1. Have actual data, OR
    #   2. Are empty but within 1 year of the last data year
    #      (future season placeholder)
    # Rows with an end date but no start date match neither rule and are
    # dropped here.
    filter(
      has_data |
        (is_empty & !is.na(max_data_year) & year <= max_data_year + 1)
    ) %>%
    # Clean up helper columns
    select(-has_data, -is_empty, -max_data_year) %>%
    ungroup() %>%
    arrange(field, year)

  after_cleanup <- nrow(harvest_data)
  if (before_cleanup != after_cleanup) {
    cat(
      "Removed", before_cleanup - after_cleanup,
      "incomplete future season rows\n"
    )
  }

  # Create next season rows for fields that have completed seasons ----
  cat("\nCreating next season rows for completed fields...\n")

  # For each field, find the latest completed season
  # (has both start and end dates)
  completed_seasons <- harvest_data %>%
    filter(!is.na(season_start) & !is.na(season_end)) %>%
    group_by(field, sub_field) %>%
    arrange(desc(year)) %>%
    slice(1) %>% # Most recent completed season for each field
    ungroup() %>%
    select(field, sub_field, year, season_end)

  cat("Found", nrow(completed_seasons), "fields with completed seasons\n")

  # For each completed season, check if there's already a next season row.
  # seq_len() yields an empty sequence when there are no completed seasons;
  # 1:nrow() would iterate over c(1, 0) and index rows that do not exist.
  next_season_rows <- list()

  for (i in seq_len(nrow(completed_seasons))) {
    field_name <- completed_seasons$field[i]
    sub_field_name <- completed_seasons$sub_field[i]
    last_completed_year <- completed_seasons$year[i]
    last_harvest_date <- completed_seasons$season_end[i]
    next_year <- last_completed_year + 1
    # The new season starts the day after the previous harvest
    next_start <- as.Date(last_harvest_date) + 1

    # Check if next season already exists for this field
    existing_rows <- harvest_data %>%
      filter(
        field == field_name,
        sub_field == sub_field_name,
        year == next_year
      )
    next_season_exists <- nrow(existing_rows) > 0

    if (!next_season_exists) {
      # Create next season row
      next_season_row <- data.frame(
        field = field_name,
        sub_field = sub_field_name,
        year = next_year,
        season_start = next_start, # Previous harvest + 1 day
        season_end = as.Date(NA), # Not harvested yet
        age = NA,
        sub_area = NA,
        tonnage_ha = NA,
        stringsAsFactors = FALSE
      )

      row_key <- paste(field_name, sub_field_name, next_year, sep = "_")
      next_season_rows[[row_key]] <- next_season_row

      cat(
        "Creating", next_year, "season for field", field_name, "starting",
        format(next_start, "%Y-%m-%d"), "\n"
      )
    } else {
      cat(
        "Next season", next_year, "already exists for field", field_name,
        "\n"
      )
    }
  }

  # Combine all next season rows and add to harvest_data
  if (length(next_season_rows) > 0) {
    next_season_data <- bind_rows(next_season_rows)
    harvest_data <- bind_rows(harvest_data, next_season_data) %>%
      arrange(field, year)
    cat("Added", nrow(next_season_data), "new season rows\n")
  } else {
    cat("No new season rows needed\n")
  }

  # Display preview of final transformed data
  cat("\nPreview of final transformed data (including next season):\n")
  print(head(harvest_data, 15)) # Show more rows to see next season data

  # Remove duplicates based on field, sub_field, year combination ----
  # distinct() keeps the first row of each combination.
  cat("\nRemoving duplicate entries...\n")
  before_dedup <- nrow(harvest_data)

  harvest_data <- harvest_data %>%
    distinct(field, sub_field, year, .keep_all = TRUE)

  after_dedup <- nrow(harvest_data)
  duplicates_removed <- before_dedup - after_dedup

  cat("Removed", duplicates_removed, "duplicate entries\n")
  cat("Final data has", after_dedup, "unique records\n")

  # Remove rows with NA season_start ----
  # Prevents age calculation issues in reports
  cat("\nRemoving rows with NA season_start...\n")
  before_na_removal <- nrow(harvest_data)

  harvest_data <- harvest_data %>%
    filter(!is.na(season_start))

  after_na_removal <- nrow(harvest_data)
  na_removed <- before_na_removal - after_na_removal

  cat("Removed", na_removed, "rows with NA season_start\n")
  cat("Final data has", after_na_removal, "valid records\n")

  # Save to Excel file ----
  # A write failure is reported on the console rather than aborting the script.
  tryCatch({
    write_xlsx(harvest_data, output_file_path)
    cat("\nSuccessfully saved harvest data to:", output_file_path, "\n")
    cat("Total rows saved:", nrow(harvest_data), "\n")
  }, error = function(e) {
    cat("Error saving file:", conditionMessage(e), "\n")
  })
} else {
  cat("No data was successfully loaded from any sheet.\n")
}

cat("\nScript completed.\n")