# SmartCane/r_app/experiments/combine_esa_yield_data.R

# Combine ESA Yield Data from 5 tabs into Aura harvest format
# Script to create harvest.xlsx in ESA directory matching Aura structure

# Load required libraries
library(readxl)
library(writexl)
library(dplyr)
library(lubridate)

# Absolute locations of the ESA input workbook and the generated harvest file.
# Both live in the same "Data" directory, so build that directory path once.
base_path <- "C:/Users/timon/Resilience BV/4020 SCane ESA DEMO - Documenten/General/4020 SCDEMO Team/4020 TechnicalData/WP3/smartcane_v2/smartcane"
esa_data_dir <- file.path(base_path, "laravel_app", "storage", "app", "esa", "Data")
esa_file_path <- file.path(esa_data_dir, "esa_yield_data.xlsx")
output_file_path <- file.path(esa_data_dir, "harvest.xlsx")

# Abort immediately when the source workbook is missing
esa_file_found <- file.exists(esa_file_path)
if (!esa_file_found) {
  stop("ESA yield data file not found: ", esa_file_path)
}

# List the workbook's sheets; each sheet is expected to be one season
# (e.g. 2019-20, 2020-21, 2021-22, 2022-2023, 2023-24, 2024-25, ...)
sheet_names <- excel_sheets(esa_file_path)
sheet_list <- toString(sheet_names)
cat("Found sheets:", sheet_list, "\n")

# Derive the harvest year from a season sheet name.
#
# The harvest year is the second calendar year of the season label.
#
# Args:
#   sheet_name: A single sheet name. Surrounding whitespace is ignored, so
#     " 2019-20 " is treated the same as "2019-20".
#
# Returns:
#   A numeric year:
#     "2019-20"   -> 2020 (first year + 1)
#     "2022-2023" -> 2023 (second year as written)
#     otherwise   -> the first 4-digit number found in the name,
#                    or NA_real_ when the name contains none.
extract_year <- function(sheet_name) {
  # Excel tab names frequently carry stray spaces; without trimming, the
  # anchored patterns below would not match and the fallback would return
  # the season's FIRST year instead of the harvest year.
  label <- trimws(sheet_name)

  if (grepl("^\\d{4}-\\d{2}$", label)) {
    # Format: 2019-20
    return(as.numeric(substr(label, 1, 4)) + 1)
  }

  if (grepl("^\\d{4}-\\d{4}$", label)) {
    # Format: 2022-2023
    return(as.numeric(substr(label, 6, 9)))
  }

  # Fallback: first 4-digit number anywhere in the name
  year_match <- regmatches(label, regexpr("\\d{4}", label))
  if (length(year_match) > 0) as.numeric(year_match[1]) else NA_real_
}

# Accumulator: one data frame per successfully read sheet, keyed by sheet name
all_data <- list()

# Read every sheet in turn. A sheet that cannot be read is reported and
# skipped so the remaining sheets are still processed.
for (sheet in sheet_names) {
  cat("Processing sheet:", sheet, "\n")

  tryCatch(
    expr = {
      sheet_rows <- read_excel(esa_file_path, sheet = sheet)

      # Tag each row with its harvest year and the sheet it came from
      sheet_rows$harvest_year <- extract_year(sheet)
      sheet_rows$sheet_name <- sheet

      all_data[[sheet]] <- sheet_rows

      cat("  - Loaded", nrow(sheet_rows), "rows from sheet", sheet, "\n")
    },
    error = function(e) {
      cat("  - Error reading sheet", sheet, ":", e$message, "\n")
    }
  )
}

# Combine all data
# Everything in this branch only runs when at least one sheet was read.
# Steps: combine -> map columns -> drop stray future seasons -> add the
# upcoming season per field -> de-duplicate -> drop rows without a start
# date -> write the Excel file.
if (length(all_data) > 0) {
  combined_data <- bind_rows(all_data)
  cat("Combined data: ", nrow(combined_data), "total rows\n")

  # Display column names to understand the structure
  cat("Available columns:\n")
  print(colnames(combined_data))

  # Transform to SmartCane format
  # Map ESA columns to SmartCane columns based on the sample data provided.
  # Reads the ESA columns Grow_Start, Harvest_Date, Field and TCH.
  harvest_data <- combined_data %>%
    mutate(
      # Convert dates using lubridate (original format is YYYY-MM-DD = ymd)
      grow_start_date = ymd(Grow_Start),
      harvest_date_date = ymd(Harvest_Date),

      # Age in whole weeks: day difference between the two dates divided by 7
      age = round(as.numeric(harvest_date_date - grow_start_date) / 7, 0),

      # Format fields for output
      field = Field,
      sub_field = Field,  # sub_field is filled with the same value as field
      year = harvest_year,
      season_start = grow_start_date,   # Keep as Date object
      season_end = harvest_date_date,   # Keep as Date object
      sub_area = NA,  # Leave empty as requested - not actual area but section names
      tonnage_ha = TCH
    ) %>%
    select(field, sub_field, year, season_start, season_end, age, sub_area, tonnage_ha) %>%
    arrange(field, year)

  # Clean up incomplete future seasons that shouldn't exist
  cat("\nCleaning up incomplete future seasons...\n")

  before_cleanup <- nrow(harvest_data)

  # For each field, find the last season with actual data (either completed or ongoing)
  # Remove any future seasons beyond that
  harvest_data <- harvest_data %>%
    group_by(field, sub_field) %>%
    arrange(year) %>%
    mutate(
      # Mark rows with actual data (has start date)
      has_data = !is.na(season_start),
      # Mark completely empty rows (both start and end are NA)
      is_empty = is.na(season_start) & is.na(season_end)
    ) %>%
    # For each field, find the maximum year with actual data.
    # The condition is a single value per group, so a plain if/else is used
    # (rather than ifelse) and the result is always a double.
    mutate(
      max_data_year = if (any(has_data)) {
        max(year[has_data], na.rm = TRUE)
      } else {
        NA_real_
      }
    ) %>%
    # Keep only rows that:
    # 1. Have actual data, OR
    # 2. Are empty but within 1 year of the last data year (future season placeholder)
    filter(
      has_data |
      (is_empty & !is.na(max_data_year) & year <= max_data_year + 1)
    ) %>%
    # Clean up helper columns
    select(-has_data, -is_empty, -max_data_year) %>%
    ungroup() %>%
    arrange(field, year)

  after_cleanup <- nrow(harvest_data)

  if (before_cleanup != after_cleanup) {
    cat("Removed", before_cleanup - after_cleanup, "incomplete future season rows\n")
  }

  # Create next season rows for fields that have completed seasons
  cat("\nCreating next season rows for completed fields...\n")

  # For each field, find the latest completed season (has both start and end dates)
  completed_seasons <- harvest_data %>%
    filter(!is.na(season_start) & !is.na(season_end)) %>%
    group_by(field, sub_field) %>%
    arrange(desc(year)) %>%
    slice(1) %>%  # Get the most recent completed season for each field
    ungroup() %>%
    select(field, sub_field, year, season_end)

  cat("Found", nrow(completed_seasons), "fields with completed seasons\n")

  # For each completed season, check if there's already a next season row
  next_season_rows <- list()

  # seq_len() yields an empty sequence when there are no completed seasons;
  # 1:nrow() would iterate over c(1, 0) and fail on the zero-length lookups.
  for (i in seq_len(nrow(completed_seasons))) {
    field_name <- completed_seasons$field[i]
    sub_field_name <- completed_seasons$sub_field[i]
    last_completed_year <- completed_seasons$year[i]
    last_harvest_date <- completed_seasons$season_end[i]
    next_year <- last_completed_year + 1

    # The new season starts the day after the previous harvest
    next_start <- as.Date(last_harvest_date) + 1

    # Check if next season already exists for this field
    next_season_exists <- harvest_data %>%
      filter(field == field_name, sub_field == sub_field_name, year == next_year) %>%
      nrow() > 0

    if (!next_season_exists) {
      # Create next season row
      next_season_row <- data.frame(
        field = field_name,
        sub_field = sub_field_name,
        year = next_year,
        season_start = next_start,  # Previous harvest + 1 day
        season_end = as.Date(NA),  # Not harvested yet
        age = NA,
        sub_area = NA,
        tonnage_ha = NA,
        stringsAsFactors = FALSE
      )

      next_season_rows[[paste(field_name, sub_field_name, next_year, sep = "_")]] <- next_season_row
      cat("Creating", next_year, "season for field", field_name, "starting", format(next_start, "%Y-%m-%d"), "\n")
    } else {
      cat("Next season", next_year, "already exists for field", field_name, "\n")
    }
  }

  # Combine all next season rows and add to harvest_data
  if (length(next_season_rows) > 0) {
    next_season_data <- bind_rows(next_season_rows)
    harvest_data <- bind_rows(harvest_data, next_season_data) %>%
      arrange(field, year)

    cat("Added", nrow(next_season_data), "new season rows\n")
  } else {
    cat("No new season rows needed\n")
  }

  # Display preview of the transformed data. Note this is printed BEFORE the
  # de-duplication and NA-start removal steps below.
  cat("\nPreview of final transformed data (including next season):\n")
  print(head(harvest_data, 15))  # Show more rows to see next season data

  # Remove duplicates based on field, sub_field, year combination
  # (the first row of each combination is kept)
  cat("\nRemoving duplicate entries...\n")
  before_dedup <- nrow(harvest_data)

  harvest_data <- harvest_data %>%
    distinct(field, sub_field, year, .keep_all = TRUE)

  after_dedup <- nrow(harvest_data)
  duplicates_removed <- before_dedup - after_dedup

  cat("Removed", duplicates_removed, "duplicate entries\n")
  cat("Final data has", after_dedup, "unique records\n")

  # Remove rows with NA season_start to prevent age calculation issues in reports.
  # This also drops any empty placeholder rows kept by the cleanup step above.
  cat("\nRemoving rows with NA season_start...\n")
  before_na_removal <- nrow(harvest_data)

  harvest_data <- harvest_data %>%
    filter(!is.na(season_start))

  after_na_removal <- nrow(harvest_data)
  na_removed <- before_na_removal - after_na_removal

  cat("Removed", na_removed, "rows with NA season_start\n")
  cat("Final data has", after_na_removal, "valid records\n")

  # Save to Excel file. A write failure is reported but does not abort the
  # script, so "Script completed." is still printed afterwards.
  tryCatch({
    write_xlsx(harvest_data, output_file_path)
    cat("\nSuccessfully saved harvest data to:", output_file_path, "\n")
    cat("Total rows saved:", nrow(harvest_data), "\n")
  }, error = function(e) {
    cat("Error saving file:", e$message, "\n")
  })

} else {
  cat("No data was successfully loaded from any sheet.\n")
}

cat("\nScript completed.\n")