SmartCane/r_app/30_growth_model_utils.R

# filepath: c:\Users\timon\Resilience BV\4020 SCane ESA DEMO - Documenten\General\4020 SCDEMO Team\4020 TechnicalData\WP3\smartcane\r_app\growth_model_utils.R
#
# GROWTH_MODEL_UTILS.R
# ===================
# Utility functions for growth model interpolation and manipulation.
# These functions support the creation of continuous growth models from point measurements.

#' Safe logging function that works whether log_message exists or not
#'
#' @param message The message to log
#' @param level The log level (default: "INFO")
#' @return NULL (used for side effects)
#'
safe_log <- function(message, level = "INFO") {
  if (exists("log_message")) {
    log_message(message, level)
  } else {
    if (level %in% c("ERROR", "WARNING")) {
      warning(message)
    } else {
      message(message)
    }
  }
}

#' Load and prepare the combined CI data
#'
#' @param data_dir Directory containing the combined CI data
#' @return Long-format dataframe with CI values by date
#'
load_combined_ci_data <- function(data_dir) {
  # Load all daily RDS files from daily_vals/ directory
  daily_vals_dir <- file.path(data_dir, "..", "daily_vals")

  if (!dir.exists(daily_vals_dir)) {
    stop(paste("Daily values directory not found:", daily_vals_dir))
  }

  safe_log(paste("Loading CI data from daily files in:", daily_vals_dir))

  # Find all daily RDS files recursively
  all_daily_files <- list.files(
    path = daily_vals_dir,
    pattern = "\\.rds$",
    full.names = TRUE,
    recursive = TRUE
  )

  if (length(all_daily_files) == 0) {
    stop(paste("No daily RDS files found in:", daily_vals_dir))
  }

  safe_log(sprintf("Found %d daily RDS files to load", length(all_daily_files)))

  # Read and combine all daily RDS files
  # Each file contains: field, sub_field, ci_mean, ci_median, ci_sd, ci_min, ci_max, ci_count
  combined_data <- all_daily_files %>%
    purrr::map(readRDS) %>%
    purrr::list_rbind()

  # Extract date from file path: .../daily_vals/{FIELD}/{YYYY-MM-DD}.rds
  combined_data <- combined_data %>%
    dplyr::mutate(
      file_path = NA_character_,  # Will be filled by mapping
      Date = NA_Date_
    )

  # Add dates by mapping file paths to dates
  for (i in seq_along(all_daily_files)) {
    file_path <- all_daily_files[i]
    date_str <- tools::file_path_sans_ext(basename(file_path))

    # Match rows in combined_data that came from this file
    # This is a simplification - in practice we'd need to track which rows came from which file
    # For now, we'll rebuild the data with explicit date tracking
  }

  # Better approach: rebuild with explicit date tracking
  combined_long <- data.frame()

  for (file in all_daily_files) {
    date_str <- tools::file_path_sans_ext(basename(file))
    rds_data <- readRDS(file)
    rds_data <- rds_data %>%
      dplyr::mutate(Date = lubridate::ymd(date_str))
    combined_long <- rbind(combined_long, rds_data)
  }

  # Reshape to long format using ci_mean as the main CI value
  pivot_stats_long <- combined_long %>%
    dplyr::select(field, sub_field, ci_mean, Date) %>%
    dplyr::rename(value = ci_mean) %>%
    dplyr::mutate(value = as.numeric(value)) %>%
    tidyr::drop_na(c("value", "Date")) %>%
    dplyr::filter(!is.na(sub_field), !is.na(field)) %>%
    dplyr::filter(!is.infinite(value)) %>%
    dplyr::distinct()

  safe_log(sprintf("Loaded %d CI data points from %d daily files",
                   nrow(pivot_stats_long), length(all_daily_files)))

  return(pivot_stats_long)
}

#' Extract and interpolate CI data for a specific field and season
#'
#' @param field_name Name of the field or sub-field
#' @param harvesting_data Dataframe with harvesting information
#' @param field_CI_data Dataframe with CI measurements
#' @param season Year of the growing season
#' @return Dataframe with interpolated daily CI values
#'
extract_CI_data <- function(field_name, harvesting_data, field_CI_data, season) {
  # Filter harvesting data for the given season and field name
  filtered_harvesting_data <- harvesting_data %>%
    dplyr::filter(year == season, sub_field == field_name)

  if (nrow(filtered_harvesting_data) == 0) {
    safe_log(paste("No harvesting data found for field:", field_name, "in season:", season), "WARNING")
    return(data.frame())
  }

  # Filter field CI data for the given field name
  filtered_field_CI_data <- field_CI_data %>%
    dplyr::filter(sub_field == field_name)

  # Return an empty data frame if no CI data is found
  if (nrow(filtered_field_CI_data) == 0) {
    safe_log(paste("No CI data found for field:", field_name, "in season:", season), "WARNING")
    return(data.frame())
  }

  # Log season dates
  season_start <- filtered_harvesting_data$season_start[1]
  season_end <- filtered_harvesting_data$season_end[1]
  ci_date_range <- paste(format(min(filtered_field_CI_data$Date), "%Y-%m-%d"),
                         "to",
                         format(max(filtered_field_CI_data$Date), "%Y-%m-%d"))

  # Create a linear interpolation function for the CI data
  tryCatch({
    ApproxFun <- stats::approxfun(x = filtered_field_CI_data$Date, y = filtered_field_CI_data$value)
    Dates <- seq.Date(min(filtered_field_CI_data$Date), max(filtered_field_CI_data$Date), by = 1)
    LinearFit <- ApproxFun(Dates)

    # Combine interpolated data with the original CI data
    CI <- data.frame(Date = Dates, FitData = LinearFit) %>%
      dplyr::left_join(filtered_field_CI_data, by = "Date") %>%
      dplyr::filter(Date > filtered_harvesting_data$season_start & Date < filtered_harvesting_data$season_end)

    # If CI is empty after filtering, return an empty dataframe
    if (nrow(CI) == 0) {
      safe_log(paste0("No CI data within season dates for field: ", field_name,
                     " (Season: ", season, ", dates: ",
                     format(season_start, "%Y-%m-%d"), " to ",
                     format(season_end, "%Y-%m-%d"),
                     "). Available CI data range: ", ci_date_range),
               "WARNING")
      return(data.frame())
    }

    # Add additional columns
    CI <- CI %>%
      dplyr::mutate(
        DOY = seq(1, n(), 1),
        model = paste0("Data", season, " : ", field_name),
        season = season,
        subField = field_name
      )

    # Log successful interpolation
    safe_log(paste0("Successfully interpolated CI data for field: ", field_name,
                   " (Season: ", season, ", dates: ",
                   format(season_start, "%Y-%m-%d"), " to ",
                   format(season_end, "%Y-%m-%d"),
                   "). ", nrow(CI), " data points created."))

    return(CI)
  }, error = function(e) {
    safe_log(paste0("Error interpolating CI data for field ", field_name,
                   " in season ", season,
                   " (", format(season_start, "%Y-%m-%d"), " to ",
                   format(season_end, "%Y-%m-%d"),
                   "): ", e$message), "ERROR")
    return(data.frame())
  })
}

#' Generate interpolated CI data for all fields and seasons
#'
#' @param years Vector of years to process
#' @param harvesting_data Dataframe with harvesting information
#' @param ci_data Long-format dataframe with CI measurements
#' @return Dataframe with interpolated daily CI values for all fields/seasons
#'
generate_interpolated_ci_data <- function(years, harvesting_data, ci_data) {
  safe_log("Starting CI data interpolation for all fields")

  # Process each year
  result <- purrr::map_df(years, function(yr) {
    safe_log(paste("Processing year:", yr))

    # Get the fields harvested in this year with valid season start dates
    sub_fields <- harvesting_data %>%
      dplyr::filter(year == yr, !is.na(season_start)) %>%
      dplyr::pull(sub_field)

    if (length(sub_fields) == 0) {
      safe_log(paste("No fields with valid season data for year:", yr), "WARNING")
      return(data.frame())
    }

    # Filter sub_fields to only include those with value data in ci_data
    valid_sub_fields <- sub_fields %>%
      purrr::keep(~ any(ci_data$sub_field == .x))

    if (length(valid_sub_fields) == 0) {
      safe_log(paste("No fields with CI data for year:", yr), "WARNING")
      return(data.frame())
    }

    # Extract and interpolate data for each valid field
    safe_log(paste("Processing", length(valid_sub_fields), "fields for year:", yr))

    result <- purrr::map(valid_sub_fields, ~ extract_CI_data(.x,
                                                  harvesting_data = harvesting_data,
                                                  field_CI_data = ci_data,
                                                  season = yr)) %>%
      purrr::list_rbind()

    safe_log(paste("Generated", nrow(result), "interpolated data points for year:", yr))
    return(result)
  })

  safe_log(paste("Total interpolated data points:", nrow(result)))
  return(result)
}

#' Calculate growth metrics for interpolated CI data
#'
#' @param interpolated_data Dataframe with interpolated CI values
#' @return Dataframe with added growth metrics (CI_per_day and cumulative_CI)
#'
calculate_growth_metrics <- function(interpolated_data) {
  if (nrow(interpolated_data) == 0) {
    safe_log("No data provided to calculate growth metrics", "WARNING")
    return(interpolated_data)
  }

  result <- interpolated_data %>%
    dplyr::group_by(model) %>%
    dplyr::mutate(
      CI_per_day = FitData - dplyr::lag(FitData),
      cumulative_CI = cumsum(FitData)
    )

  return(result)
}

#' Save interpolated growth model data
#'
#' @param data Dataframe with interpolated growth data
#' @param output_dir Directory to save the output
#' @param file_name Filename for the output (default: "All_pivots_Cumulative_CI_quadrant_year_v2.rds")
#' @return Path to the saved file
#'
save_growth_model <- function(data, output_dir, file_name = "All_pivots_Cumulative_CI_quadrant_year_v2.rds") {
  # Create output directory if it doesn't exist
  dir.create(output_dir, recursive = TRUE, showWarnings = FALSE)

  # Create full file path
  file_path <- here::here(output_dir, file_name)

  # Save the data
  saveRDS(data, file_path)

  safe_log(paste("Interpolated CI data saved to:", file_path))
  return(file_path)
}