SmartCane/r_app/experiments/ci_extraction_and_yield_prediction.R

# CI_EXTRACTION_AND_YIELD_PREDICTION.R
# =====================================
#
# This standalone script demonstrates:
# 1. How Chlorophyll Index (CI) is extracted from satellite imagery
# 2. How yield prediction is performed based on CI values
#
# Created for sharing with colleagues to illustrate the core functionality
# of the SmartCane monitoring system.
#

# -----------------------------
# PART 1: LIBRARY DEPENDENCIES
# -----------------------------

suppressPackageStartupMessages({
  # Spatial data processing
  library(sf)
  library(terra)
  library(exactextractr)

  # Data manipulation
  library(tidyverse)
  library(lubridate)
  library(here)

  # Machine learning for yield prediction
  library(rsample)
  library(caret)
  library(randomForest)
  library(CAST)
})

# ----------------------------------
# PART 2: LOGGING & UTILITY FUNCTIONS
# ----------------------------------

#' Safe logging function that works in any environment
#'
#' @param message The message to log
#' @param level The log level (default: "INFO")
#' @return NULL (used for side effects)
#'
safe_log <- function(message, level = "INFO") {
  timestamp <- format(Sys.time(), "%Y-%m-%d %H:%M:%S")
  formatted_msg <- paste0("[", timestamp, "][", level, "] ", message)

  if (level %in% c("ERROR", "WARNING")) {
    warning(formatted_msg)
  } else {
    message(formatted_msg)
  }
}

#' Generate a sequence of dates for processing
#'
#' @param end_date The end date for the sequence (Date object)
#' @param offset Number of days to look back from end_date
#' @return A list containing week number, year, and a sequence of dates for filtering
#'
date_list <- function(end_date, offset) {
  # Input validation
  if (!lubridate::is.Date(end_date)) {
    end_date <- as.Date(end_date)
    if (is.na(end_date)) {
      stop("Invalid end_date provided. Expected a Date object or a string convertible to Date.")
    }
  }

  offset <- as.numeric(offset)
  if (is.na(offset) || offset < 1) {
    stop("Invalid offset provided. Expected a positive number.")
  }

  # Calculate date range
  offset <- offset - 1  # Adjust offset to include end_date
  start_date <- end_date - lubridate::days(offset)

  # Extract week and year information
  week <- lubridate::week(start_date)
  year <- lubridate::year(start_date)

  # Generate sequence of dates
  days_filter <- seq(from = start_date, to = end_date, by = "day")
  days_filter <- format(days_filter, "%Y-%m-%d")  # Format for consistent filtering

  # Log the date range
  safe_log(paste("Date range generated from", start_date, "to", end_date))

  return(list(
    "week" = week,
    "year" = year,
    "days_filter" = days_filter,
    "start_date" = start_date,
    "end_date" = end_date
  ))
}

# -----------------------------
# PART 3: CI EXTRACTION PROCESS
# -----------------------------

#' Find satellite imagery files within a specific date range
#'
#' @param image_folder Path to the folder containing satellite images
#' @param date_filter Vector of dates to filter by (in YYYY-MM-DD format)
#' @return Vector of file paths matching the date filter
#'
find_satellite_images <- function(image_folder, date_filter) {
  # Validate inputs
  if (!dir.exists(image_folder)) {
    stop(paste("Image folder not found:", image_folder))
  }

  # List all files in the directory
  all_files <- list.files(image_folder, pattern = "\\.tif$", full.names = TRUE, recursive = TRUE)

  if (length(all_files) == 0) {
    safe_log("No TIF files found in the specified directory", "WARNING")
    return(character(0))
  }

  # Filter files by date pattern in filename
  filtered_files <- character(0)

  for (date in date_filter) {
    # Format date for matching (remove dashes)
    date_pattern <- gsub("-", "", date)

    # Find files with matching date pattern
    matching_files <- all_files[grepl(date_pattern, all_files)]

    if (length(matching_files) > 0) {
      filtered_files <- c(filtered_files, matching_files)
      safe_log(paste("Found", length(matching_files), "files for date", date))
    }
  }

  return(filtered_files)
}

#' Create a Chlorophyll Index (CI) from satellite imagery
#'
#' @param raster_obj A SpatRaster object with Red, Green, Blue, and NIR bands
#' @return A SpatRaster object with a CI band
#'
calculate_ci <- function(raster_obj) {
  # Validate input has required bands
  if (terra::nlyr(raster_obj) < 4) {
    stop("Raster must have at least 4 bands (Red, Green, Blue, NIR)")
  }

  # Extract bands (assuming standard order: B, G, R, NIR)
  blue_band <- raster_obj[[1]]
  green_band <- raster_obj[[2]]
  red_band <- raster_obj[[3]]
  nir_band <- raster_obj[[4]]

  # CI formula: (NIR / Green) - 1, NOT (NIR / Red) - 1
  # *** CRITICAL: Use GREEN band for Chlorophyll Index, NOT RED ***
  # GREEN band is essential for proper chlorophyll-sensitive calculation
  ci_raster <- (nir_band / green_band) - 1

  # Filter extreme values that may result from division operations
  ci_raster[ci_raster > 10] <- 10  # Cap max value
  ci_raster[ci_raster < 0] <- 0    # Cap min value

  # Name the layer
  names(ci_raster) <- "CI"

  return(ci_raster)
}

#' Create a mask for cloudy pixels and shadows using thresholds
#'
#' @param raster_obj A SpatRaster object with multiple bands
#' @return A binary mask where 1=clear pixel, 0=cloudy or shadow pixel
#'
create_cloud_mask <- function(raster_obj) {
  # Extract bands
  blue_band <- raster_obj[[1]]
  green_band <- raster_obj[[2]]
  red_band <- raster_obj[[3]]
  nir_band <- raster_obj[[4]]

  # Create initial mask (all pixels valid)
  mask <- blue_band * 0 + 1

  # Calculate indices used for detection
  ndvi <- (nir_band - red_band) / (nir_band + red_band)
  brightness <- (blue_band + green_band + red_band) / 3

  # CLOUD DETECTION CRITERIA
  # ------------------------
  # Clouds are typically very bright in all bands
  bright_pixels <- (blue_band > 0.3) & (green_band > 0.3) & (red_band > 0.3)

  # Snow/high reflectance clouds have high blue values
  blue_dominant <- blue_band > (red_band * 1.2)

  # Low NDVI areas that are bright are likely clouds
  low_ndvi <- ndvi < 0.1

  # Combine cloud criteria
  cloud_pixels <- bright_pixels & (blue_dominant | low_ndvi)

  # SHADOW DETECTION CRITERIA
  # ------------------------
  # Shadows typically have:
  # 1. Low overall brightness across all bands
  # 2. Lower NIR reflectance
  # 3. Can still have reasonable NDVI (if over vegetation)

  # Dark pixels in visible spectrum
  dark_pixels <- brightness < 0.1

  # Low NIR reflectance
  low_nir <- nir_band < 0.15

  # Shadows often have higher blue proportion relative to NIR
  blue_nir_ratio <- blue_band / (nir_band + 0.01)  # Add small constant to avoid division by zero
  blue_enhanced <- blue_nir_ratio > 0.8

  # Combine shadow criteria
  shadow_pixels <- dark_pixels & (low_nir | blue_enhanced)

  # Update mask (0 for cloud or shadow pixels)
  mask[cloud_pixels | shadow_pixels] <- 0

  # Optional: create different values for clouds vs shadows for visualization
  # mask[cloud_pixels] <- 0  # Clouds
  # mask[shadow_pixels] <- 0  # Shadows

  return(mask)
}

#' Process satellite image, calculate CI, and crop to field boundaries
#'
#' @param file Path to the satellite image file
#' @param field_boundaries Field boundaries vector object
#' @param output_dir Directory to save the processed raster
#' @return Path to the processed raster file
#'
process_satellite_image <- function(file, field_boundaries, output_dir) {
  # Validate inputs
  if (!file.exists(file)) {
    stop(paste("File not found:", file))
  }

  if (is.null(field_boundaries)) {
    stop("Field boundaries are required but were not provided")
  }

  # Create output filename
  basename_no_ext <- tools::file_path_sans_ext(basename(file))
  output_file <- here::here(output_dir, paste0(basename_no_ext, "_CI.tif"))

  # Process with error handling
  tryCatch({
    # Load and prepare raster
    loaded_raster <- terra::rast(file)

    # Calculate CI
    ci_raster <- calculate_ci(loaded_raster)

    # Create cloud mask
    cloud_mask <- create_cloud_mask(loaded_raster)

    # Apply cloud mask to CI
    ci_masked <- ci_raster * cloud_mask

    # Crop to field boundaries extent (for efficiency)
    field_extent <- terra::ext(field_boundaries)
    ci_cropped <- terra::crop(ci_masked, field_extent)

    # Write output
    terra::writeRaster(ci_cropped, output_file, overwrite = TRUE)

    safe_log(paste("Successfully processed", basename(file)))

    return(output_file)

  }, error = function(e) {
    safe_log(paste("Error processing", basename(file), ":", e$message), "ERROR")
    return(NULL)
  })
}

#' Extract CI statistics for each field
#'
#' @param ci_raster A SpatRaster with CI values
#' @param field_boundaries An sf object with field polygons
#' @return A data frame with CI statistics by field
#'
extract_ci_by_field <- function(ci_raster, field_boundaries) {
  # Validate inputs
  if (is.null(ci_raster)) {
    stop("CI raster is required but was NULL")
  }

  if (is.null(field_boundaries) || nrow(field_boundaries) == 0) {
    stop("Field boundaries are required but were empty")
  }

  # Extract statistics using exact extraction (weighted by coverage)
  ci_stats <- exactextractr::exact_extract(
    ci_raster,
    field_boundaries,
    fun = c("mean", "median", "min", "max", "stdev", "count"),
    progress = FALSE
  )

  # Add field identifiers
  ci_stats$field <- field_boundaries$field
  if ("sub_field" %in% names(field_boundaries)) {
    ci_stats$sub_field <- field_boundaries$sub_field
  } else {
    ci_stats$sub_field <- field_boundaries$field
  }

  # Add date info
  ci_stats$date <- Sys.Date()

  # Clean up names
  names(ci_stats) <- gsub("CI\\.", "", names(ci_stats))

  return(ci_stats)
}

# -----------------------------------------
# PART 4: YIELD PREDICTION IMPLEMENTATION
# -----------------------------------------

#' Prepare data for yield prediction model
#'
#' @param ci_data Data frame with cumulative CI values
#' @param harvest_data Data frame with harvest information
#' @return Data frame ready for modeling
#'
prepare_yield_prediction_data <- function(ci_data, harvest_data) {
  # Join CI and yield data
  ci_and_yield <- dplyr::left_join(ci_data, harvest_data, by = c("field", "sub_field", "season")) %>%
    dplyr::group_by(sub_field, season) %>%
    dplyr::slice(which.max(DOY)) %>%
    dplyr::select(field, sub_field, tonnage_ha, cumulative_CI, DOY, season, sub_area) %>%
    dplyr::mutate(CI_per_day = cumulative_CI / DOY)

  # Split into training and prediction sets
  ci_and_yield_train <- ci_and_yield %>%
    as.data.frame() %>%
    dplyr::filter(!is.na(tonnage_ha))

  prediction_yields <- ci_and_yield %>%
    as.data.frame() %>%
    dplyr::filter(is.na(tonnage_ha))

  return(list(
    train = ci_and_yield_train,
    predict = prediction_yields
  ))
}

#' Train a random forest model for yield prediction
#'
#' @param training_data Data frame with training data
#' @param predictors Vector of predictor variable names
#' @param response Name of the response variable
#' @return Trained model
#'
train_yield_model <- function(training_data, predictors = c("cumulative_CI", "DOY", "CI_per_day"), response = "tonnage_ha") {
  # Configure model training parameters
  ctrl <- caret::trainControl(
    method = "cv",
    savePredictions = TRUE,
    allowParallel = TRUE,
    number = 5,
    verboseIter = TRUE
  )

  # Train the model with feature selection
  set.seed(202) # For reproducibility
  model_ffs_rf <- CAST::ffs(
    training_data[, predictors],
    training_data[, response],
    method = "rf",
    trControl = ctrl,
    importance = TRUE,
    withinSE = TRUE,
    tuneLength = 5,
    na.rm = TRUE
  )

  return(model_ffs_rf)
}

#' Format predictions into a clean data frame
#'
#' @param predictions Raw prediction results
#' @param newdata Original data frame with field information
#' @return Formatted predictions data frame
#'
prepare_predictions <- function(predictions, newdata) {
  return(predictions %>%
    as.data.frame() %>%
    dplyr::rename(predicted_Tcha = ".") %>%
    dplyr::mutate(
      sub_field = newdata$sub_field,
      field = newdata$field,
      Age_days = newdata$DOY,
      total_CI = round(newdata$cumulative_CI, 0),
      predicted_Tcha = round(predicted_Tcha, 0),
      season = newdata$season
    ) %>%
    dplyr::select(field, sub_field, Age_days, total_CI, predicted_Tcha, season) %>%
    dplyr::left_join(., newdata, by = c("field", "sub_field", "season"))
  )
}

#' Predict yields for mature fields
#'
#' @param model Trained model
#' @param prediction_data Data frame with fields to predict
#' @param min_age Minimum age in days to qualify as mature (default: 300)
#' @return Data frame with yield predictions
#'
predict_yields <- function(model, prediction_data, min_age = 300) {
  # Make predictions
  predictions <- stats::predict(model, newdata = prediction_data)

  # Format predictions
  pred_formatted <- prepare_predictions(predictions, prediction_data) %>%
    dplyr::filter(Age_days > min_age) %>%
    dplyr::mutate(CI_per_day = round(total_CI / Age_days, 1))

  return(pred_formatted)
}

# ------------------------------
# PART 5: DEMONSTRATION WORKFLOW
# ------------------------------

#' Demonstration workflow showing how to use the functions
#'
#' @param end_date The end date for processing satellite images
#' @param offset Number of days to look back
#' @param image_folder Path to the folder containing satellite images
#' @param field_boundaries_path Path to field boundaries shapefile
#' @param output_dir Path to save processed outputs
#' @param harvest_data_path Path to historical harvest data
#'
demo_workflow <- function(end_date = Sys.Date(), offset = 7,
                         image_folder = "path/to/satellite/images",
                         field_boundaries_path = "path/to/field_boundaries.shp",
                         output_dir = "path/to/output",
                         harvest_data_path = "path/to/harvest_data.csv") {

  # Step 1: Generate date list for processing
  dates <- date_list(end_date, offset)
  safe_log(paste("Processing data for week", dates$week, "of", dates$year))

  # Step 2: Load field boundaries
  field_boundaries <- sf::read_sf(field_boundaries_path)
  safe_log(paste("Loaded", nrow(field_boundaries), "field boundaries"))

  # Step 3: Find satellite images for the specified date range
  image_files <- find_satellite_images(image_folder, dates$days_filter)
  safe_log(paste("Found", length(image_files), "satellite images for processing"))

  # Step 4: Process each satellite image and calculate CI
  ci_files <- list()
  for (file in image_files) {
    ci_file <- process_satellite_image(file, field_boundaries, output_dir)
    if (!is.null(ci_file)) {
      ci_files <- c(ci_files, ci_file)
    }
  }

  # Step 5: Extract CI statistics for each field
  ci_stats_list <- list()
  for (ci_file in ci_files) {
    ci_raster <- terra::rast(ci_file)
    ci_stats <- extract_ci_by_field(ci_raster, field_boundaries)
    ci_stats_list[[basename(ci_file)]] <- ci_stats
  }

  # Combine all stats
  all_ci_stats <- dplyr::bind_rows(ci_stats_list)
  safe_log(paste("Extracted CI statistics for", nrow(all_ci_stats), "field-date combinations"))

  # Step 6: Prepare for yield prediction
  if (file.exists(harvest_data_path)) {
    # Load harvest data
    harvest_data <- read.csv(harvest_data_path)
    safe_log("Loaded harvest data for yield prediction")

    # Make up cumulative_CI data for demonstration purposes
    # In a real scenario, this would come from accumulating CI values over time
    ci_data <- all_ci_stats %>%
      dplyr::group_by(field, sub_field) %>%
      dplyr::summarise(
        cumulative_CI = sum(mean, na.rm = TRUE),
        DOY = n(),  # Days of year as the count of observations
        season = lubridate::year(max(date, na.rm = TRUE)),
        .groups = "drop"
      )

    # Prepare data for modeling
    modeling_data <- prepare_yield_prediction_data(ci_data, harvest_data)

    if (nrow(modeling_data$train) > 0) {
      # Train yield prediction model
      yield_model <- train_yield_model(modeling_data$train)
      safe_log("Trained yield prediction model")

      # Predict yields for mature fields
      yield_predictions <- predict_yields(yield_model, modeling_data$predict)
      safe_log(paste("Generated yield predictions for", nrow(yield_predictions), "fields"))

      # Return results
      return(list(
        ci_stats = all_ci_stats,
        yield_predictions = yield_predictions,
        model = yield_model
      ))
    } else {
      safe_log("No training data available for yield prediction", "WARNING")
      return(list(ci_stats = all_ci_stats))
    }
  } else {
    safe_log("Harvest data not found, skipping yield prediction", "WARNING")
    return(list(ci_stats = all_ci_stats))
  }
}

# ------------------------------
# PART 6: USAGE EXAMPLE
# ------------------------------

# Uncomment and modify paths to run the demo workflow
# results <- demo_workflow(
#   end_date = "2023-10-01",
#   offset = 7,
#   image_folder = "data/satellite_images",
#   field_boundaries_path = "data/field_boundaries.shp",
#   output_dir = "output/processed",
#   harvest_data_path = "data/harvest_history.csv"
# )
#
# # Access results
# ci_stats <- results$ci_stats
# yield_predictions <- results$yield_predictions
#
# # Example: Plot CI distribution by field
# if (require(ggplot2)) {
#   ggplot(ci_stats, aes(x = field, y = mean, fill = field)) +
#     geom_boxplot() +
#     labs(title = "CI Distribution by Field",
#          x = "Field",
#          y = "Mean CI") +
#     theme_minimal() +
#     theme(axis.text.x = element_text(angle = 45, hjust = 1))
# }
#
# # Example: Plot predicted yield vs age
# if (exists("yield_predictions") && require(ggplot2)) {
#   ggplot(yield_predictions, aes(x = Age_days, y = predicted_Tcha, color = field)) +
#     geom_point(size = 3) +
#     geom_text(aes(label = field), hjust = -0.2, vjust = -0.2) +
#     labs(title = "Predicted Yield by Field Age",
#          x = "Age (Days)",
#          y = "Predicted Yield (Tonnes/ha)") +
#     theme_minimal()
# }