# CI_EXTRACTION_AND_YIELD_PREDICTION.R
# =====================================
#
# This standalone script demonstrates:
# 1. How Chlorophyll Index (CI) is extracted from satellite imagery
# 2. How yield prediction is performed based on CI values
#
# Created for sharing with colleagues to illustrate the core functionality
# of the SmartCane monitoring system.
#

# -----------------------------
# PART 1: LIBRARY DEPENDENCIES
# -----------------------------
suppressPackageStartupMessages({
  # Spatial data processing
  library(sf)
  library(terra)
  library(exactextractr)

  # Data manipulation
  library(tidyverse)
  library(lubridate)
  library(here)

  # Machine learning for yield prediction
  library(rsample)
  library(caret)
  library(randomForest)
  library(CAST)
})

# ----------------------------------
# PART 2: LOGGING & UTILITY FUNCTIONS
# ----------------------------------

#' Safe logging function that works in any environment
#'
#' Emits a timestamped message. ERROR/WARNING levels are routed through
#' `warning()` so they surface in batch logs; everything else goes through
#' `message()` so it can be silenced with `suppressMessages()`.
#'
#' @param message The message to log
#' @param level The log level (default: "INFO")
#' @return NULL (used for side effects)
#'
safe_log <- function(message, level = "INFO") {
  timestamp <- format(Sys.time(), "%Y-%m-%d %H:%M:%S")
  formatted_msg <- paste0("[", timestamp, "][", level, "] ", message)

  if (level %in% c("ERROR", "WARNING")) {
    warning(formatted_msg)
  } else {
    message(formatted_msg)
  }
}

#' Generate a sequence of dates for processing
#'
#' @param end_date The end date for the sequence (Date object or a string
#'   convertible to Date)
#' @param offset Number of days to look back from end_date (inclusive of
#'   end_date, so offset = 7 yields a 7-day window)
#' @return A list containing week number, year, and a sequence of dates for
#'   filtering
#'
date_list <- function(end_date, offset) {
  # Input validation
  if (!lubridate::is.Date(end_date)) {
    # as.Date() errors (rather than returning NA) on unparseable strings,
    # so wrap it in tryCatch to give the caller one consistent error message.
    end_date <- tryCatch(as.Date(end_date), error = function(e) as.Date(NA))
    if (is.na(end_date)) {
      stop("Invalid end_date provided. Expected a Date object or a string convertible to Date.")
    }
  }

  # as.numeric() returns NA (with a warning) on non-numeric input,
  # which the check below catches.
  offset <- suppressWarnings(as.numeric(offset))
  if (is.na(offset) || offset < 1) {
    stop("Invalid offset provided. Expected a positive number.")
  }

  # Calculate date range
  offset <- offset - 1 # Adjust offset to include end_date
  start_date <- end_date - lubridate::days(offset)

  # Extract week and year information (both refer to the window start)
  week <- lubridate::week(start_date)
  year <- lubridate::year(start_date)

  # Generate sequence of dates
  days_filter <- seq(from = start_date, to = end_date, by = "day")
  days_filter <- format(days_filter, "%Y-%m-%d") # Format for consistent filtering

  # Log the date range
  safe_log(paste("Date range generated from", start_date, "to", end_date))

  return(list(
    "week" = week,
    "year" = year,
    "days_filter" = days_filter,
    "start_date" = start_date,
    "end_date" = end_date
  ))
}

# -----------------------------
# PART 3: CI EXTRACTION PROCESS
# -----------------------------

#' Find satellite imagery files within a specific date range
#'
#' Matches files whose name contains the date in compact YYYYMMDD form.
#'
#' @param image_folder Path to the folder containing satellite images
#' @param date_filter Vector of dates to filter by (in YYYY-MM-DD format)
#' @return Vector of unique file paths matching the date filter
#'
find_satellite_images <- function(image_folder, date_filter) {
  # Validate inputs
  if (!dir.exists(image_folder)) {
    stop(paste("Image folder not found:", image_folder))
  }

  # List all files in the directory
  all_files <- list.files(image_folder,
                          pattern = "\\.tif$",
                          full.names = TRUE,
                          recursive = TRUE)

  if (length(all_files) == 0) {
    safe_log("No TIF files found in the specified directory", "WARNING")
    return(character(0))
  }

  # Filter files by date pattern in filename.
  # Note: loop variable renamed from `date` to `day` to avoid shadowing
  # the base function `date()`.
  filtered_files <- character(0)
  for (day in date_filter) {
    # Format date for matching (remove dashes)
    date_pattern <- gsub("-", "", day)

    # Find files with matching date pattern
    matching_files <- all_files[grepl(date_pattern, all_files)]

    if (length(matching_files) > 0) {
      filtered_files <- c(filtered_files, matching_files)
      safe_log(paste("Found", length(matching_files), "files for date", day))
    }
  }

  # De-duplicate: a file whose name happens to match more than one date
  # pattern should only be processed once.
  return(unique(filtered_files))
}

#' Create a Chlorophyll Index (CI) from satellite imagery
#'
#' @param raster_obj A SpatRaster object with Red, Green, Blue, and NIR bands
#' @return A SpatRaster object with a CI band
#'
calculate_ci <- function(raster_obj) {
  # Validate input has required bands
  if (terra::nlyr(raster_obj) < 4) {
    stop("Raster must have at least 4 bands (Red, Green, Blue, NIR)")
  }

  # Extract bands (assuming standard order: B, G, R, NIR)
  blue_band <- raster_obj[[1]]
  green_band <- raster_obj[[2]]
  red_band <- raster_obj[[3]]
  nir_band <- raster_obj[[4]]

  # CI formula: (NIR / Red) - 1
  # This highlights chlorophyll content in vegetation
  ci_raster <- (nir_band / red_band) - 1

  # Filter extreme values that may result from division operations.
  # NOTE(review): 0/0 produces NaN, which these comparisons leave untouched;
  # such pixels remain NaN and are effectively removed by later masking.
  ci_raster[ci_raster > 10] <- 10 # Cap max value
  ci_raster[ci_raster < 0] <- 0   # Cap min value

  # Name the layer
  names(ci_raster) <- "CI"

  return(ci_raster)
}

#' Create a mask for cloudy pixels and shadows using thresholds
#'
#' Assumes surface-reflectance-scaled band values (thresholds such as 0.3
#' are reflectance fractions) -- confirm against the imagery provider.
#'
#' @param raster_obj A SpatRaster object with multiple bands
#' @return A binary mask where 1=clear pixel, 0=cloudy or shadow pixel
#'
create_cloud_mask <- function(raster_obj) {
  # Extract bands
  blue_band <- raster_obj[[1]]
  green_band <- raster_obj[[2]]
  red_band <- raster_obj[[3]]
  nir_band <- raster_obj[[4]]

  # Create initial mask (all pixels valid); arithmetic trick keeps the
  # geometry/CRS of the input band.
  mask <- blue_band * 0 + 1

  # Calculate indices used for detection
  ndvi <- (nir_band - red_band) / (nir_band + red_band)
  brightness <- (blue_band + green_band + red_band) / 3

  # CLOUD DETECTION CRITERIA
  # ------------------------
  # Clouds are typically very bright in all bands
  bright_pixels <- (blue_band > 0.3) & (green_band > 0.3) & (red_band > 0.3)

  # Snow/high reflectance clouds have high blue values
  blue_dominant <- blue_band > (red_band * 1.2)

  # Low NDVI areas that are bright are likely clouds
  low_ndvi <- ndvi < 0.1

  # Combine cloud criteria
  cloud_pixels <- bright_pixels & (blue_dominant | low_ndvi)

  # SHADOW DETECTION CRITERIA
  # ------------------------
  # Shadows typically have:
  # 1. Low overall brightness across all bands
  # 2. Lower NIR reflectance
  # 3. Can still have reasonable NDVI (if over vegetation)

  # Dark pixels in visible spectrum
  dark_pixels <- brightness < 0.1

  # Low NIR reflectance
  low_nir <- nir_band < 0.15

  # Shadows often have higher blue proportion relative to NIR
  blue_nir_ratio <- blue_band / (nir_band + 0.01) # Add small constant to avoid division by zero
  blue_enhanced <- blue_nir_ratio > 0.8

  # Combine shadow criteria
  shadow_pixels <- dark_pixels & (low_nir | blue_enhanced)

  # Update mask (0 for cloud or shadow pixels)
  mask[cloud_pixels | shadow_pixels] <- 0

  # Optional: create different values for clouds vs shadows for visualization
  # mask[cloud_pixels] <- 0  # Clouds
  # mask[shadow_pixels] <- 0 # Shadows

  return(mask)
}

#' Process satellite image, calculate CI, and crop to field boundaries
#'
#' Loads the image, computes CI, multiplies by the cloud/shadow mask
#' (masked pixels become 0), crops to the bounding box of the fields, and
#' writes the result to disk.
#'
#' @param file Path to the satellite image file
#' @param field_boundaries Field boundaries vector object
#' @param output_dir Directory to save the processed raster
#' @return Path to the processed raster file, or NULL if processing failed
#'
process_satellite_image <- function(file, field_boundaries, output_dir) {
  # Validate inputs
  if (!file.exists(file)) {
    stop(paste("File not found:", file))
  }

  if (is.null(field_boundaries)) {
    stop("Field boundaries are required but were not provided")
  }

  # Create output filename
  basename_no_ext <- tools::file_path_sans_ext(basename(file))
  output_file <- here::here(output_dir, paste0(basename_no_ext, "_CI.tif"))

  # Process with error handling: a single bad image logs an ERROR and
  # returns NULL instead of aborting the whole batch.
  tryCatch({
    # Load and prepare raster
    loaded_raster <- terra::rast(file)

    # Calculate CI
    ci_raster <- calculate_ci(loaded_raster)

    # Create cloud mask
    cloud_mask <- create_cloud_mask(loaded_raster)

    # Apply cloud mask to CI (masked pixels -> 0)
    ci_masked <- ci_raster * cloud_mask

    # Crop to field boundaries extent (for efficiency); this crops to the
    # bounding box only -- per-polygon extraction happens downstream.
    field_extent <- terra::ext(field_boundaries)
    ci_cropped <- terra::crop(ci_masked, field_extent)

    # Write output
    terra::writeRaster(ci_cropped, output_file, overwrite = TRUE)

    safe_log(paste("Successfully processed", basename(file)))
    return(output_file)
  }, error = function(e) {
    safe_log(paste("Error processing", basename(file), ":", e$message), "ERROR")
    return(NULL)
  })
}

#' Extract CI statistics for each field
#'
#' @param ci_raster A SpatRaster with CI values
#' @param field_boundaries An sf object with field polygons; must have a
#'   `field` column, and optionally a `sub_field` column
#' @return A data frame with CI statistics by field
#'
extract_ci_by_field <- function(ci_raster, field_boundaries) {
  # Validate inputs
  if (is.null(ci_raster)) {
    stop("CI raster is required but was NULL")
  }

  if (is.null(field_boundaries) || nrow(field_boundaries) == 0) {
    stop("Field boundaries are required but were empty")
  }

  # Extract statistics using exact extraction (weighted by pixel coverage
  # fraction inside each polygon)
  ci_stats <- exactextractr::exact_extract(
    ci_raster,
    field_boundaries,
    fun = c("mean", "median", "min", "max", "stdev", "count"),
    progress = FALSE
  )

  # Add field identifiers
  ci_stats$field <- field_boundaries$field

  if ("sub_field" %in% names(field_boundaries)) {
    ci_stats$sub_field <- field_boundaries$sub_field
  } else {
    # Fall back to the field id when no sub-field subdivision exists
    ci_stats$sub_field <- field_boundaries$field
  }

  # Add date info.
  # NOTE(review): this records the *processing* date, not the image
  # acquisition date -- consider parsing the date from the raster filename.
  ci_stats$date <- Sys.Date()

  # Clean up names (strip any "CI." layer prefix exact_extract may add
  # when multiple layers are present)
  names(ci_stats) <- gsub("CI\\.", "", names(ci_stats))

  return(ci_stats)
}

# -----------------------------------------
# PART 4: YIELD PREDICTION IMPLEMENTATION
# -----------------------------------------

#' Prepare data for yield prediction model
#'
#' Joins CI and harvest data, keeps the latest observation (max DOY) per
#' sub-field/season, and splits rows with known tonnage into the training
#' set and rows without into the prediction set.
#'
#' @param ci_data Data frame with cumulative CI values
#' @param harvest_data Data frame with harvest information
#' @return A list with `train` and `predict` data frames
#'
prepare_yield_prediction_data <- function(ci_data, harvest_data) {
  # Join CI and yield data
  ci_and_yield <- dplyr::left_join(ci_data, harvest_data,
                                   by = c("field", "sub_field", "season")) %>%
    dplyr::group_by(sub_field, season) %>%
    dplyr::slice(which.max(DOY)) %>% # latest observation per group
    dplyr::select(field, sub_field, tonnage_ha, cumulative_CI,
                  DOY, season, sub_area) %>%
    dplyr::mutate(CI_per_day = cumulative_CI / DOY)

  # Split into training (known yield) and prediction (unknown yield) sets
  ci_and_yield_train <- ci_and_yield %>%
    as.data.frame() %>%
    dplyr::filter(!is.na(tonnage_ha))

  prediction_yields <- ci_and_yield %>%
    as.data.frame() %>%
    dplyr::filter(is.na(tonnage_ha))

  return(list(
    train = ci_and_yield_train,
    predict = prediction_yields
  ))
}

#' Train a random forest model for yield prediction
#'
#' Uses CAST::ffs (forward feature selection) with 5-fold cross-validation
#' so only predictors that improve CV performance are kept.
#'
#' @param training_data Data frame with training data
#' @param predictors Vector of predictor variable names
#' @param response Name of the response variable
#' @return Trained model
#'
train_yield_model <- function(training_data,
                              predictors = c("cumulative_CI", "DOY", "CI_per_day"),
                              response = "tonnage_ha") {
  # Configure model training parameters
  ctrl <- caret::trainControl(
    method = "cv",
    savePredictions = TRUE,
    allowParallel = TRUE,
    number = 5,
    verboseIter = TRUE
  )

  # Train the model with forward feature selection
  set.seed(202) # For reproducibility
  model_ffs_rf <- CAST::ffs(
    training_data[, predictors],
    training_data[, response],
    method = "rf",
    trControl = ctrl,
    importance = TRUE,
    withinSE = TRUE,
    tuneLength = 5,
    na.rm = TRUE
  )

  return(model_ffs_rf)
}

#' Format predictions into a clean data frame
#'
#' @param predictions Raw prediction results (numeric vector from predict())
#' @param newdata Original data frame with field information, row-aligned
#'   with `predictions`
#' @return Formatted predictions data frame
#'
prepare_predictions <- function(predictions, newdata) {
  # The magrittr pipe passes the vector as `.`, so as.data.frame() names
  # the resulting column "." -- that is why the rename below works.
  return(predictions %>%
           as.data.frame() %>%
           dplyr::rename(predicted_Tcha = ".") %>%
           dplyr::mutate(
             sub_field = newdata$sub_field,
             field = newdata$field,
             Age_days = newdata$DOY,
             total_CI = round(newdata$cumulative_CI, 0),
             predicted_Tcha = round(predicted_Tcha, 0),
             season = newdata$season
           ) %>%
           dplyr::select(field, sub_field, Age_days, total_CI,
                         predicted_Tcha, season) %>%
           dplyr::left_join(., newdata, by = c("field", "sub_field", "season"))
  )
}

#' Predict yields for mature fields
#'
#' @param model Trained model
#' @param prediction_data Data frame with fields to predict
#' @param min_age Minimum age in days to qualify as mature (default: 300);
#'   fields must be strictly older than this to be included
#' @return Data frame with yield predictions
#'
predict_yields <- function(model, prediction_data, min_age = 300) {
  # Make predictions
  predictions <- stats::predict(model, newdata = prediction_data)

  # Format predictions and keep only mature fields
  pred_formatted <- prepare_predictions(predictions, prediction_data) %>%
    dplyr::filter(Age_days > min_age) %>%
    dplyr::mutate(CI_per_day = round(total_CI / Age_days, 1))

  return(pred_formatted)
}

# ------------------------------
# PART 5: DEMONSTRATION WORKFLOW
# ------------------------------

#' Demonstration workflow showing how to use the functions
#'
#' @param end_date The end date for processing satellite images
#' @param offset Number of days to look back
#' @param image_folder Path to the folder containing satellite images
#' @param field_boundaries_path Path to field boundaries shapefile
#' @param output_dir Path to save processed outputs
#' @param harvest_data_path Path to historical harvest data
#'
demo_workflow <- function(end_date = Sys.Date(),
                          offset = 7,
                          image_folder = "path/to/satellite/images",
                          field_boundaries_path = "path/to/field_boundaries.shp",
                          output_dir = "path/to/output",
                          harvest_data_path = "path/to/harvest_data.csv") {
  # Step 1: Generate date list for processing
  dates <- date_list(end_date, offset)
  safe_log(paste("Processing data for week", dates$week, "of", dates$year))

  # Step 2: Load field boundaries
  field_boundaries <- sf::read_sf(field_boundaries_path)
  safe_log(paste("Loaded", nrow(field_boundaries), "field boundaries"))

  # Step 3: Find satellite images for the specified date range
  image_files <- find_satellite_images(image_folder, dates$days_filter)
  safe_log(paste("Found", length(image_files), "satellite images for processing"))

  # Step 4: Process each satellite image and calculate CI,
  # skipping images that failed (NULL results)
  ci_files <- list()
  for (file in image_files) {
    ci_file <- process_satellite_image(file, field_boundaries, output_dir)
    if (!is.null(ci_file)) {
      ci_files <- c(ci_files, ci_file)
    }
  }

  # Step 5: Extract CI statistics for each field
  ci_stats_list <- list()
  for (ci_file in ci_files) {
    ci_raster <- terra::rast(ci_file)
    ci_stats <- extract_ci_by_field(ci_raster, field_boundaries)
    ci_stats_list[[basename(ci_file)]] <- ci_stats
  }

  # Combine all stats
  all_ci_stats <- dplyr::bind_rows(ci_stats_list)
  safe_log(paste("Extracted CI statistics for", nrow(all_ci_stats),
                 "field-date combinations"))

  # Step 6: Prepare for yield prediction
  if (file.exists(harvest_data_path)) {
    # Load harvest data
    harvest_data <- read.csv(harvest_data_path)
    safe_log("Loaded harvest data for yield prediction")

    # Make up cumulative_CI data for demonstration purposes.
    # In a real scenario, this would come from accumulating CI values over time.
    ci_data <- all_ci_stats %>%
      dplyr::group_by(field, sub_field) %>%
      dplyr::summarise(
        cumulative_CI = sum(mean, na.rm = TRUE),
        DOY = n(), # Days of year as the count of observations
        season = lubridate::year(max(date, na.rm = TRUE)),
        .groups = "drop"
      )

    # Prepare data for modeling
    modeling_data <- prepare_yield_prediction_data(ci_data, harvest_data)

    if (nrow(modeling_data$train) > 0) {
      # Train yield prediction model
      yield_model <- train_yield_model(modeling_data$train)
      safe_log("Trained yield prediction model")

      # Predict yields for mature fields
      yield_predictions <- predict_yields(yield_model, modeling_data$predict)
      safe_log(paste("Generated yield predictions for",
                     nrow(yield_predictions), "fields"))

      # Return results
      return(list(
        ci_stats = all_ci_stats,
        yield_predictions = yield_predictions,
        model = yield_model
      ))
    } else {
      safe_log("No training data available for yield prediction", "WARNING")
      return(list(ci_stats = all_ci_stats))
    }
  } else {
    safe_log("Harvest data not found, skipping yield prediction", "WARNING")
    return(list(ci_stats = all_ci_stats))
  }
}

# ------------------------------
# PART 6: USAGE EXAMPLE
# ------------------------------

# Uncomment and modify paths to run the demo workflow

# results <- demo_workflow(
#   end_date = "2023-10-01",
#   offset = 7,
#   image_folder = "data/satellite_images",
#   field_boundaries_path = "data/field_boundaries.shp",
#   output_dir = "output/processed",
#   harvest_data_path = "data/harvest_history.csv"
# )
#
# # Access results
# ci_stats <- results$ci_stats
# yield_predictions <- results$yield_predictions
#
# # Example: Plot CI distribution by field
# if (require(ggplot2)) {
#   ggplot(ci_stats, aes(x = field, y = mean, fill = field)) +
#     geom_boxplot() +
#     labs(title = "CI Distribution by Field",
#          x = "Field",
#          y = "Mean CI") +
#     theme_minimal() +
#     theme(axis.text.x = element_text(angle = 45, hjust = 1))
# }
#
# # Example: Plot predicted yield vs age
# if (exists("yield_predictions") && require(ggplot2)) {
#   ggplot(yield_predictions, aes(x = Age_days, y = predicted_Tcha, color = field)) +
#     geom_point(size = 3) +
#     geom_text(aes(label = field), hjust = -0.2, vjust = -0.2) +
#     labs(title = "Predicted Yield by Field Age",
#          x = "Age (Days)",
#          y = "Predicted Yield (Tonnes/ha)") +
#     theme_minimal()
# }