SmartCane/r_app/experiments/ci_extraction_and_yield_prediction.R
Timon 6efcc8cfec Fix CI report pipeline: update tmap v4 syntax, add continuous color scales, fix formatting
- Updated all CI maps to use tm_scale_continuous() for proper tmap v4 compatibility
- Added fixed color scale limits (1-8 for CI, -3 to +3 for differences) for consistent field comparison
- Fixed YAML header formatting issues in CI_report_dashboard_planet.Rmd
- Positioned RGB map before CI overview map as requested
- Removed all obsolete use_breaks parameter references
- Enhanced error handling and logging throughout the pipeline
- Added new experimental analysis scripts and improvements to mosaic creation
2025-06-19 20:37:20 +02:00

573 lines
18 KiB
R

# CI_EXTRACTION_AND_YIELD_PREDICTION.R
# =====================================
#
# This standalone script demonstrates:
# 1. How Chlorophyll Index (CI) is extracted from satellite imagery
# 2. How yield prediction is performed based on CI values
#
# Created for sharing with colleagues to illustrate the core functionality
# of the SmartCane monitoring system.
#
# -----------------------------
# PART 1: LIBRARY DEPENDENCIES
# -----------------------------
suppressPackageStartupMessages({
# Spatial data processing
library(sf)
library(terra)
library(exactextractr)
# Data manipulation
library(tidyverse)
library(lubridate)
library(here)
# Machine learning for yield prediction
library(rsample)
library(caret)
library(randomForest)
library(CAST)
})
# ----------------------------------
# PART 2: LOGGING & UTILITY FUNCTIONS
# ----------------------------------
#' Safe logging function that works in any environment
#'
#' Emits a timestamped log line as an R condition: `warning()` for
#' ERROR/WARNING levels, `message()` otherwise, so callers can capture
#' or suppress output with standard condition handlers.
#'
#' @param message The message to log
#' @param level The log level (default: "INFO")
#' @return NULL (used for side effects)
#'
safe_log <- function(message, level = "INFO") {
  stamp <- format(Sys.time(), "%Y-%m-%d %H:%M:%S")
  line <- paste0("[", stamp, "][", level, "] ", message)
  # Problem levels are surfaced as warnings so they are hard to miss
  if (level %in% c("ERROR", "WARNING")) {
    warning(line)
  } else {
    message(line)
  }
}
#' Generate a sequence of dates for processing
#'
#' Builds an inclusive date window ending at `end_date` and spanning
#' `offset` days in total, plus the ISO-ish week/year of the window start.
#'
#' @param end_date The end date for the sequence (Date object)
#' @param offset Number of days to look back from end_date
#' @return A list containing week number, year, and a sequence of dates for filtering
#'
date_list <- function(end_date, offset) {
  # Coerce the end date if a string was supplied
  if (!lubridate::is.Date(end_date)) {
    end_date <- as.Date(end_date)
    if (is.na(end_date)) {
      stop("Invalid end_date provided. Expected a Date object or a string convertible to Date.")
    }
  }
  offset <- as.numeric(offset)
  if (is.na(offset) || offset < 1) {
    stop("Invalid offset provided. Expected a positive number.")
  }
  # Subtract offset - 1 so the window includes end_date itself
  start_date <- end_date - lubridate::days(offset - 1)
  # Dates formatted as YYYY-MM-DD strings for consistent filename filtering
  days_filter <- format(seq(from = start_date, to = end_date, by = "day"), "%Y-%m-%d")
  safe_log(paste("Date range generated from", start_date, "to", end_date))
  list(
    "week" = lubridate::week(start_date),
    "year" = lubridate::year(start_date),
    "days_filter" = days_filter,
    "start_date" = start_date,
    "end_date" = end_date
  )
}
# -----------------------------
# PART 3: CI EXTRACTION PROCESS
# -----------------------------
#' Find satellite imagery files within a specific date range
#'
#' Lists all .tif files under `image_folder` (recursively) and keeps those
#' whose path contains any of the requested dates in compact YYYYMMDD form.
#'
#' @param image_folder Path to the folder containing satellite images
#' @param date_filter Vector of dates to filter by (in YYYY-MM-DD format)
#' @return Vector of file paths matching the date filter (character(0) if none)
#'
find_satellite_images <- function(image_folder, date_filter) {
  # Validate inputs
  if (!dir.exists(image_folder)) {
    stop(paste("Image folder not found:", image_folder))
  }
  # List all files in the directory
  all_files <- list.files(image_folder, pattern = "\\.tif$", full.names = TRUE, recursive = TRUE)
  if (length(all_files) == 0) {
    safe_log("No TIF files found in the specified directory", "WARNING")
    return(character(0))
  }
  # Match each date against the file list once; avoids growing a vector
  # with c() inside a loop (O(n^2) copies).
  per_date <- lapply(date_filter, function(date) {
    # Filenames embed dates without dashes (YYYYMMDD)
    date_pattern <- gsub("-", "", date)
    matching_files <- all_files[grepl(date_pattern, all_files)]
    if (length(matching_files) > 0) {
      safe_log(paste("Found", length(matching_files), "files for date", date))
    }
    matching_files
  })
  # as.character() guarantees character(0) (not NULL) when date_filter is empty
  as.character(unlist(per_date, use.names = FALSE))
}
#' Create a Chlorophyll Index (CI) from satellite imagery
#'
#' CI = (NIR / Red) - 1, clamped to [0, 10] to suppress extreme values from
#' the division (e.g. near-zero red reflectance).
#'
#' @param raster_obj A SpatRaster object with at least 4 bands,
#'   assumed band order: Blue, Green, Red, NIR
#' @return A single-layer SpatRaster named "CI"
#'
calculate_ci <- function(raster_obj) {
  # Validate input has required bands
  if (terra::nlyr(raster_obj) < 4) {
    stop("Raster must have at least 4 bands (Red, Green, Blue, NIR)")
  }
  # Only Red and NIR enter the formula; the original extracted (and never
  # used) Blue and Green as well — unused locals removed.
  red_band <- raster_obj[[3]]
  nir_band <- raster_obj[[4]]
  # CI formula: (NIR / Red) - 1
  # This highlights chlorophyll content in vegetation
  ci_raster <- (nir_band / red_band) - 1
  # Filter extreme values that may result from division operations
  ci_raster[ci_raster > 10] <- 10 # Cap max value
  ci_raster[ci_raster < 0] <- 0 # Cap min value
  # Name the layer
  names(ci_raster) <- "CI"
  return(ci_raster)
}
#' Create a mask for cloudy pixels and shadows using thresholds
#'
#' Clouds: bright in all visible bands AND (blue-dominant OR low NDVI).
#' Shadows: dark overall AND (low NIR OR blue/NIR ratio above 0.8).
#'
#' @param raster_obj A SpatRaster object with multiple bands
#'   (band order assumed Blue, Green, Red, NIR)
#' @return A binary mask where 1=clear pixel, 0=cloudy or shadow pixel
#'
create_cloud_mask <- function(raster_obj) {
  b <- raster_obj[[1]]
  g <- raster_obj[[2]]
  r <- raster_obj[[3]]
  nir <- raster_obj[[4]]

  # Indices used by both detectors
  ndvi <- (nir - r) / (nir + r)
  brightness <- (b + g + r) / 3

  # --- Cloud criteria ---
  # Clouds are very bright across the visible bands
  is_bright <- (b > 0.3) & (g > 0.3) & (r > 0.3)
  # High-reflectance clouds/snow skew toward blue
  is_blue_heavy <- b > (r * 1.2)
  # Bright but non-vegetated pixels are likely clouds
  is_low_ndvi <- ndvi < 0.1
  clouds <- is_bright & (is_blue_heavy | is_low_ndvi)

  # --- Shadow criteria ---
  # Dark in the visible spectrum
  is_dark <- brightness < 0.1
  # Suppressed NIR reflectance
  is_low_nir <- nir < 0.15
  # Shadows keep a relatively high blue/NIR ratio
  # (+0.01 guards against division by zero)
  blue_nir_ratio <- b / (nir + 0.01)
  shadows <- is_dark & (is_low_nir | (blue_nir_ratio > 0.8))

  # Start from an all-ones mask and zero out flagged pixels
  clear_mask <- b * 0 + 1
  clear_mask[clouds | shadows] <- 0
  return(clear_mask)
}
#' Process satellite image, calculate CI, and crop to field boundaries
#'
#' Loads the image, computes CI, applies the cloud/shadow mask, crops to the
#' extent of the field boundaries, and writes the result to `output_dir`.
#'
#' @param file Path to the satellite image file
#' @param field_boundaries Field boundaries vector object
#' @param output_dir Directory to save the processed raster
#' @return Path to the processed raster file, or NULL on failure
#'
process_satellite_image <- function(file, field_boundaries, output_dir) {
  # Validate inputs
  if (!file.exists(file)) {
    stop(paste("File not found:", file))
  }
  if (is.null(field_boundaries)) {
    stop("Field boundaries are required but were not provided")
  }
  # Create output filename
  basename_no_ext <- tools::file_path_sans_ext(basename(file))
  output_file <- here::here(output_dir, paste0(basename_no_ext, "_CI.tif"))
  # Ensure the output directory exists; writeRaster fails otherwise on a
  # fresh run where the directory has not been created yet.
  target_dir <- dirname(output_file)
  if (!dir.exists(target_dir)) {
    dir.create(target_dir, recursive = TRUE, showWarnings = FALSE)
  }
  # Process with error handling: a single bad image should not abort the
  # whole batch, so failures log and return NULL instead of stopping.
  tryCatch({
    # Load and prepare raster
    loaded_raster <- terra::rast(file)
    # Calculate CI
    ci_raster <- calculate_ci(loaded_raster)
    # Create cloud mask
    cloud_mask <- create_cloud_mask(loaded_raster)
    # Apply cloud mask to CI (masked pixels multiplied to 0)
    ci_masked <- ci_raster * cloud_mask
    # Crop to field boundaries extent (for efficiency)
    field_extent <- terra::ext(field_boundaries)
    ci_cropped <- terra::crop(ci_masked, field_extent)
    # Write output
    terra::writeRaster(ci_cropped, output_file, overwrite = TRUE)
    safe_log(paste("Successfully processed", basename(file)))
    return(output_file)
  }, error = function(e) {
    safe_log(paste("Error processing", basename(file), ":", e$message), "ERROR")
    return(NULL)
  })
}
#' Extract CI statistics for each field
#'
#' Uses exactextractr's coverage-weighted extraction to summarise CI values
#' per field polygon, then attaches field identifiers and today's date.
#'
#' @param ci_raster A SpatRaster with CI values
#' @param field_boundaries An sf object with field polygons
#' @return A data frame with CI statistics by field
#'
extract_ci_by_field <- function(ci_raster, field_boundaries) {
  # Validate inputs
  if (is.null(ci_raster)) {
    stop("CI raster is required but was NULL")
  }
  if (is.null(field_boundaries) || nrow(field_boundaries) == 0) {
    stop("Field boundaries are required but were empty")
  }
  # Coverage-weighted statistics, one row per polygon
  stats_df <- exactextractr::exact_extract(
    ci_raster,
    field_boundaries,
    fun = c("mean", "median", "min", "max", "stdev", "count"),
    progress = FALSE
  )
  # Attach identifiers; fall back to the field id when no sub-field exists
  stats_df$field <- field_boundaries$field
  has_sub_field <- "sub_field" %in% names(field_boundaries)
  stats_df$sub_field <- if (has_sub_field) {
    field_boundaries$sub_field
  } else {
    field_boundaries$field
  }
  # Stamp with the extraction date
  stats_df$date <- Sys.Date()
  # Strip any "CI." layer prefix from the stat column names
  names(stats_df) <- gsub("CI\\.", "", names(stats_df))
  return(stats_df)
}
# -----------------------------------------
# PART 4: YIELD PREDICTION IMPLEMENTATION
# -----------------------------------------
#' Prepare data for yield prediction model
#'
#' Joins cumulative CI with harvest records, keeps the latest observation
#' (max DOY) per sub-field/season, derives CI_per_day, and splits rows into
#' a training set (yield known) and a prediction set (yield missing).
#'
#' @param ci_data Data frame with cumulative CI values
#' @param harvest_data Data frame with harvest information
#' @return List with `train` and `predict` data frames
#'
prepare_yield_prediction_data <- function(ci_data, harvest_data) {
  joined <- ci_data %>%
    dplyr::left_join(harvest_data, by = c("field", "sub_field", "season")) %>%
    dplyr::group_by(sub_field, season) %>%
    # Latest observation per sub-field/season
    dplyr::slice(which.max(DOY)) %>%
    dplyr::select(field, sub_field, tonnage_ha, cumulative_CI, DOY, season, sub_area) %>%
    dplyr::mutate(CI_per_day = cumulative_CI / DOY) %>%
    as.data.frame()
  # Rows with a recorded yield train the model; the rest get predicted
  list(
    train = dplyr::filter(joined, !is.na(tonnage_ha)),
    predict = dplyr::filter(joined, is.na(tonnage_ha))
  )
}
#' Train a random forest model for yield prediction
#'
#' Runs CAST's forward feature selection (ffs) over the candidate predictors
#' with a 5-fold cross-validated random forest.
#'
#' @param training_data Data frame with training data
#' @param predictors Vector of predictor variable names
#' @param response Name of the response variable
#' @return Trained model (caret/CAST ffs object)
#'
train_yield_model <- function(training_data,
                              predictors = c("cumulative_CI", "DOY", "CI_per_day"),
                              response = "tonnage_ha") {
  # 5-fold CV with saved fold predictions; parallel backends allowed
  cv_control <- caret::trainControl(
    method = "cv",
    number = 5,
    savePredictions = TRUE,
    allowParallel = TRUE,
    verboseIter = TRUE
  )
  # Fixed seed so feature selection and RF tuning are reproducible
  set.seed(202)
  CAST::ffs(
    training_data[, predictors],
    training_data[, response],
    method = "rf",
    trControl = cv_control,
    importance = TRUE,
    withinSE = TRUE,
    tuneLength = 5,
    na.rm = TRUE
  )
}
#' Format predictions into a clean data frame
#'
#' Converts the raw prediction vector into a data frame, attaches field
#' metadata from `newdata` by position, rounds values, and joins the full
#' `newdata` back on for downstream use.
#'
#' NOTE(review): `dplyr::rename(predicted_Tcha = ".")` depends on the
#' magrittr `%>%` pipe — `as.data.frame(.)` deparses the placeholder, so the
#' single column is literally named ".". Replacing `%>%` with the native
#' `|>` pipe would break this rename; confirm before modernizing.
#'
#' @param predictions Raw prediction results (numeric vector from predict())
#' @param newdata Original data frame with field information; assumed to be
#'   row-aligned with `predictions` — TODO confirm against predict_yields()
#' @return Formatted predictions data frame
#'
prepare_predictions <- function(predictions, newdata) {
  return(predictions %>%
    as.data.frame() %>%
    dplyr::rename(predicted_Tcha = ".") %>%
    dplyr::mutate(
      # Positional alignment with newdata rows (same order as predictions)
      sub_field = newdata$sub_field,
      field = newdata$field,
      Age_days = newdata$DOY,
      total_CI = round(newdata$cumulative_CI, 0),
      predicted_Tcha = round(predicted_Tcha, 0),
      season = newdata$season
    ) %>%
    dplyr::select(field, sub_field, Age_days, total_CI, predicted_Tcha, season) %>%
    # Re-attach the remaining newdata columns (e.g. cumulative_CI, sub_area)
    dplyr::left_join(., newdata, by = c("field", "sub_field", "season"))
  )
}
#' Predict yields for mature fields
#'
#' Runs the model over `prediction_data`, formats the output, and keeps only
#' fields older than `min_age` days (i.e. mature enough to predict).
#'
#' @param model Trained model
#' @param prediction_data Data frame with fields to predict
#' @param min_age Minimum age in days to qualify as mature (default: 300)
#' @return Data frame with yield predictions
#'
predict_yields <- function(model, prediction_data, min_age = 300) {
  raw_predictions <- stats::predict(model, newdata = prediction_data)
  # Format, keep mature fields only, and derive daily CI accumulation
  prepare_predictions(raw_predictions, prediction_data) %>%
    dplyr::filter(Age_days > min_age) %>%
    dplyr::mutate(CI_per_day = round(total_CI / Age_days, 1))
}
# ------------------------------
# PART 5: DEMONSTRATION WORKFLOW
# ------------------------------
#' Demonstration workflow showing how to use the functions
#'
#' End-to-end pipeline: build a date window, load field boundaries, find and
#' process satellite images into CI rasters, extract per-field statistics,
#' and (when harvest data is available) train a model and predict yields.
#'
#' @param end_date The end date for processing satellite images
#' @param offset Number of days to look back
#' @param image_folder Path to the folder containing satellite images
#' @param field_boundaries_path Path to field boundaries shapefile
#' @param output_dir Path to save processed outputs
#' @param harvest_data_path Path to historical harvest data
#' @return List with `ci_stats`, and when modeling succeeds also
#'   `yield_predictions` and `model`
#'
demo_workflow <- function(end_date = Sys.Date(), offset = 7,
                          image_folder = "path/to/satellite/images",
                          field_boundaries_path = "path/to/field_boundaries.shp",
                          output_dir = "path/to/output",
                          harvest_data_path = "path/to/harvest_data.csv") {
  # Step 1: Generate date list for processing
  dates <- date_list(end_date, offset)
  safe_log(paste("Processing data for week", dates$week, "of", dates$year))
  # Step 2: Load field boundaries
  field_boundaries <- sf::read_sf(field_boundaries_path)
  safe_log(paste("Loaded", nrow(field_boundaries), "field boundaries"))
  # Step 3: Find satellite images for the specified date range
  image_files <- find_satellite_images(image_folder, dates$days_filter)
  safe_log(paste("Found", length(image_files), "satellite images for processing"))
  # Step 4: Process each image; drop failures (NULL) — avoids growing a
  # list with c() inside a loop
  ci_files <- Filter(
    Negate(is.null),
    lapply(image_files, function(f) process_satellite_image(f, field_boundaries, output_dir))
  )
  # Step 5: Extract CI statistics for each processed raster
  ci_stats_list <- lapply(ci_files, function(ci_file) {
    extract_ci_by_field(terra::rast(ci_file), field_boundaries)
  })
  names(ci_stats_list) <- vapply(ci_files, basename, character(1))
  # Combine all stats
  all_ci_stats <- dplyr::bind_rows(ci_stats_list)
  safe_log(paste("Extracted CI statistics for", nrow(all_ci_stats), "field-date combinations"))
  # Step 6: Yield prediction — early returns replace the original deep nesting
  if (!file.exists(harvest_data_path)) {
    safe_log("Harvest data not found, skipping yield prediction", "WARNING")
    return(list(ci_stats = all_ci_stats))
  }
  harvest_data <- read.csv(harvest_data_path)
  safe_log("Loaded harvest data for yield prediction")
  # Make up cumulative_CI data for demonstration purposes
  # In a real scenario, this would come from accumulating CI values over time
  ci_data <- all_ci_stats %>%
    dplyr::group_by(field, sub_field) %>%
    dplyr::summarise(
      cumulative_CI = sum(mean, na.rm = TRUE),
      DOY = n(), # Days of year as the count of observations
      season = lubridate::year(max(date, na.rm = TRUE)),
      .groups = "drop"
    )
  # Prepare data for modeling
  modeling_data <- prepare_yield_prediction_data(ci_data, harvest_data)
  if (nrow(modeling_data$train) == 0) {
    safe_log("No training data available for yield prediction", "WARNING")
    return(list(ci_stats = all_ci_stats))
  }
  # Train yield prediction model
  yield_model <- train_yield_model(modeling_data$train)
  safe_log("Trained yield prediction model")
  # Guard: predict() errors on an empty data frame, so return early when
  # there are no fields awaiting prediction
  if (nrow(modeling_data$predict) == 0) {
    safe_log("No fields awaiting yield prediction", "WARNING")
    return(list(ci_stats = all_ci_stats, model = yield_model))
  }
  # Predict yields for mature fields
  yield_predictions <- predict_yields(yield_model, modeling_data$predict)
  safe_log(paste("Generated yield predictions for", nrow(yield_predictions), "fields"))
  list(
    ci_stats = all_ci_stats,
    yield_predictions = yield_predictions,
    model = yield_model
  )
}
# ------------------------------
# PART 6: USAGE EXAMPLE
# ------------------------------
# Uncomment and modify paths to run the demo workflow
# results <- demo_workflow(
# end_date = "2023-10-01",
# offset = 7,
# image_folder = "data/satellite_images",
# field_boundaries_path = "data/field_boundaries.shp",
# output_dir = "output/processed",
# harvest_data_path = "data/harvest_history.csv"
# )
#
# # Access results
# ci_stats <- results$ci_stats
# yield_predictions <- results$yield_predictions
#
# # Example: Plot CI distribution by field
# if (require(ggplot2)) {
# ggplot(ci_stats, aes(x = field, y = mean, fill = field)) +
# geom_boxplot() +
# labs(title = "CI Distribution by Field",
# x = "Field",
# y = "Mean CI") +
# theme_minimal() +
# theme(axis.text.x = element_text(angle = 45, hjust = 1))
# }
#
# # Example: Plot predicted yield vs age
# if (exists("yield_predictions") && require(ggplot2)) {
# ggplot(yield_predictions, aes(x = Age_days, y = predicted_Tcha, color = field)) +
# geom_point(size = 3) +
# geom_text(aes(label = field), hjust = -0.2, vjust = -0.2) +
# labs(title = "Predicted Yield by Field Age",
# x = "Age (Days)",
# y = "Predicted Yield (Tonnes/ha)") +
# theme_minimal()
# }