574 lines
18 KiB
R
574 lines
18 KiB
R
# CI_EXTRACTION_AND_YIELD_PREDICTION.R
|
|
# =====================================
|
|
#
|
|
# This standalone script demonstrates:
|
|
# 1. How Chlorophyll Index (CI) is extracted from satellite imagery
|
|
# 2. How yield prediction is performed based on CI values
|
|
#
|
|
# Created for sharing with colleagues to illustrate the core functionality
|
|
# of the SmartCane monitoring system.
|
|
#
|
|
|
|
# -----------------------------
|
|
# PART 1: LIBRARY DEPENDENCIES
|
|
# -----------------------------
|
|
|
|
suppressPackageStartupMessages({
|
|
# Spatial data processing
|
|
library(sf)
|
|
library(terra)
|
|
library(exactextractr)
|
|
|
|
# Data manipulation
|
|
library(tidyverse)
|
|
library(lubridate)
|
|
library(here)
|
|
|
|
# Machine learning for yield prediction
|
|
library(rsample)
|
|
library(caret)
|
|
library(randomForest)
|
|
library(CAST)
|
|
})
|
|
|
|
# ----------------------------------
|
|
# PART 2: LOGGING & UTILITY FUNCTIONS
|
|
# ----------------------------------
|
|
|
|
#' Safe logging function that works in any environment
|
|
#'
|
|
#' @param message The message to log
|
|
#' @param level The log level (default: "INFO")
|
|
#' @return NULL (used for side effects)
|
|
#'
|
|
safe_log <- function(message, level = "INFO") {
|
|
timestamp <- format(Sys.time(), "%Y-%m-%d %H:%M:%S")
|
|
formatted_msg <- paste0("[", timestamp, "][", level, "] ", message)
|
|
|
|
if (level %in% c("ERROR", "WARNING")) {
|
|
warning(formatted_msg)
|
|
} else {
|
|
message(formatted_msg)
|
|
}
|
|
}
|
|
|
|
#' Generate a sequence of dates for processing
|
|
#'
|
|
#' @param end_date The end date for the sequence (Date object)
|
|
#' @param offset Number of days to look back from end_date
|
|
#' @return A list containing week number, year, and a sequence of dates for filtering
|
|
#'
|
|
date_list <- function(end_date, offset) {
|
|
# Input validation
|
|
if (!lubridate::is.Date(end_date)) {
|
|
end_date <- as.Date(end_date)
|
|
if (is.na(end_date)) {
|
|
stop("Invalid end_date provided. Expected a Date object or a string convertible to Date.")
|
|
}
|
|
}
|
|
|
|
offset <- as.numeric(offset)
|
|
if (is.na(offset) || offset < 1) {
|
|
stop("Invalid offset provided. Expected a positive number.")
|
|
}
|
|
|
|
# Calculate date range
|
|
offset <- offset - 1 # Adjust offset to include end_date
|
|
start_date <- end_date - lubridate::days(offset)
|
|
|
|
# Extract week and year information
|
|
week <- lubridate::week(start_date)
|
|
year <- lubridate::year(start_date)
|
|
|
|
# Generate sequence of dates
|
|
days_filter <- seq(from = start_date, to = end_date, by = "day")
|
|
days_filter <- format(days_filter, "%Y-%m-%d") # Format for consistent filtering
|
|
|
|
# Log the date range
|
|
safe_log(paste("Date range generated from", start_date, "to", end_date))
|
|
|
|
return(list(
|
|
"week" = week,
|
|
"year" = year,
|
|
"days_filter" = days_filter,
|
|
"start_date" = start_date,
|
|
"end_date" = end_date
|
|
))
|
|
}
|
|
|
|
# -----------------------------
|
|
# PART 3: CI EXTRACTION PROCESS
|
|
# -----------------------------
|
|
|
|
#' Find satellite imagery files within a specific date range
|
|
#'
|
|
#' @param image_folder Path to the folder containing satellite images
|
|
#' @param date_filter Vector of dates to filter by (in YYYY-MM-DD format)
|
|
#' @return Vector of file paths matching the date filter
|
|
#'
|
|
find_satellite_images <- function(image_folder, date_filter) {
|
|
# Validate inputs
|
|
if (!dir.exists(image_folder)) {
|
|
stop(paste("Image folder not found:", image_folder))
|
|
}
|
|
|
|
# List all files in the directory
|
|
all_files <- list.files(image_folder, pattern = "\\.tif$", full.names = TRUE, recursive = TRUE)
|
|
|
|
if (length(all_files) == 0) {
|
|
safe_log("No TIF files found in the specified directory", "WARNING")
|
|
return(character(0))
|
|
}
|
|
|
|
# Filter files by date pattern in filename
|
|
filtered_files <- character(0)
|
|
|
|
for (date in date_filter) {
|
|
# Format date for matching (remove dashes)
|
|
date_pattern <- gsub("-", "", date)
|
|
|
|
# Find files with matching date pattern
|
|
matching_files <- all_files[grepl(date_pattern, all_files)]
|
|
|
|
if (length(matching_files) > 0) {
|
|
filtered_files <- c(filtered_files, matching_files)
|
|
safe_log(paste("Found", length(matching_files), "files for date", date))
|
|
}
|
|
}
|
|
|
|
return(filtered_files)
|
|
}
|
|
|
|
#' Create a Chlorophyll Index (CI) from satellite imagery
|
|
#'
|
|
#' @param raster_obj A SpatRaster object with Red, Green, Blue, and NIR bands
|
|
#' @return A SpatRaster object with a CI band
|
|
#'
|
|
calculate_ci <- function(raster_obj) {
|
|
# Validate input has required bands
|
|
if (terra::nlyr(raster_obj) < 4) {
|
|
stop("Raster must have at least 4 bands (Red, Green, Blue, NIR)")
|
|
}
|
|
|
|
# Extract bands (assuming standard order: B, G, R, NIR)
|
|
blue_band <- raster_obj[[1]]
|
|
green_band <- raster_obj[[2]]
|
|
red_band <- raster_obj[[3]]
|
|
nir_band <- raster_obj[[4]]
|
|
|
|
# CI formula: (NIR / Green) - 1, NOT (NIR / Red) - 1
|
|
# *** CRITICAL: Use GREEN band for Chlorophyll Index, NOT RED ***
|
|
# GREEN band is essential for proper chlorophyll-sensitive calculation
|
|
ci_raster <- (nir_band / green_band) - 1
|
|
|
|
# Filter extreme values that may result from division operations
|
|
ci_raster[ci_raster > 10] <- 10 # Cap max value
|
|
ci_raster[ci_raster < 0] <- 0 # Cap min value
|
|
|
|
# Name the layer
|
|
names(ci_raster) <- "CI"
|
|
|
|
return(ci_raster)
|
|
}
|
|
|
|
#' Create a mask for cloudy pixels and shadows using thresholds
|
|
#'
|
|
#' @param raster_obj A SpatRaster object with multiple bands
|
|
#' @return A binary mask where 1=clear pixel, 0=cloudy or shadow pixel
|
|
#'
|
|
create_cloud_mask <- function(raster_obj) {
|
|
# Extract bands
|
|
blue_band <- raster_obj[[1]]
|
|
green_band <- raster_obj[[2]]
|
|
red_band <- raster_obj[[3]]
|
|
nir_band <- raster_obj[[4]]
|
|
|
|
# Create initial mask (all pixels valid)
|
|
mask <- blue_band * 0 + 1
|
|
|
|
# Calculate indices used for detection
|
|
ndvi <- (nir_band - red_band) / (nir_band + red_band)
|
|
brightness <- (blue_band + green_band + red_band) / 3
|
|
|
|
# CLOUD DETECTION CRITERIA
|
|
# ------------------------
|
|
# Clouds are typically very bright in all bands
|
|
bright_pixels <- (blue_band > 0.3) & (green_band > 0.3) & (red_band > 0.3)
|
|
|
|
# Snow/high reflectance clouds have high blue values
|
|
blue_dominant <- blue_band > (red_band * 1.2)
|
|
|
|
# Low NDVI areas that are bright are likely clouds
|
|
low_ndvi <- ndvi < 0.1
|
|
|
|
# Combine cloud criteria
|
|
cloud_pixels <- bright_pixels & (blue_dominant | low_ndvi)
|
|
|
|
# SHADOW DETECTION CRITERIA
|
|
# ------------------------
|
|
# Shadows typically have:
|
|
# 1. Low overall brightness across all bands
|
|
# 2. Lower NIR reflectance
|
|
# 3. Can still have reasonable NDVI (if over vegetation)
|
|
|
|
# Dark pixels in visible spectrum
|
|
dark_pixels <- brightness < 0.1
|
|
|
|
# Low NIR reflectance
|
|
low_nir <- nir_band < 0.15
|
|
|
|
# Shadows often have higher blue proportion relative to NIR
|
|
blue_nir_ratio <- blue_band / (nir_band + 0.01) # Add small constant to avoid division by zero
|
|
blue_enhanced <- blue_nir_ratio > 0.8
|
|
|
|
# Combine shadow criteria
|
|
shadow_pixels <- dark_pixels & (low_nir | blue_enhanced)
|
|
|
|
# Update mask (0 for cloud or shadow pixels)
|
|
mask[cloud_pixels | shadow_pixels] <- 0
|
|
|
|
# Optional: create different values for clouds vs shadows for visualization
|
|
# mask[cloud_pixels] <- 0 # Clouds
|
|
# mask[shadow_pixels] <- 0 # Shadows
|
|
|
|
return(mask)
|
|
}
|
|
|
|
#' Process satellite image, calculate CI, and crop to field boundaries
|
|
#'
|
|
#' @param file Path to the satellite image file
|
|
#' @param field_boundaries Field boundaries vector object
|
|
#' @param output_dir Directory to save the processed raster
|
|
#' @return Path to the processed raster file
|
|
#'
|
|
process_satellite_image <- function(file, field_boundaries, output_dir) {
|
|
# Validate inputs
|
|
if (!file.exists(file)) {
|
|
stop(paste("File not found:", file))
|
|
}
|
|
|
|
if (is.null(field_boundaries)) {
|
|
stop("Field boundaries are required but were not provided")
|
|
}
|
|
|
|
# Create output filename
|
|
basename_no_ext <- tools::file_path_sans_ext(basename(file))
|
|
output_file <- here::here(output_dir, paste0(basename_no_ext, "_CI.tif"))
|
|
|
|
# Process with error handling
|
|
tryCatch({
|
|
# Load and prepare raster
|
|
loaded_raster <- terra::rast(file)
|
|
|
|
# Calculate CI
|
|
ci_raster <- calculate_ci(loaded_raster)
|
|
|
|
# Create cloud mask
|
|
cloud_mask <- create_cloud_mask(loaded_raster)
|
|
|
|
# Apply cloud mask to CI
|
|
ci_masked <- ci_raster * cloud_mask
|
|
|
|
# Crop to field boundaries extent (for efficiency)
|
|
field_extent <- terra::ext(field_boundaries)
|
|
ci_cropped <- terra::crop(ci_masked, field_extent)
|
|
|
|
# Write output
|
|
terra::writeRaster(ci_cropped, output_file, overwrite = TRUE)
|
|
|
|
safe_log(paste("Successfully processed", basename(file)))
|
|
|
|
return(output_file)
|
|
|
|
}, error = function(e) {
|
|
safe_log(paste("Error processing", basename(file), ":", e$message), "ERROR")
|
|
return(NULL)
|
|
})
|
|
}
|
|
|
|
#' Extract CI statistics for each field
|
|
#'
|
|
#' @param ci_raster A SpatRaster with CI values
|
|
#' @param field_boundaries An sf object with field polygons
|
|
#' @return A data frame with CI statistics by field
|
|
#'
|
|
extract_ci_by_field <- function(ci_raster, field_boundaries) {
|
|
# Validate inputs
|
|
if (is.null(ci_raster)) {
|
|
stop("CI raster is required but was NULL")
|
|
}
|
|
|
|
if (is.null(field_boundaries) || nrow(field_boundaries) == 0) {
|
|
stop("Field boundaries are required but were empty")
|
|
}
|
|
|
|
# Extract statistics using exact extraction (weighted by coverage)
|
|
ci_stats <- exactextractr::exact_extract(
|
|
ci_raster,
|
|
field_boundaries,
|
|
fun = c("mean", "median", "min", "max", "stdev", "count"),
|
|
progress = FALSE
|
|
)
|
|
|
|
# Add field identifiers
|
|
ci_stats$field <- field_boundaries$field
|
|
if ("sub_field" %in% names(field_boundaries)) {
|
|
ci_stats$sub_field <- field_boundaries$sub_field
|
|
} else {
|
|
ci_stats$sub_field <- field_boundaries$field
|
|
}
|
|
|
|
# Add date info
|
|
ci_stats$date <- Sys.Date()
|
|
|
|
# Clean up names
|
|
names(ci_stats) <- gsub("CI\\.", "", names(ci_stats))
|
|
|
|
return(ci_stats)
|
|
}
|
|
|
|
# -----------------------------------------
|
|
# PART 4: YIELD PREDICTION IMPLEMENTATION
|
|
# -----------------------------------------
|
|
|
|
#' Prepare data for yield prediction model
|
|
#'
|
|
#' @param ci_data Data frame with cumulative CI values
|
|
#' @param harvest_data Data frame with harvest information
|
|
#' @return Data frame ready for modeling
|
|
#'
|
|
prepare_yield_prediction_data <- function(ci_data, harvest_data) {
|
|
# Join CI and yield data
|
|
ci_and_yield <- dplyr::left_join(ci_data, harvest_data, by = c("field", "sub_field", "season")) %>%
|
|
dplyr::group_by(sub_field, season) %>%
|
|
dplyr::slice(which.max(DOY)) %>%
|
|
dplyr::select(field, sub_field, tonnage_ha, cumulative_CI, DOY, season, sub_area) %>%
|
|
dplyr::mutate(CI_per_day = cumulative_CI / DOY)
|
|
|
|
# Split into training and prediction sets
|
|
ci_and_yield_train <- ci_and_yield %>%
|
|
as.data.frame() %>%
|
|
dplyr::filter(!is.na(tonnage_ha))
|
|
|
|
prediction_yields <- ci_and_yield %>%
|
|
as.data.frame() %>%
|
|
dplyr::filter(is.na(tonnage_ha))
|
|
|
|
return(list(
|
|
train = ci_and_yield_train,
|
|
predict = prediction_yields
|
|
))
|
|
}
|
|
|
|
#' Train a random forest model for yield prediction
|
|
#'
|
|
#' @param training_data Data frame with training data
|
|
#' @param predictors Vector of predictor variable names
|
|
#' @param response Name of the response variable
|
|
#' @return Trained model
|
|
#'
|
|
train_yield_model <- function(training_data, predictors = c("cumulative_CI", "DOY", "CI_per_day"), response = "tonnage_ha") {
|
|
# Configure model training parameters
|
|
ctrl <- caret::trainControl(
|
|
method = "cv",
|
|
savePredictions = TRUE,
|
|
allowParallel = TRUE,
|
|
number = 5,
|
|
verboseIter = TRUE
|
|
)
|
|
|
|
# Train the model with feature selection
|
|
set.seed(202) # For reproducibility
|
|
model_ffs_rf <- CAST::ffs(
|
|
training_data[, predictors],
|
|
training_data[, response],
|
|
method = "rf",
|
|
trControl = ctrl,
|
|
importance = TRUE,
|
|
withinSE = TRUE,
|
|
tuneLength = 5,
|
|
na.rm = TRUE
|
|
)
|
|
|
|
return(model_ffs_rf)
|
|
}
|
|
|
|
#' Format predictions into a clean data frame
|
|
#'
|
|
#' @param predictions Raw prediction results
|
|
#' @param newdata Original data frame with field information
|
|
#' @return Formatted predictions data frame
|
|
#'
|
|
prepare_predictions <- function(predictions, newdata) {
|
|
return(predictions %>%
|
|
as.data.frame() %>%
|
|
dplyr::rename(predicted_Tcha = ".") %>%
|
|
dplyr::mutate(
|
|
sub_field = newdata$sub_field,
|
|
field = newdata$field,
|
|
Age_days = newdata$DOY,
|
|
total_CI = round(newdata$cumulative_CI, 0),
|
|
predicted_Tcha = round(predicted_Tcha, 0),
|
|
season = newdata$season
|
|
) %>%
|
|
dplyr::select(field, sub_field, Age_days, total_CI, predicted_Tcha, season) %>%
|
|
dplyr::left_join(., newdata, by = c("field", "sub_field", "season"))
|
|
)
|
|
}
|
|
|
|
#' Predict yields for mature fields
|
|
#'
|
|
#' @param model Trained model
|
|
#' @param prediction_data Data frame with fields to predict
|
|
#' @param min_age Minimum age in days to qualify as mature (default: 300)
|
|
#' @return Data frame with yield predictions
|
|
#'
|
|
predict_yields <- function(model, prediction_data, min_age = 300) {
|
|
# Make predictions
|
|
predictions <- stats::predict(model, newdata = prediction_data)
|
|
|
|
# Format predictions
|
|
pred_formatted <- prepare_predictions(predictions, prediction_data) %>%
|
|
dplyr::filter(Age_days > min_age) %>%
|
|
dplyr::mutate(CI_per_day = round(total_CI / Age_days, 1))
|
|
|
|
return(pred_formatted)
|
|
}
|
|
|
|
# ------------------------------
|
|
# PART 5: DEMONSTRATION WORKFLOW
|
|
# ------------------------------
|
|
|
|
#' Demonstration workflow showing how to use the functions
|
|
#'
|
|
#' @param end_date The end date for processing satellite images
|
|
#' @param offset Number of days to look back
|
|
#' @param image_folder Path to the folder containing satellite images
|
|
#' @param field_boundaries_path Path to field boundaries shapefile
|
|
#' @param output_dir Path to save processed outputs
|
|
#' @param harvest_data_path Path to historical harvest data
|
|
#'
|
|
demo_workflow <- function(end_date = Sys.Date(), offset = 7,
|
|
image_folder = "path/to/satellite/images",
|
|
field_boundaries_path = "path/to/field_boundaries.shp",
|
|
output_dir = "path/to/output",
|
|
harvest_data_path = "path/to/harvest_data.csv") {
|
|
|
|
# Step 1: Generate date list for processing
|
|
dates <- date_list(end_date, offset)
|
|
safe_log(paste("Processing data for week", dates$week, "of", dates$year))
|
|
|
|
# Step 2: Load field boundaries
|
|
field_boundaries <- sf::read_sf(field_boundaries_path)
|
|
safe_log(paste("Loaded", nrow(field_boundaries), "field boundaries"))
|
|
|
|
# Step 3: Find satellite images for the specified date range
|
|
image_files <- find_satellite_images(image_folder, dates$days_filter)
|
|
safe_log(paste("Found", length(image_files), "satellite images for processing"))
|
|
|
|
# Step 4: Process each satellite image and calculate CI
|
|
ci_files <- list()
|
|
for (file in image_files) {
|
|
ci_file <- process_satellite_image(file, field_boundaries, output_dir)
|
|
if (!is.null(ci_file)) {
|
|
ci_files <- c(ci_files, ci_file)
|
|
}
|
|
}
|
|
|
|
# Step 5: Extract CI statistics for each field
|
|
ci_stats_list <- list()
|
|
for (ci_file in ci_files) {
|
|
ci_raster <- terra::rast(ci_file)
|
|
ci_stats <- extract_ci_by_field(ci_raster, field_boundaries)
|
|
ci_stats_list[[basename(ci_file)]] <- ci_stats
|
|
}
|
|
|
|
# Combine all stats
|
|
all_ci_stats <- dplyr::bind_rows(ci_stats_list)
|
|
safe_log(paste("Extracted CI statistics for", nrow(all_ci_stats), "field-date combinations"))
|
|
|
|
# Step 6: Prepare for yield prediction
|
|
if (file.exists(harvest_data_path)) {
|
|
# Load harvest data
|
|
harvest_data <- read.csv(harvest_data_path)
|
|
safe_log("Loaded harvest data for yield prediction")
|
|
|
|
# Make up cumulative_CI data for demonstration purposes
|
|
# In a real scenario, this would come from accumulating CI values over time
|
|
ci_data <- all_ci_stats %>%
|
|
dplyr::group_by(field, sub_field) %>%
|
|
dplyr::summarise(
|
|
cumulative_CI = sum(mean, na.rm = TRUE),
|
|
DOY = n(), # Days of year as the count of observations
|
|
season = lubridate::year(max(date, na.rm = TRUE)),
|
|
.groups = "drop"
|
|
)
|
|
|
|
# Prepare data for modeling
|
|
modeling_data <- prepare_yield_prediction_data(ci_data, harvest_data)
|
|
|
|
if (nrow(modeling_data$train) > 0) {
|
|
# Train yield prediction model
|
|
yield_model <- train_yield_model(modeling_data$train)
|
|
safe_log("Trained yield prediction model")
|
|
|
|
# Predict yields for mature fields
|
|
yield_predictions <- predict_yields(yield_model, modeling_data$predict)
|
|
safe_log(paste("Generated yield predictions for", nrow(yield_predictions), "fields"))
|
|
|
|
# Return results
|
|
return(list(
|
|
ci_stats = all_ci_stats,
|
|
yield_predictions = yield_predictions,
|
|
model = yield_model
|
|
))
|
|
} else {
|
|
safe_log("No training data available for yield prediction", "WARNING")
|
|
return(list(ci_stats = all_ci_stats))
|
|
}
|
|
} else {
|
|
safe_log("Harvest data not found, skipping yield prediction", "WARNING")
|
|
return(list(ci_stats = all_ci_stats))
|
|
}
|
|
}
|
|
|
|
# ------------------------------
|
|
# PART 6: USAGE EXAMPLE
|
|
# ------------------------------
|
|
|
|
# Uncomment and modify paths to run the demo workflow
|
|
# results <- demo_workflow(
|
|
# end_date = "2023-10-01",
|
|
# offset = 7,
|
|
# image_folder = "data/satellite_images",
|
|
# field_boundaries_path = "data/field_boundaries.shp",
|
|
# output_dir = "output/processed",
|
|
# harvest_data_path = "data/harvest_history.csv"
|
|
# )
|
|
#
|
|
# # Access results
|
|
# ci_stats <- results$ci_stats
|
|
# yield_predictions <- results$yield_predictions
|
|
#
|
|
# # Example: Plot CI distribution by field
|
|
# if (require(ggplot2)) {
|
|
# ggplot(ci_stats, aes(x = field, y = mean, fill = field)) +
|
|
# geom_boxplot() +
|
|
# labs(title = "CI Distribution by Field",
|
|
# x = "Field",
|
|
# y = "Mean CI") +
|
|
# theme_minimal() +
|
|
# theme(axis.text.x = element_text(angle = 45, hjust = 1))
|
|
# }
|
|
#
|
|
# # Example: Plot predicted yield vs age
|
|
# if (exists("yield_predictions") && require(ggplot2)) {
|
|
# ggplot(yield_predictions, aes(x = Age_days, y = predicted_Tcha, color = field)) +
|
|
# geom_point(size = 3) +
|
|
# geom_text(aes(label = field), hjust = -0.2, vjust = -0.2) +
|
|
# labs(title = "Predicted Yield by Field Age",
|
|
# x = "Age (Days)",
|
|
# y = "Predicted Yield (Tonnes/ha)") +
|
|
# theme_minimal()
|
|
# }
|