SmartCane/r_app/ci_extraction_utils.R
2026-01-06 14:17:37 +01:00

621 lines
21 KiB
R

# CI_EXTRACTION_UTILS.R
# =====================
# Utility functions for the SmartCane CI (Chlorophill Index) extraction workflow.
# These functions support date handling, raster processing, and data extraction.
#' Safe logging function that works whether log_message exists or not
#'
#' @param message The message to log
#' @param level The log level (default: "INFO")
#' @return NULL (used for side effects)
#'
safe_log <- function(message, level = "INFO") {
if (exists("log_message")) {
log_message(message, level)
} else {
if (level %in% c("ERROR", "WARNING")) {
warning(message)
} else {
message(message)
}
}
}
#' Generate a sequence of dates for processing
#'
#' @param end_date The end date for the sequence (Date object)
#' @param offset Number of days to look back from end_date
#' @return A list containing week number, year, and a sequence of dates for filtering
#'
date_list <- function(end_date, offset) {
# Input validation
if (!lubridate::is.Date(end_date)) {
end_date <- as.Date(end_date)
if (is.na(end_date)) {
stop("Invalid end_date provided. Expected a Date object or a string convertible to Date.")
}
}
offset <- as.numeric(offset)
if (is.na(offset) || offset < 1) {
stop("Invalid offset provided. Expected a positive number.")
}
# Calculate date range
offset <- offset - 1 # Adjust offset to include end_date
start_date <- end_date - lubridate::days(offset)
# Extract week and year information
week <- lubridate::week(start_date)
year <- lubridate::year(start_date)
# Generate sequence of dates
days_filter <- seq(from = start_date, to = end_date, by = "day")
days_filter <- format(days_filter, "%Y-%m-%d") # Format for consistent filtering
# Log the date range
safe_log(paste("Date range generated from", start_date, "to", end_date))
return(list(
"week" = week,
"year" = year,
"days_filter" = days_filter,
"start_date" = start_date,
"end_date" = end_date
))
}
#' Detect band count and structure (4-band vs 8-band with optional UDM)
#'
#' @param loaded_raster Loaded raster object
#' @return List with structure info: $type (4b or 8b), $has_udm (logical), $band_names
#'
detect_raster_structure <- function(loaded_raster) {
n_bands <- terra::nlyr(loaded_raster)
# Determine raster type and structure
if (n_bands == 4) {
return(list(
type = "4b",
has_udm = FALSE,
band_names = c("Red", "Green", "Blue", "NIR"),
red_idx = 1, green_idx = 2, blue_idx = 3, nir_idx = 4, udm_idx = NA
))
} else if (n_bands %in% c(8, 9)) {
# PlanetScope 8-band structure:
# 1=Coastal Blue, 2=Blue, 3=Green I, 4=Green, 5=Yellow, 6=Red, 7=Red Edge, 8=NIR
# 9-band: includes UDM1 (Usable Data Mask) as final band
has_udm <- n_bands == 9
return(list(
type = "8b",
has_udm = has_udm,
band_names = if (has_udm) {
c("CoastalBlue", "Blue", "GreenI", "Green", "Yellow", "Red", "RedEdge", "NIR", "UDM1")
} else {
c("CoastalBlue", "Blue", "GreenI", "Green", "Yellow", "Red", "RedEdge", "NIR")
},
red_idx = 6, green_idx = 4, blue_idx = 2, nir_idx = 8, udm_idx = if (has_udm) 9 else NA
))
} else {
stop(paste("Unexpected number of bands:", n_bands,
"Expected 4-band, 8-band, or 9-band (8-band + UDM) data"))
}
}
#' Apply cloud masking for 8-band data with UDM layer
#'
#' @param loaded_raster Raster with UDM band
#' @param udm_idx Index of the UDM band
#' @return Raster with cloud-masked pixels set to NA
#'
apply_udm_masking <- function(loaded_raster, udm_idx) {
if (is.na(udm_idx)) {
return(loaded_raster)
}
# Extract UDM band (0 = clear sky, 1 = shadow, 2 = cloud, 3 = snow, 4 = water)
# We only mask pixels where UDM = 2 (clouds)
udm_band <- loaded_raster[[udm_idx]]
# Create mask where UDM == 0 (clear/valid pixels only)
cloud_mask <- udm_band == 0
# Apply mask to all bands except UDM itself
for (i in 1:(terra::nlyr(loaded_raster) - 1)) {
loaded_raster[[i]][!cloud_mask] <- NA
}
safe_log(paste("Applied UDM cloud masking to raster (masking non-clear pixels)"))
return(loaded_raster)
}
#' Create a Chlorophill Index (CI) mask from satellite imagery and crop to field boundaries
#'
#' Supports both 4-band and 8-band (with optional UDM) Planet Scope data.
#' For 8-band data, automatically applies cloud masking using the UDM layer if present.
#'
#' @param file Path to the satellite image file
#' @param field_boundaries Field boundaries vector object
#' @param merged_final_dir Directory to save the processed raster
#' @return Processed raster object with CI band
#'
create_mask_and_crop <- function(file, field_boundaries, merged_final_dir) {
# Validate inputs
if (!file.exists(file)) {
stop(paste("File not found:", file))
}
if (is.null(field_boundaries)) {
stop("Field boundaries are required but were not provided")
}
# CRITICAL: Convert field_boundaries to terra if it's an sf object
# This ensures all subsequent terra operations work correctly
# But if it's already a terra object or conversion fails, use as-is
if (inherits(field_boundaries, "sf")) {
field_boundaries <- tryCatch({
terra::vect(field_boundaries)
}, error = function(e) {
warning(paste("Could not convert sf to terra:", e$message, "- using sf object directly"))
field_boundaries # Return original sf object
})
}
# Establish file names for output
basename_no_ext <- tools::file_path_sans_ext(basename(file))
new_file <- here::here(merged_final_dir, paste0(basename_no_ext, ".tif"))
vrt_file <- here::here(daily_vrt, paste0(basename_no_ext, ".vrt"))
# Process with error handling
tryCatch({
# Log processing start
safe_log(paste("Processing", basename(file)))
# Load and prepare raster
loaded_raster <- terra::rast(file)
# Validate raster has necessary bands
if (terra::nlyr(loaded_raster) < 4) {
stop("Raster must have at least 4 bands (Red, Green, Blue, NIR)")
}
# Detect raster structure (4b vs 8b with optional UDM)
structure_info <- detect_raster_structure(loaded_raster)
safe_log(paste("Detected", structure_info$type, "data",
if (structure_info$has_udm) "with UDM cloud masking" else "without cloud masking"))
# Extract the bands we need FIRST
# This ensures we're working with just the necessary data
red_band <- loaded_raster[[structure_info$red_idx]]
green_band <- loaded_raster[[structure_info$green_idx]]
blue_band <- loaded_raster[[structure_info$blue_idx]]
nir_band <- loaded_raster[[structure_info$nir_idx]]
# Now apply cloud masking to these selected bands if UDM exists
if (structure_info$has_udm) {
udm_band <- loaded_raster[[structure_info$udm_idx]]
# Create mask where UDM != 2 (mask only clouds, keep clear sky and shadows)
cloud_mask <- udm_band != 2
red_band[!cloud_mask] <- NA
green_band[!cloud_mask] <- NA
blue_band[!cloud_mask] <- NA
nir_band[!cloud_mask] <- NA
safe_log("Applied UDM cloud masking to selected bands (masking UDM=2 clouds only)")
}
# Name the bands
names(red_band) <- "Red"
names(green_band) <- "Green"
names(blue_band) <- "Blue"
names(nir_band) <- "NIR"
# Calculate Canopy Index from Red, Green, NIR
# CI = (NIR - Red) / (NIR + Red) is a common formulation
# But using NIR/Green - 1 is also valid and more sensitive to green vegetation
CI <- nir_band / green_band - 1
names(CI) <- "CI"
# Create output raster with essential bands: Red, Green, Blue, NIR, CI
output_raster <- c(red_band, green_band, blue_band, nir_band, CI)
names(output_raster) <- c("Red", "Green", "Blue", "NIR", "CI")
# Ensure CRS compatibility before cropping
tryCatch({
raster_crs <- terra::crs(output_raster, proj = TRUE)
raster_crs_char <- as.character(raster_crs)
# Handle boundaries CRS - works for both terra and sf objects
if (inherits(field_boundaries, "sf")) {
boundaries_crs <- sf::st_crs(field_boundaries)
boundaries_crs_char <- if (!is.na(boundaries_crs)) as.character(boundaries_crs$wkt) else ""
} else {
boundaries_crs <- terra::crs(field_boundaries, proj = TRUE)
boundaries_crs_char <- as.character(boundaries_crs)
}
if (length(raster_crs_char) > 0 && length(boundaries_crs_char) > 0 &&
nchar(raster_crs_char) > 0 && nchar(boundaries_crs_char) > 0) {
if (raster_crs_char != boundaries_crs_char) {
# Transform field boundaries to match raster CRS only if it's a terra object
if (inherits(field_boundaries, "SpatVector")) {
field_boundaries <- terra::project(field_boundaries, raster_crs_char)
safe_log("Transformed field boundaries CRS to match raster CRS")
} else {
safe_log("Field boundaries is sf object - CRS transformation skipped")
}
}
} else {
# If CRS is missing, try to assign a default WGS84 CRS
if (length(raster_crs_char) == 0 || nchar(raster_crs_char) == 0) {
terra::crs(output_raster) <- "EPSG:4326"
safe_log("Assigned default WGS84 CRS to raster")
}
if (length(boundaries_crs_char) == 0 || nchar(boundaries_crs_char) == 0) {
if (inherits(field_boundaries, "SpatVector")) {
terra::crs(field_boundaries) <- "EPSG:4326"
} else {
sf::st_crs(field_boundaries) <- 4326
}
safe_log("Assigned default WGS84 CRS to field boundaries")
}
}
}, error = function(e) {
safe_log(paste("CRS handling warning:", e$message), "WARNING")
})
output_raster <- tryCatch({
# terra::crop can work with both terra and sf objects, but if it fails with sf, try conversion
terra::crop(output_raster, field_boundaries, mask = TRUE)
}, error = function(e) {
# If crop fails (common with certain sf geometries), convert sf to terra first
if (inherits(field_boundaries, "sf")) {
safe_log(paste("Crop with sf failed, attempting alternative approach:", e$message), "WARNING")
# Use terra mask operation instead of crop for sf objects
# First, get the bbox from sf object and use it for rough crop
bbox <- sf::st_bbox(field_boundaries)
output_raster_cropped <- terra::crop(output_raster,
terra::ext(bbox[1], bbox[3], bbox[2], bbox[4]))
return(output_raster_cropped)
} else {
stop(e)
}
})
# Note: Do NOT replace zeros with NA here - Red/Green/Blue/NIR reflectance can be near zero
# Only CI can go negative (if NIR < Green), but that's valid vegetation index behavior
# output_raster[output_raster == 0] <- NA # REMOVED - this was causing data loss
# Write output files
terra::writeRaster(output_raster, new_file, overwrite = TRUE)
terra::vrt(new_file, vrt_file, overwrite = TRUE)
# Check if the result has enough valid pixels
valid_pixels <- terra::global(output_raster$CI, "notNA", na.rm=TRUE)
# Log completion
safe_log(paste("Completed processing", basename(file),
"- Valid pixels:", valid_pixels[1,]))
return(output_raster)
}, error = function(e) {
err_msg <- paste("Error processing", basename(file), "-", e$message)
safe_log(err_msg, "ERROR")
return(NULL)
}, finally = {
# Clean up memory
gc()
})
}
#' Process a batch of satellite images and create VRT files
#'
#' @param files Vector of file paths to process
#' @param field_boundaries Field boundaries vector object for cropping
#' @param merged_final_dir Directory to save processed rasters
#' @param daily_vrt_dir Directory to save VRT files
#' @param min_valid_pixels Minimum number of valid pixels for a raster to be kept (default: 100)
#' @return List of valid VRT files created
#'
process_satellite_images <- function(files, field_boundaries, merged_final_dir, daily_vrt_dir, min_valid_pixels = 100) {
vrt_list <- list()
safe_log(paste("Starting batch processing of", length(files), "files"))
# Process each file
for (file in files) {
# Process each raster file
v_crop <- create_mask_and_crop(file, field_boundaries, merged_final_dir)
# Skip if processing failed
if (is.null(v_crop)) {
next
}
# Check if the raster has enough valid data
valid_data <- terra::global(v_crop, "notNA")
vrt_file <- here::here(daily_vrt_dir, paste0(tools::file_path_sans_ext(basename(file)), ".vrt"))
if (valid_data[1,] > min_valid_pixels) {
vrt_list[[vrt_file]] <- vrt_file
} else {
# Remove VRT files with insufficient data
if (file.exists(vrt_file)) {
file.remove(vrt_file)
}
safe_log(paste("Skipping", basename(file), "- insufficient valid data"), "WARNING")
}
# Clean up memory
rm(v_crop)
gc()
}
safe_log(paste("Completed processing", length(vrt_list), "raster files"))
return(vrt_list)
}
#' Find satellite image files filtered by date
#'
#' @param tif_folder Directory containing satellite imagery files
#' @param dates_filter Character vector of dates in YYYY-MM-DD format
#' @return Vector of file paths matching the date filter
#'
find_satellite_images <- function(tif_folder, dates_filter) {
# Find all raster files
raster_files <- list.files(tif_folder, full.names = TRUE, pattern = "\\.tif$")
if (length(raster_files) == 0) {
stop("No raster files found in directory: ", tif_folder)
}
# Filter files by dates
filtered_files <- purrr::map(dates_filter, ~ raster_files[grepl(pattern = .x, x = raster_files)]) %>%
purrr::compact() %>%
purrr::flatten_chr()
# Remove files that do not exist
existing_files <- filtered_files[file.exists(filtered_files)]
# Check if the list of existing files is empty
if (length(existing_files) == 0) {
stop("No files found matching the date filter: ", paste(dates_filter, collapse = ", "))
}
return(existing_files)
}
#' Extract date from file path
#'
#' @param file_path Path to the file
#' @return Extracted date in YYYY-MM-DD format
#'
date_extract <- function(file_path) {
date <- stringr::str_extract(file_path, "\\d{4}-\\d{2}-\\d{2}")
if (is.na(date)) {
warning(paste("Could not extract date from file path: ", file_path))
}
return(date)
}
#' Extract CI values from a raster for each field or subfield
#'
#' @param file Path to the raster file
#' @param field_geojson Field boundaries as SF object
#' @param quadrants Boolean indicating whether to extract by quadrants
#' @param save_dir Directory to save the extracted values
#' @return Path to the saved RDS file
#'
extract_rasters_daily <- function(file, field_geojson, quadrants = TRUE, save_dir) {
# Validate inputs
if (!file.exists(file)) {
stop(paste("File not found: ", file))
}
if (!inherits(field_geojson, "sf") && !inherits(field_geojson, "sfc")) {
field_geojson <- sf::st_as_sf(field_geojson)
}
# Extract date from file path
date <- date_extract(file)
if (is.na(date)) {
stop(paste("Could not extract date from file path:", file))
}
# Log extraction start
safe_log(paste("Extracting CI values for", date, "- Using quadrants:", quadrants))
# Process with error handling
tryCatch({
# Load raster
x <- terra::rast(file)
# Check if CI band exists
if (!"CI" %in% names(x)) {
stop("CI band not found in raster")
}
# Extract statistics
pivot_stats <- cbind(
field_geojson,
mean_CI = round(exactextractr::exact_extract(x$CI, field_geojson, fun = "mean"), 2)
) %>%
sf::st_drop_geometry() %>%
dplyr::rename("{date}" := mean_CI)
# Determine save path
save_suffix <- if (quadrants) {"quadrant"} else {"whole_field"}
save_path <- here::here(save_dir, paste0("extracted_", date, "_", save_suffix, ".rds"))
# Save extracted data
saveRDS(pivot_stats, save_path)
# Log success
safe_log(paste("Successfully extracted and saved CI values for", date))
return(save_path)
}, error = function(e) {
err_msg <- paste("Error extracting CI values for", date, "-", e$message)
safe_log(err_msg, "ERROR")
return(NULL)
})
}
#' Combine daily CI values into a single dataset
#'
#' @param daily_CI_vals_dir Directory containing daily CI values
#' @param output_file Path to save the combined dataset
#' @return Combined dataset as a tibble
#'
combine_ci_values <- function(daily_CI_vals_dir, output_file = NULL) {
# List all RDS files in the daily CI values directory
files <- list.files(path = daily_CI_vals_dir, pattern = "^extracted_.*\\.rds$", full.names = TRUE)
if (length(files) == 0) {
stop("No extracted CI values found in directory:", daily_CI_vals_dir)
}
# Log process start
safe_log(paste("Combining", length(files), "CI value files"))
# Load and combine all files
combined_data <- files %>%
purrr::map(readRDS) %>%
purrr::list_rbind() %>%
dplyr::group_by(sub_field)
# Save if output file is specified
if (!is.null(output_file)) {
saveRDS(combined_data, output_file)
safe_log(paste("Combined CI values saved to", output_file))
}
return(combined_data)
}
#' Update existing CI data with new values
#'
#' @param new_data New CI data to be added
#' @param existing_data_file Path to the existing data file
#' @return Updated combined dataset
#'
update_ci_data <- function(new_data, existing_data_file) {
if (!file.exists(existing_data_file)) {
warning(paste("Existing data file not found:", existing_data_file))
return(new_data)
}
# Load existing data
existing_data <- readRDS(existing_data_file)
# Combine data, handling duplicates by keeping the newer values
combined_data <- dplyr::bind_rows(new_data, existing_data) %>%
dplyr::distinct() %>%
dplyr::group_by(sub_field)
# Save updated data
saveRDS(combined_data, existing_data_file)
safe_log(paste("Updated CI data saved to", existing_data_file))
return(combined_data)
}
#' Process and combine CI values from raster files
#'
#' @param dates List of dates from date_list()
#' @param field_boundaries Field boundaries as vector object
#' @param merged_final_dir Directory with processed raster files
#' @param field_boundaries_sf Field boundaries as SF object
#' @param daily_CI_vals_dir Directory to save daily CI values
#' @param cumulative_CI_vals_dir Directory to save cumulative CI values
#' @return NULL (used for side effects)
#'
process_ci_values <- function(dates, field_boundaries, merged_final_dir,
field_boundaries_sf, daily_CI_vals_dir,
cumulative_CI_vals_dir) {
# Find processed raster files
raster_files <- list.files(merged_final_dir, full.names = TRUE, pattern = "\\.tif$")
# Define path for combined CI data
combined_ci_path <- here::here(cumulative_CI_vals_dir, "combined_CI_data.rds")
# Check if the combined CI data file exists
if (!file.exists(combined_ci_path)) {
# Process all available data if file doesn't exist
safe_log("combined_CI_data.rds does not exist. Creating new file with all available data.")
safe_log(paste("Processing", length(raster_files), "raster files"))
# Extract data from all raster files with error handling
tryCatch({
purrr::walk(
raster_files,
extract_rasters_daily,
field_geojson = field_boundaries_sf,
quadrants = FALSE,
save_dir = daily_CI_vals_dir
)
safe_log("Extraction complete for all raster files")
}, error = function(e) {
safe_log(paste("Error during extraction walk:", e$message), "ERROR")
})
# Combine all extracted data
tryCatch({
pivot_stats <- combine_ci_values(daily_CI_vals_dir, combined_ci_path)
safe_log("All CI values extracted from historic images and saved.")
}, error = function(e) {
safe_log(paste("Error combining CI values:", e$message), "ERROR")
stop(e$message)
})
} else {
# Process only the latest data and add to existing file
safe_log("combined_CI_data.rds exists, adding the latest image data.")
# Filter files by dates
filtered_files <- purrr::map(dates$days_filter, ~ raster_files[grepl(pattern = .x, x = raster_files)]) %>%
purrr::compact() %>%
purrr::flatten_chr()
safe_log(paste("Processing", length(filtered_files), "new raster files"))
# Extract data for the new files with error handling
tryCatch({
purrr::walk(
filtered_files,
extract_rasters_daily,
field_geojson = field_boundaries_sf,
quadrants = TRUE,
save_dir = daily_CI_vals_dir
)
safe_log("Extraction complete for new files")
}, error = function(e) {
safe_log(paste("Error during extraction walk:", e$message), "ERROR")
})
# Filter extracted values files by the current date range
extracted_values <- list.files(daily_CI_vals_dir, full.names = TRUE)
extracted_values <- purrr::map(dates$days_filter, ~ extracted_values[grepl(pattern = .x, x = extracted_values)]) %>%
purrr::compact() %>%
purrr::flatten_chr()
safe_log(paste("Found", length(extracted_values), "extracted value files to combine"))
# Combine new values
new_pivot_stats <- extracted_values %>%
purrr::map(readRDS) %>%
purrr::list_rbind() %>%
dplyr::group_by(sub_field)
# Update the combined data file
update_ci_data(new_pivot_stats, combined_ci_path)
safe_log("CI values from latest images added to combined_CI_data.rds")
}
}