# SmartCane/r_app/20_ci_extraction.R

# ============================================================================
# SCRIPT 20: Canopy Index (CI) Extraction from Satellite Imagery
# ============================================================================
# PURPOSE:
# Extract Canopy Index (CI) from 4-band or 8-band satellite imagery and
# mask by field boundaries. Supports automatic band detection, cloud masking
# with UDM2 (8-band), and per-field CI value extraction. Produces both
# per-field TIFFs and consolidated CI statistics for growth model input.
#
# INPUT DATA:
# - Source: laravel_app/storage/app/{project}/field_tiles/{FIELD}/{DATE}.tif
# - Format: GeoTIFF (4-band RGB+NIR from Planet API, or 8-band with UDM2)
# - Requirement: Field boundaries (pivot.geojson) for masking
#
# OUTPUT DATA:
# - Destination: laravel_app/storage/app/{project}/field_tiles_CI/{FIELD}/{DATE}.tif
# - Format: GeoTIFF (5-band: R,G,B,NIR,CI as float32)
# - Also exports: combined_CI/combined_CI_data.rds (wide format: fields × dates)
#
# USAGE:
# Rscript 20_ci_extraction.R [end_date] [offset] [project] [data_source]
#
# Example (Windows PowerShell):
# & "C:\Program Files\R\R-4.4.3\bin\x64\Rscript.exe" r_app/20_ci_extraction.R 2026-01-02 7 angata merged_tif
#
# PARAMETERS:
# - end_date: End date for processing (character, YYYY-MM-DD format)
# - offset: Days to look back from end_date (numeric, default 7)
# - project: Project name (character) - angata, chemba, xinavane, esa, simba
# - data_source: Data source directory (character, optional) - "merged_tif" (default), "merged_tif_8b", "merged_final_tif"
#
# CLIENT TYPES:
# - cane_supply (ANGATA): Yes - core data processing
# - agronomic_support (AURA): Yes - supports field health monitoring
#
# DEPENDENCIES:
# - Packages: terra, sf, tidyverse, lubridate, readxl, furrr, future
# - Utils files: parameters_project.R, 00_common_utils.R, 20_ci_extraction_utils.R
# - External data: Field boundaries (pivot.geojson), harvest data (harvest.xlsx)
# - Data directories: field_tiles/, field_tiles_CI/, combined_CI/
#
# NOTES:
# - CI formula: (NIR - Red) / (NIR + Red); normalized to 0-5 range
# - 8-band data automatically cloud-masked using UDM2 (band 7-8)
# - 4-band data assumes clear-sky Planet PSScene imagery
# - Parallel processing via furrr for speed optimization
# - Output RDS uses wide format (fields as rows, dates as columns) for growth model
# - Critical dependency for Script 30 (growth model) and Script 80 (KPIs)
#
# RELATED ISSUES:
# SC-112: Utilities restructuring
# SC-108: Core pipeline improvements
#
# ============================================================================
# 1. Load required packages
# -----------------------
# Attach order matters: tidyverse is attached after sf/terra, so dplyr's
# filter()/select() mask any same-named functions attached earlier.
# NOTE(review): the header lists furrr/future as dependencies but they are
# not attached here — presumably loaded by 20_ci_extraction_utils.R; confirm.
suppressPackageStartupMessages({
# Spatial data handling
library(sf) # For reading/manipulating field boundaries (GeoJSON)
library(terra) # For raster operations (CI extraction from TIFFs)
# Data manipulation
library(tidyverse) # For dplyr, ggplot2, readr (data wrangling and visualization)
library(lubridate) # For date/time operations (parsing satellite dates)
# File I/O
library(readxl) # For reading harvest.xlsx (harvest dates for field mapping)
library(here) # For relative path resolution (platform-independent file paths)
})
# 2. Process command line arguments
# ------------------------------
# Main entry point for the CI extraction pipeline (Script 20).
#
# Workflow:
#   1. Parse CLI arguments: end_date, offset, project, data_source.
#   2. Source project parameters and utility scripts (which define most of
#      the globals used below).
#   3. Determine which dates still need extraction and which have source TIFFs.
#   4. Run tile-based extraction (if Script 10 tiles exist) or the legacy
#      full-extent path.
#   5. Rebuild combined_CI_data.rds from all per-date daily RDS files.
#
# Side effects: assigns `project_dir`, `data_source`, `ci_extraction_script`
# and `grid_size` into the global environment (read by the sourced scripts),
# and writes RDS outputs under the project's CI directories.
#
# NOTE(review): many names used below (`log_message`, `date_list`,
# `daily_CI_vals_dir`, `cumulative_CI_vals_dir`, `planet_tif_folder`,
# `field_boundaries`, `field_boundaries_sf`, `merged_final`, `daily_vrt`,
# `setup_project_directories`, and the `process_*` functions) are assumed to
# be created by the three source()d files — confirm against those scripts.
main <- function() {
  # Capture command line arguments
  args <- commandArgs(trailingOnly = TRUE)

  # end_date (arg 1): parsed strictly as YYYY-MM-DD; as.Date() returns NA on
  # a format mismatch, in which case we warn and fall back to today.
  if (length(args) >= 1 && !is.na(args[1]) && args[1] != "") {
    # Parse date explicitly in YYYY-MM-DD format from command line
    end_date <- as.Date(args[1], format = "%Y-%m-%d")
    if (is.na(end_date)) {
      warning("Invalid end_date provided. Using default (current date).")
      end_date <- Sys.Date()
      #end_date <- "2023-10-01"
    }
  } else {
    end_date <- Sys.Date()
    #end_date <- "2023-10-01"
  }

  # offset (arg 2): look-back window in days; must be a positive number.
  if (length(args) >= 2 && !is.na(args[2])) {
    offset <- as.numeric(args[2])
    if (is.na(offset) || offset <= 0) {
      warning("Invalid offset provided. Using default (7 days).")
      offset <- 7
    }
  } else {
    offset <- 7
  }

  # project_dir (arg 3): CLI value wins, then any pre-existing global,
  # then the hard-coded default.
  if (length(args) >= 3 && !is.na(args[3])) {
    project_dir <- as.character(args[3])
  } else if (exists("project_dir", envir = .GlobalEnv)) {
    project_dir <- get("project_dir", envir = .GlobalEnv)
  } else {
    # NOTE(review): an earlier comment here claimed the default was changed
    # to "esa", but the actual fallback value is "angata" — confirm intent.
    project_dir <- "angata"
  }

  # data_source (arg 4, optional): which merged-TIFF directory to read
  # (merged_tif_8b vs merged_tif vs merged_final_tif).
  if (length(args) >= 4 && !is.na(args[4])) {
    data_source <- as.character(args[4])
    # Unknown values are accepted with a warning so a new source directory
    # can be used without a code change.
    if (!data_source %in% c("merged_tif_8b", "merged_tif", "merged_final_tif")) {
      warning(paste("Data source", data_source, "not in standard list. Using as-is."))
    }
  } else if (exists("data_source", envir = .GlobalEnv)) {
    data_source <- get("data_source", envir = .GlobalEnv)
  } else {
    data_source <- "merged_tif_8b" # Default to 8-band (newer data with cloud masking)
  }

  # Publish to the global environment: the sourced parameter/utility scripts
  # read these globals instead of receiving them as arguments.
  assign("project_dir", project_dir, envir = .GlobalEnv)
  assign("data_source", data_source, envir = .GlobalEnv)
  cat(sprintf("CI Extraction: project=%s, end_date=%s, offset=%d days, data_source=%s\n",
              project_dir, format(end_date, "%Y-%m-%d"), offset, data_source))

  # Set flag to use pivot_2.geojson for ESA (extra fields for yield prediction)
  # — presumably read by parameters_project.R; confirm against that script.
  ci_extraction_script <- TRUE
  assign("ci_extraction_script", ci_extraction_script, envir = .GlobalEnv)

  # 3. Initialize project configuration
  # --------------------------------
  # NOTE(review): source() evaluates in the global environment by default,
  # so this function-local variable may not be visible to the sourced
  # script — confirm whether parameters_project.R actually reads it.
  new_project_question <- TRUE
  cat("[DEBUG] Attempting to source r_app/parameters_project.R\n")
  # Sourcing failures are logged and re-raised so the caller/scheduler sees
  # a non-zero exit instead of a silent partial run.
  tryCatch({
    source("r_app/parameters_project.R")
    cat("[DEBUG] Successfully sourced r_app/parameters_project.R\n")
  }, error = function(e) {
    cat("[ERROR] Failed to source r_app/parameters_project.R:\n", e$message, "\n")
    stop(e)
  })
  # Load centralized path structure (creates all directories automatically)
  paths <- setup_project_directories(project_dir)
  cat("[DEBUG] Attempting to source r_app/00_common_utils.R\n")
  tryCatch({
    source("r_app/00_common_utils.R")
    cat("[DEBUG] Successfully sourced r_app/00_common_utils.R\n")
  }, error = function(e) {
    cat("[ERROR] Failed to source r_app/00_common_utils.R:\n", e$message, "\n")
    stop(e)
  })
  cat("[DEBUG] Attempting to source r_app/20_ci_extraction_utils.R\n")
  tryCatch({
    source("r_app/20_ci_extraction_utils.R")
    cat("[DEBUG] Successfully sourced r_app/20_ci_extraction_utils.R\n")
  }, error = function(e) {
    cat("[ERROR] Failed to source r_app/20_ci_extraction_utils.R:\n", e$message, "\n")
    stop(e)
  })

  # 4. Generate date list for processing
  # ---------------------------------
  # date_list() is defined in a sourced utils file; from usage below,
  # dates$days_filter appears to be a character vector of "YYYY-MM-DD"
  # strings (compared against filename-derived strings) — TODO confirm.
  dates <- date_list(end_date, offset)
  log_message(paste("Processing data for week", dates$week, "of", dates$year))

  # 4a. CHECK DAILY CI EXTRACTION - Skip dates that already have extracted files
  # -------------------------------------------------------------------------
  log_message("\n===== CHECKING DAILY CI EXTRACTION STATUS =====")
  # Check which dates already have extracted CI files
  already_extracted <- c()
  missing_extraction <- c()
  if (dir.exists(daily_CI_vals_dir)) {
    existing_ci_files <- list.files(daily_CI_vals_dir, pattern = "^extracted_.*\\.rds$")
    # Extract dates from filenames like "extracted_2025-12-31_quadrant.rds":
    # the greedy "(.+)" backtracks to the LAST underscore, yielding
    # "2025-12-31". NOTE(review): a file named "extracted_<date>.rds" (no
    # trailing "_suffix") would not match and is passed through unchanged
    # by sub() — confirm all daily files carry a suffix.
    already_extracted <- sub("^extracted_(.+)_.*\\.rds$", "\\1", existing_ci_files)
  }
  # Find which dates in our processing range need extraction
  missing_extraction <- dates$days_filter[!(dates$days_filter %in% already_extracted)]
  cat(sprintf("[CI CHECK] Already extracted: %d dates\n", length(already_extracted)))
  # min/max on "YYYY-MM-DD" strings sort lexicographically, which matches
  # chronological order for this format.
  cat(sprintf("[CI CHECK] Need extraction: %d dates (from %s to %s)\n",
              length(missing_extraction),
              if(length(missing_extraction) > 0) min(missing_extraction) else "N/A",
              if(length(missing_extraction) > 0) max(missing_extraction) else "N/A"))
  # If any dates need extraction, we'll extract them
  # If NO dates need extraction, we'll skip extraction but ALWAYS rebuild combined_CI_data.rds
  skip_extraction <- (length(missing_extraction) == 0)
  if (skip_extraction) {
    log_message("✓ All dates in processing range already have extracted CI files - skipping extraction")
    log_message("⚠ Will rebuild combined_CI_data.rds to ensure completeness")
  }

  # 4b. CHECK SOURCE DATA AVAILABILITY
  # ---------------------------------------------------------------
  # Verify that source data exists for dates we're going to extract
  # If a date is missing from source, we'll skip it gracefully
  log_message("\n===== CHECKING SOURCE DATA AVAILABILITY =====")
  dates_with_source <- c()
  dates_missing_source <- c()
  if (!skip_extraction && length(missing_extraction) > 0) {
    # Check which source dates are actually available
    for (date_str in missing_extraction) {
      # Look for the date in merged_tif directory (planet_tif_folder is a
      # global set by the sourced parameter script)
      source_file_pattern <- sprintf("%s\\.tif$", date_str)
      files_for_date <- list.files(planet_tif_folder, pattern = source_file_pattern)
      if (length(files_for_date) > 0) {
        dates_with_source <- c(dates_with_source, date_str)
      } else {
        dates_missing_source <- c(dates_missing_source, date_str)
      }
    }
    cat(sprintf("[SOURCE CHECK] Dates with available source data: %d\n", length(dates_with_source)))
    cat(sprintf("[SOURCE CHECK] Dates missing from source (will skip): %d\n", length(dates_missing_source)))
    if (length(dates_missing_source) > 0) {
      log_message(paste("⚠ Skipping extraction for missing source dates:", paste(dates_missing_source, collapse = ", ")))
    }
  }

  # 5. Find and filter raster files by date - with grid size detection
  # -----------------------------------
  log_message("Searching for raster files")
  # Check if tiles exist (Script 10 output) - detect grid size dynamically using centralized paths
  tiles_split_base <- paths$daily_tiles_split_dir
  # Detect grid size from daily_tiles_split folder structure
  # Expected structure: daily_tiles_split/5x5/ or daily_tiles_split/10x10/ etc.
  grid_size <- NA
  if (dir.exists(tiles_split_base)) {
    subfolders <- list.dirs(tiles_split_base, full.names = FALSE, recursive = FALSE)
    # Look for grid size patterns like "5x5", "10x10", "20x20"
    grid_patterns <- grep("^\\d+x\\d+$", subfolders, value = TRUE)
    if (length(grid_patterns) > 0) {
      # If multiple grid-size folders exist only the first (alphabetical,
      # per list.dirs) is used — NOTE(review): confirm only one grid size
      # is ever present per project.
      grid_size <- grid_patterns[1] # Use first grid size found
      log_message(paste("Detected grid size:", grid_size))
    }
  }
  # Construct tile folder path with grid size
  if (!is.na(grid_size)) {
    tile_folder <- file.path(tiles_split_base, grid_size)
  } else {
    tile_folder <- tiles_split_base
  }
  use_tiles <- dir.exists(tile_folder)
  # Make grid_size available globally for other functions
  assign("grid_size", grid_size, envir = .GlobalEnv)

  tryCatch({
    if (skip_extraction) {
      log_message("\n===== SKIPPING CI EXTRACTION (all dates already processed) =====")
    } else if (use_tiles) {
      # Use tile-based processing
      log_message(paste("Tile folder detected at", tile_folder))
      log_message("Using tile-based CI extraction")
      # Call the tile-based extraction function (defined in
      # 20_ci_extraction_utils.R; boundary/dir arguments are globals set by
      # the sourced parameter script)
      process_ci_values_from_tiles(
        dates = dates,
        tile_folder = tile_folder,
        field_boundaries = field_boundaries,
        field_boundaries_sf = field_boundaries_sf,
        daily_CI_vals_dir = daily_CI_vals_dir,
        cumulative_CI_vals_dir = cumulative_CI_vals_dir,
        merged_final_dir = merged_final,
        grid_size = grid_size
      )
    } else {
      # Use legacy full-extent processing
      log_message("No tiles found. Using legacy full-extent approach")
      # Use the existing utility function to find satellite images
      existing_files <- find_satellite_images(planet_tif_folder, dates$days_filter)
      log_message(paste("Found", length(existing_files), "raster files for processing"))
      # Process raster files and create VRT
      vrt_list <- process_satellite_images(existing_files, field_boundaries, merged_final, daily_vrt)
      # Process and combine CI values
      process_ci_values(dates, field_boundaries, merged_final,
                        field_boundaries_sf, daily_CI_vals_dir, cumulative_CI_vals_dir)
    }
  }, error = function(e) {
    # NOTE(review): stop(e$message) re-raises as a plain error, discarding
    # the original condition class — acceptable for a script, but callers
    # cannot dispatch on the condition type.
    log_message(paste("Error in main processing:", e$message), level = "ERROR")
    stop(e$message)
  })

  # 6. REBUILD combined_CI_data.rds from ALL daily extracted files
  # -----------------------------------------------
  # This ensures the combined file is complete and up-to-date
  # even if extraction was skipped (because dates already existed)
  # NOTE: Only rebuild if new dates were successfully extracted
  # If all dates were missing from source, skip this step to avoid corrupting the file
  log_message("\n===== HANDLING combined_CI_data.rds =====")
  if (length(dates_with_source) == 0 && length(missing_extraction) > 0) {
    # All missing dates had no source data - skip combined_CI_data.rds update
    log_message("⚠ No new dates extracted (all source data missing) - skipping combined_CI_data.rds update")
  } else if (skip_extraction) {
    # All dates already extracted - optionally rebuild for consistency
    log_message("✓ All dates already extracted - combined_CI_data.rds is up-to-date")
  } else {
    # New dates were extracted - rebuild combined_CI_data.rds from ALL daily files
    log_message("Rebuilding combined_CI_data.rds from all daily extracted files...")
    tryCatch({
      if (!dir.exists(daily_CI_vals_dir)) {
        log_message("Daily CI directory does not exist yet", level = "WARNING")
      } else {
        # List ALL daily CI files (not just new ones)
        all_daily_files <- list.files(path = daily_CI_vals_dir, pattern = "^extracted_.*\\.rds$", full.names = TRUE)
        if (length(all_daily_files) == 0) {
          log_message("No daily CI files found to combine", level = "WARNING")
        } else {
          log_message(paste("Combining all", length(all_daily_files), "daily CI files into combined_CI_data.rds"))
          # Load and combine ALL daily files (creates complete dataset)
          # NOTE(review): this uses paths$cumulative_ci_vals_dir (lowercase)
          # while earlier code passes the global cumulative_CI_vals_dir —
          # confirm both resolve to the same directory.
          combined_ci_path <- file.path(paths$cumulative_ci_vals_dir, "combined_CI_data.rds")
          # NOTE(review): the saved object is a row-bound (long) tibble still
          # grouped by sub_field; the file header describes combined_CI as
          # wide format (fields × dates) — confirm what Script 30 expects.
          combined_data <- all_daily_files %>%
            purrr::map(readRDS) %>%
            purrr::list_rbind() %>%
            dplyr::group_by(sub_field)
          # Save the rebuilt combined data
          saveRDS(combined_data, combined_ci_path)
          log_message(paste("✓ Rebuilt combined_CI_data.rds with", nrow(combined_data), "total rows"))
        }
      }
    }, error = function(e) {
      log_message(paste("⚠ Error rebuilding combined_CI_data.rds (will skip):", e$message), level = "WARNING")
      log_message(" Note: This is OK - Script 30 will use growth model RDS instead", level = "WARNING")
    })
  }
}
# Run the pipeline only when executed as a top-level script (e.g. via
# Rscript); sys.nframe() is 0 at the top level, so this guard keeps main()
# from firing when the file is source()d from another frame.
if (identical(sys.nframe(), 0L)) main()