# CI_EXTRACTION.R
# ===============
# This script processes satellite imagery to extract Canopy Index (CI) values for agricultural fields.
# It handles image processing, masking, and extraction of statistics by field/sub-field.
# Supports both 4-band and 8-band PlanetScope data with automatic band detection and cloud masking.
#
# Usage: Rscript 20_ci_extraction.R [end_date] [offset] [project_dir] [data_source]
#   - end_date: End date for processing (YYYY-MM-DD format)
#   - offset: Number of days to look back from end_date
#   - project_dir: Project directory name (e.g., "angata", "aura", "chemba")
#   - data_source: Data source directory - "merged_tif_8b" (default), "merged_tif" (4-band) or "merged_final_tif"
#     If tiles exist (daily_tiles_split/), they are used automatically
#
# Examples:
#   # Angata 8-band data (with UDM cloud masking)
#   & 'C:\Program Files\R\R-4.4.3\bin\x64\Rscript' r_app/20_ci_extraction.R 2026-01-02 7 angata merged_tif_8b
#
#   # Aura 4-band data
#   Rscript 20_ci_extraction.R 2025-11-26 7 aura merged_tif
#
#   # Auto-detects and uses tiles if available:
#   Rscript 20_ci_extraction.R 2026-01-02 7 angata    (uses tiles if daily_tiles_split/ exists)
|
# 1. Load required packages
|
|
# -----------------------
|
|
suppressPackageStartupMessages({
|
|
library(sf)
|
|
library(terra)
|
|
library(tidyverse)
|
|
library(lubridate)
|
|
library(readxl)
|
|
library(here)
|
|
library(furrr)
|
|
library(future)
|
|
})
|
|
|
|
# 2. Process command line arguments
|
|
# ------------------------------
|
|
main <- function() {
|
|
# Capture command line arguments
|
|
args <- commandArgs(trailingOnly = TRUE)
|
|
|
|
# Process end_date argument
|
|
if (length(args) >= 1 && !is.na(args[1]) && args[1] != "") {
|
|
# Parse date explicitly in YYYY-MM-DD format from command line
|
|
end_date <- as.Date(args[1], format = "%Y-%m-%d")
|
|
if (is.na(end_date)) {
|
|
warning("Invalid end_date provided. Using default (current date).")
|
|
end_date <- Sys.Date()
|
|
#end_date <- "2023-10-01"
|
|
}
|
|
} else {
|
|
end_date <- Sys.Date()
|
|
#end_date <- "2023-10-01"
|
|
}
|
|
|
|
# Process offset argument
|
|
if (length(args) >= 2 && !is.na(args[2])) {
|
|
offset <- as.numeric(args[2])
|
|
if (is.na(offset) || offset <= 0) {
|
|
warning("Invalid offset provided. Using default (7 days).")
|
|
offset <- 7
|
|
}
|
|
} else {
|
|
offset <- 7
|
|
}
|
|
|
|
# Process project_dir argument
|
|
if (length(args) >= 3 && !is.na(args[3])) {
|
|
project_dir <- as.character(args[3])
|
|
} else if (exists("project_dir", envir = .GlobalEnv)) {
|
|
project_dir <- get("project_dir", envir = .GlobalEnv)
|
|
} else {
|
|
project_dir <- "angata" # Changed default from "aura" to "esa"
|
|
}
|
|
|
|
# Process data_source argument (optional, for specifying merged_tif_8b vs merged_tif vs merged_final_tif)
|
|
if (length(args) >= 4 && !is.na(args[4])) {
|
|
data_source <- as.character(args[4])
|
|
# Validate data_source is a recognized option
|
|
if (!data_source %in% c("merged_tif_8b", "merged_tif", "merged_final_tif")) {
|
|
warning(paste("Data source", data_source, "not in standard list. Using as-is."))
|
|
}
|
|
} else if (exists("data_source", envir = .GlobalEnv)) {
|
|
data_source <- get("data_source", envir = .GlobalEnv)
|
|
} else {
|
|
data_source <- "merged_tif_8b" # Default to 8-band (newer data with cloud masking)
|
|
}
|
|
|
|
# Make project_dir and data_source available globally
|
|
assign("project_dir", project_dir, envir = .GlobalEnv)
|
|
assign("data_source", data_source, envir = .GlobalEnv)
|
|
|
|
cat(sprintf("CI Extraction: project=%s, end_date=%s, offset=%d days, data_source=%s\n",
|
|
project_dir, format(end_date, "%Y-%m-%d"), offset, data_source))
|
|
|
|
# Set flag to use pivot_2.geojson for ESA (extra fields for yield prediction)
|
|
ci_extraction_script <- TRUE
|
|
assign("ci_extraction_script", ci_extraction_script, envir = .GlobalEnv)
|
|
|
|
# 3. Initialize project configuration
|
|
# --------------------------------
|
|
new_project_question <- TRUE
|
|
|
|
cat("[DEBUG] Attempting to source r_app/parameters_project.R\n")
|
|
tryCatch({
|
|
source("r_app/parameters_project.R")
|
|
cat("[DEBUG] Successfully sourced r_app/parameters_project.R\n")
|
|
}, error = function(e) {
|
|
cat("[ERROR] Failed to source r_app/parameters_project.R:\n", e$message, "\n")
|
|
stop(e)
|
|
})
|
|
|
|
cat("[DEBUG] Attempting to source r_app/20_ci_extraction_utils.R\n")
|
|
tryCatch({
|
|
source("r_app/20_ci_extraction_utils.R")
|
|
cat("[DEBUG] Successfully sourced r_app/20_ci_extraction_utils.R\n")
|
|
}, error = function(e) {
|
|
cat("[ERROR] Failed to source r_app/20_ci_extraction_utils.R:\n", e$message, "\n")
|
|
stop(e)
|
|
})
|
|
|
|
|
|
# 4. Generate date list for processing
|
|
# ---------------------------------
|
|
dates <- date_list(end_date, offset)
|
|
log_message(paste("Processing data for week", dates$week, "of", dates$year))
|
|
|
|
# 4a. CHECK DAILY CI EXTRACTION - Skip dates that already have extracted files
|
|
# -------------------------------------------------------------------------
|
|
log_message("\n===== CHECKING DAILY CI EXTRACTION STATUS =====")
|
|
|
|
# Check which dates already have extracted CI files
|
|
already_extracted <- c()
|
|
missing_extraction <- c()
|
|
|
|
if (dir.exists(daily_CI_vals_dir)) {
|
|
existing_ci_files <- list.files(daily_CI_vals_dir, pattern = "^extracted_.*\\.rds$")
|
|
# Extract dates from filenames like "extracted_2025-12-31_quadrant.rds"
|
|
already_extracted <- sub("^extracted_(.+)_.*\\.rds$", "\\1", existing_ci_files)
|
|
}
|
|
|
|
# Find which dates in our processing range need extraction
|
|
missing_extraction <- dates$days_filter[!(dates$days_filter %in% already_extracted)]
|
|
|
|
cat(sprintf("[CI CHECK] Already extracted: %d dates\n", length(already_extracted)))
|
|
cat(sprintf("[CI CHECK] Need extraction: %d dates (from %s to %s)\n",
|
|
length(missing_extraction),
|
|
if(length(missing_extraction) > 0) min(missing_extraction) else "N/A",
|
|
if(length(missing_extraction) > 0) max(missing_extraction) else "N/A"))
|
|
|
|
# If any dates need extraction, we'll extract them
|
|
# If NO dates need extraction, we'll skip extraction but ALWAYS rebuild combined_CI_data.rds
|
|
skip_extraction <- (length(missing_extraction) == 0)
|
|
|
|
if (skip_extraction) {
|
|
log_message("✓ All dates in processing range already have extracted CI files - skipping extraction")
|
|
log_message("⚠ Will rebuild combined_CI_data.rds to ensure completeness")
|
|
}
|
|
|
|
# 4b. CHECK SOURCE DATA AVAILABILITY
|
|
# ---------------------------------------------------------------
|
|
# Verify that source data exists for dates we're going to extract
|
|
# If a date is missing from source, we'll skip it gracefully
|
|
log_message("\n===== CHECKING SOURCE DATA AVAILABILITY =====")
|
|
|
|
dates_with_source <- c()
|
|
dates_missing_source <- c()
|
|
|
|
if (!skip_extraction && length(missing_extraction) > 0) {
|
|
# Check which source dates are actually available
|
|
for (date_str in missing_extraction) {
|
|
# Look for the date in merged_tif directory
|
|
source_file_pattern <- sprintf("%s\\.tif$", date_str)
|
|
files_for_date <- list.files(planet_tif_folder, pattern = source_file_pattern)
|
|
|
|
if (length(files_for_date) > 0) {
|
|
dates_with_source <- c(dates_with_source, date_str)
|
|
} else {
|
|
dates_missing_source <- c(dates_missing_source, date_str)
|
|
}
|
|
}
|
|
|
|
cat(sprintf("[SOURCE CHECK] Dates with available source data: %d\n", length(dates_with_source)))
|
|
cat(sprintf("[SOURCE CHECK] Dates missing from source (will skip): %d\n", length(dates_missing_source)))
|
|
|
|
if (length(dates_missing_source) > 0) {
|
|
log_message(paste("⚠ Skipping extraction for missing source dates:", paste(dates_missing_source, collapse = ", ")))
|
|
}
|
|
}
|
|
|
|
# 5. Find and filter raster files by date - with grid size detection
|
|
# -----------------------------------
|
|
log_message("Searching for raster files")
|
|
|
|
# Check if tiles exist (Script 01 output) - detect grid size dynamically
|
|
tiles_split_base <- file.path("laravel_app", "storage", "app", project_dir, "daily_tiles_split")
|
|
|
|
# Detect grid size from daily_tiles_split folder structure
|
|
# Expected structure: daily_tiles_split/5x5/ or daily_tiles_split/10x10/ etc.
|
|
grid_size <- NA
|
|
if (dir.exists(tiles_split_base)) {
|
|
subfolders <- list.dirs(tiles_split_base, full.names = FALSE, recursive = FALSE)
|
|
# Look for grid size patterns like "5x5", "10x10", "20x20"
|
|
grid_patterns <- grep("^\\d+x\\d+$", subfolders, value = TRUE)
|
|
if (length(grid_patterns) > 0) {
|
|
grid_size <- grid_patterns[1] # Use first grid size found
|
|
log_message(paste("Detected grid size:", grid_size))
|
|
}
|
|
}
|
|
|
|
# Construct tile folder path with grid size
|
|
if (!is.na(grid_size)) {
|
|
tile_folder <- file.path(tiles_split_base, grid_size)
|
|
} else {
|
|
tile_folder <- tiles_split_base
|
|
}
|
|
|
|
use_tiles <- dir.exists(tile_folder)
|
|
|
|
# Make grid_size available globally for other functions
|
|
assign("grid_size", grid_size, envir = .GlobalEnv)
|
|
|
|
tryCatch({
|
|
if (skip_extraction) {
|
|
log_message("\n===== SKIPPING CI EXTRACTION (all dates already processed) =====")
|
|
} else if (use_tiles) {
|
|
# Use tile-based processing
|
|
log_message(paste("Tile folder detected at", tile_folder))
|
|
log_message("Using tile-based CI extraction")
|
|
|
|
# Call the tile-based extraction function
|
|
process_ci_values_from_tiles(
|
|
dates = dates,
|
|
tile_folder = tile_folder,
|
|
field_boundaries = field_boundaries,
|
|
field_boundaries_sf = field_boundaries_sf,
|
|
daily_CI_vals_dir = daily_CI_vals_dir,
|
|
cumulative_CI_vals_dir = cumulative_CI_vals_dir,
|
|
merged_final_dir = merged_final,
|
|
grid_size = grid_size
|
|
)
|
|
|
|
} else {
|
|
# Use legacy full-extent processing
|
|
log_message("No tiles found. Using legacy full-extent approach")
|
|
|
|
# Use the existing utility function to find satellite images
|
|
existing_files <- find_satellite_images(planet_tif_folder, dates$days_filter)
|
|
log_message(paste("Found", length(existing_files), "raster files for processing"))
|
|
|
|
# Process raster files and create VRT
|
|
vrt_list <- process_satellite_images(existing_files, field_boundaries, merged_final, daily_vrt)
|
|
|
|
# Process and combine CI values
|
|
process_ci_values(dates, field_boundaries, merged_final,
|
|
field_boundaries_sf, daily_CI_vals_dir, cumulative_CI_vals_dir)
|
|
}
|
|
|
|
}, error = function(e) {
|
|
log_message(paste("Error in main processing:", e$message), level = "ERROR")
|
|
stop(e$message)
|
|
})
|
|
|
|
# 6. REBUILD combined_CI_data.rds from ALL daily extracted files
|
|
# -----------------------------------------------
|
|
# This ensures the combined file is complete and up-to-date
|
|
# even if extraction was skipped (because dates already existed)
|
|
# NOTE: Only rebuild if new dates were successfully extracted
|
|
# If all dates were missing from source, skip this step to avoid corrupting the file
|
|
log_message("\n===== HANDLING combined_CI_data.rds =====")
|
|
|
|
if (length(dates_with_source) == 0 && length(missing_extraction) > 0) {
|
|
# All missing dates had no source data - skip combined_CI_data.rds update
|
|
log_message("⚠ No new dates extracted (all source data missing) - skipping combined_CI_data.rds update")
|
|
} else if (skip_extraction) {
|
|
# All dates already extracted - optionally rebuild for consistency
|
|
log_message("✓ All dates already extracted - combined_CI_data.rds is up-to-date")
|
|
} else {
|
|
# New dates were extracted - rebuild combined_CI_data.rds from ALL daily files
|
|
log_message("Rebuilding combined_CI_data.rds from all daily extracted files...")
|
|
|
|
tryCatch({
|
|
if (!dir.exists(daily_CI_vals_dir)) {
|
|
log_message("Daily CI directory does not exist yet", level = "WARNING")
|
|
} else {
|
|
# List ALL daily CI files (not just new ones)
|
|
all_daily_files <- list.files(path = daily_CI_vals_dir, pattern = "^extracted_.*\\.rds$", full.names = TRUE)
|
|
|
|
if (length(all_daily_files) == 0) {
|
|
log_message("No daily CI files found to combine", level = "WARNING")
|
|
} else {
|
|
log_message(paste("Combining all", length(all_daily_files), "daily CI files into combined_CI_data.rds"))
|
|
|
|
# Load and combine ALL daily files (creates complete dataset)
|
|
combined_ci_path <- file.path(cumulative_CI_vals_dir, "combined_CI_data.rds")
|
|
|
|
combined_data <- all_daily_files %>%
|
|
purrr::map(readRDS) %>%
|
|
purrr::list_rbind() %>%
|
|
dplyr::group_by(sub_field)
|
|
|
|
# Save the rebuilt combined data
|
|
saveRDS(combined_data, combined_ci_path)
|
|
|
|
log_message(paste("✓ Rebuilt combined_CI_data.rds with", nrow(combined_data), "total rows"))
|
|
}
|
|
}
|
|
}, error = function(e) {
|
|
log_message(paste("⚠ Error rebuilding combined_CI_data.rds (will skip):", e$message), level = "WARNING")
|
|
log_message(" Note: This is OK - Script 30 will use growth model RDS instead", level = "WARNING")
|
|
})
|
|
}
|
|
}
|
|
|
|
if (sys.nframe() == 0) {
|
|
main()
|
|
}
|