SmartCane/r_app/20_ci_extraction.R

# ============================================================================
# SCRIPT 20: Canopy Index (CI) Extraction from Satellite Imagery
# ============================================================================
# PURPOSE:
#   Extract Canopy Index (CI) from 4-band or 8-band satellite imagery and
#   mask by field boundaries. Supports automatic band detection, cloud masking
#   with UDM2 (8-band), and per-field CI value extraction. Produces both
#   per-field TIFFs and consolidated CI statistics for growth model input.
#
# INPUT DATA:
#   - Source: laravel_app/storage/app/{project}/field_tiles/{FIELD}/{DATE}.tif
#   - Format: GeoTIFF (4-band RGB+NIR from Planet API, or 8-band with UDM2)
#   - Requirement: Field boundaries (pivot.geojson) for masking
#
# OUTPUT DATA:
#   - Destination: laravel_app/storage/app/{project}/field_tiles_CI/{FIELD}/{DATE}.tif
#   - Format: GeoTIFF (5-band: R,G,B,NIR,CI as float32)
#   - Also exports: combined_CI/combined_CI_data.rds (wide format: fields × dates)
#
# USAGE:
#   Rscript 20_ci_extraction.R [end_date] [offset] [project] [data_source]
#
#   Example (Windows PowerShell):
#   & "C:\Program Files\R\R-4.4.3\bin\x64\Rscript.exe" r_app/20_ci_extraction.R 2026-01-02 7 angata merged_tif
#
# PARAMETERS:
#   - end_date: End date for processing (character, YYYY-MM-DD format)
#   - offset: Days to look back from end_date (numeric, default 7)
#   - project: Project name (character) - angata, chemba, xinavane, esa, simba
#   - data_source: Data source directory (character, optional) - "merged_tif" (default), "merged_tif_8b", "merged_final_tif"
#
# CLIENT TYPES:
#   - cane_supply (ANGATA): Yes - core data processing
#   - agronomic_support (AURA): Yes - supports field health monitoring
#
# DEPENDENCIES:
#   - Packages: terra, sf, tidyverse, lubridate, readxl, furrr, future
#   - Utils files: parameters_project.R, 00_common_utils.R, 20_ci_extraction_utils.R
#   - External data: Field boundaries (pivot.geojson), harvest data (harvest.xlsx)
#   - Data directories: field_tiles/, field_tiles_CI/, combined_CI/
#
# NOTES:
#   - CI formula: (NIR - Red) / (NIR + Red); normalized to 0-5 range
#   - 8-band data automatically cloud-masked using UDM2 (band 7-8)
#   - 4-band data assumes clear-sky Planet PSScene imagery
#   - Parallel processing via furrr for speed optimization
#   - Output RDS uses wide format (fields as rows, dates as columns) for growth model
#   - Critical dependency for Script 30 (growth model) and Script 80 (KPIs)
#
# RELATED ISSUES:
#   SC-112: Utilities restructuring
#   SC-108: Core pipeline improvements
#
# ============================================================================


# 1. Load required packages
# -----------------------
suppressPackageStartupMessages({
  library(sf)
  library(terra)
  library(tidyverse)
  library(lubridate)
  library(readxl)
  library(here)
  library(furrr)
  library(future)
})

# 2. Process command line arguments
# ------------------------------
main <- function() {
  # Capture command line arguments
  args <- commandArgs(trailingOnly = TRUE)

  # Process end_date argument
  if (length(args) >= 1 && !is.na(args[1]) && args[1] != "") {
    # Parse date explicitly in YYYY-MM-DD format from command line
    end_date <- as.Date(args[1], format = "%Y-%m-%d")
    if (is.na(end_date)) {
      warning("Invalid end_date provided. Using default (current date).")
      end_date <- Sys.Date()
      #end_date <- "2023-10-01"
    }
  } else {
    end_date <- Sys.Date()
    #end_date <- "2023-10-01"
  }

  # Process offset argument
  if (length(args) >= 2 && !is.na(args[2])) {
    offset <- as.numeric(args[2])
    if (is.na(offset) || offset <= 0) {
      warning("Invalid offset provided. Using default (7 days).")
      offset <- 7
    }
  } else {
    offset <- 7
  }

  # Process project_dir argument
  if (length(args) >= 3 && !is.na(args[3])) {
    project_dir <- as.character(args[3])
  } else if (exists("project_dir", envir = .GlobalEnv)) {
    project_dir <- get("project_dir", envir = .GlobalEnv)
  } else {
    project_dir <- "angata"  # Changed default from "aura" to "esa"
  }

  # Process data_source argument (optional, for specifying merged_tif_8b vs merged_tif vs merged_final_tif)
  if (length(args) >= 4 && !is.na(args[4])) {
    data_source <- as.character(args[4])
    # Validate data_source is a recognized option
    if (!data_source %in% c("merged_tif_8b", "merged_tif", "merged_final_tif")) {
      warning(paste("Data source", data_source, "not in standard list. Using as-is."))
    }
  } else if (exists("data_source", envir = .GlobalEnv)) {
    data_source <- get("data_source", envir = .GlobalEnv)
  } else {
    data_source <- "merged_tif_8b"  # Default to 8-band (newer data with cloud masking)
  }

  # Make project_dir and data_source available globally
  assign("project_dir", project_dir, envir = .GlobalEnv)
  assign("data_source", data_source, envir = .GlobalEnv)

  cat(sprintf("CI Extraction: project=%s, end_date=%s, offset=%d days, data_source=%s\n",
              project_dir, format(end_date, "%Y-%m-%d"), offset, data_source))

  # Set flag to use pivot_2.geojson for ESA (extra fields for yield prediction)
  ci_extraction_script <- TRUE
  assign("ci_extraction_script", ci_extraction_script, envir = .GlobalEnv)

  # 3. Initialize project configuration
  # --------------------------------
  new_project_question <- TRUE

  cat("[DEBUG] Attempting to source r_app/parameters_project.R\n")
  tryCatch({
    source("r_app/parameters_project.R")
    cat("[DEBUG] Successfully sourced r_app/parameters_project.R\n")
  }, error = function(e) {
    cat("[ERROR] Failed to source r_app/parameters_project.R:\n", e$message, "\n")
    stop(e)
  })

  # Load centralized path structure (creates all directories automatically)
  paths <- setup_project_directories(project_dir)

  cat("[DEBUG] Attempting to source r_app/00_common_utils.R\n")
  tryCatch({
    source("r_app/00_common_utils.R")
    cat("[DEBUG] Successfully sourced r_app/00_common_utils.R\n")
  }, error = function(e) {
    cat("[ERROR] Failed to source r_app/00_common_utils.R:\n", e$message, "\n")
    stop(e)
  })

  cat("[DEBUG] Attempting to source r_app/20_ci_extraction_utils.R\n")
  tryCatch({
    source("r_app/20_ci_extraction_utils.R")
    cat("[DEBUG] Successfully sourced r_app/20_ci_extraction_utils.R\n")
  }, error = function(e) {
    cat("[ERROR] Failed to source r_app/20_ci_extraction_utils.R:\n", e$message, "\n")
    stop(e)
  })


  # 4. Generate date list for processing
  # ---------------------------------
  dates <- date_list(end_date, offset)
  log_message(paste("Processing data for week", dates$week, "of", dates$year))

  # 4a. CHECK DAILY CI EXTRACTION - Skip dates that already have extracted files
  # -------------------------------------------------------------------------
  log_message("\n===== CHECKING DAILY CI EXTRACTION STATUS =====")

  # Check which dates already have extracted CI files
  already_extracted <- c()
  missing_extraction <- c()

  if (dir.exists(daily_CI_vals_dir)) {
    existing_ci_files <- list.files(daily_CI_vals_dir, pattern = "^extracted_.*\\.rds$")
    # Extract dates from filenames like "extracted_2025-12-31_quadrant.rds"
    already_extracted <- sub("^extracted_(.+)_.*\\.rds$", "\\1", existing_ci_files)
  }

  # Find which dates in our processing range need extraction
  missing_extraction <- dates$days_filter[!(dates$days_filter %in% already_extracted)]

  cat(sprintf("[CI CHECK] Already extracted: %d dates\n", length(already_extracted)))
  cat(sprintf("[CI CHECK] Need extraction: %d dates (from %s to %s)\n",
              length(missing_extraction),
              if(length(missing_extraction) > 0) min(missing_extraction) else "N/A",
              if(length(missing_extraction) > 0) max(missing_extraction) else "N/A"))

  # If any dates need extraction, we'll extract them
  # If NO dates need extraction, we'll skip extraction but ALWAYS rebuild combined_CI_data.rds
  skip_extraction <- (length(missing_extraction) == 0)

  if (skip_extraction) {
    log_message("✓ All dates in processing range already have extracted CI files - skipping extraction")
    log_message("⚠ Will rebuild combined_CI_data.rds to ensure completeness")
  }

  # 4b. CHECK SOURCE DATA AVAILABILITY
  # ---------------------------------------------------------------
  # Verify that source data exists for dates we're going to extract
  # If a date is missing from source, we'll skip it gracefully
  log_message("\n===== CHECKING SOURCE DATA AVAILABILITY =====")

  dates_with_source <- c()
  dates_missing_source <- c()

  if (!skip_extraction && length(missing_extraction) > 0) {
    # Check which source dates are actually available
    for (date_str in missing_extraction) {
      # Look for the date in merged_tif directory
      source_file_pattern <- sprintf("%s\\.tif$", date_str)
      files_for_date <- list.files(planet_tif_folder, pattern = source_file_pattern)

      if (length(files_for_date) > 0) {
        dates_with_source <- c(dates_with_source, date_str)
      } else {
        dates_missing_source <- c(dates_missing_source, date_str)
      }
    }

    cat(sprintf("[SOURCE CHECK] Dates with available source data: %d\n", length(dates_with_source)))
    cat(sprintf("[SOURCE CHECK] Dates missing from source (will skip): %d\n", length(dates_missing_source)))

    if (length(dates_missing_source) > 0) {
      log_message(paste("⚠ Skipping extraction for missing source dates:", paste(dates_missing_source, collapse = ", ")))
    }
  }

  # 5. Find and filter raster files by date - with grid size detection
  # -----------------------------------
  log_message("Searching for raster files")

  # Check if tiles exist (Script 10 output) - detect grid size dynamically using centralized paths
  tiles_split_base <- paths$daily_tiles_split_dir

  # Detect grid size from daily_tiles_split folder structure
  # Expected structure: daily_tiles_split/5x5/ or daily_tiles_split/10x10/ etc.
  grid_size <- NA
  if (dir.exists(tiles_split_base)) {
    subfolders <- list.dirs(tiles_split_base, full.names = FALSE, recursive = FALSE)
    # Look for grid size patterns like "5x5", "10x10", "20x20"
    grid_patterns <- grep("^\\d+x\\d+$", subfolders, value = TRUE)
    if (length(grid_patterns) > 0) {
      grid_size <- grid_patterns[1]  # Use first grid size found
      log_message(paste("Detected grid size:", grid_size))
    }
  }

  # Construct tile folder path with grid size
  if (!is.na(grid_size)) {
    tile_folder <- file.path(tiles_split_base, grid_size)
  } else {
    tile_folder <- tiles_split_base
  }

  use_tiles <- dir.exists(tile_folder)

  # Make grid_size available globally for other functions
  assign("grid_size", grid_size, envir = .GlobalEnv)

  tryCatch({
    if (skip_extraction) {
      log_message("\n===== SKIPPING CI EXTRACTION (all dates already processed) =====")
    } else if (use_tiles) {
      # Use tile-based processing
      log_message(paste("Tile folder detected at", tile_folder))
      log_message("Using tile-based CI extraction")

      # Call the tile-based extraction function
      process_ci_values_from_tiles(
        dates = dates,
        tile_folder = tile_folder,
        field_boundaries = field_boundaries,
        field_boundaries_sf = field_boundaries_sf,
        daily_CI_vals_dir = daily_CI_vals_dir,
        cumulative_CI_vals_dir = cumulative_CI_vals_dir,
        merged_final_dir = merged_final,
        grid_size = grid_size
      )

    } else {
      # Use legacy full-extent processing
      log_message("No tiles found. Using legacy full-extent approach")

      # Use the existing utility function to find satellite images
      existing_files <- find_satellite_images(planet_tif_folder, dates$days_filter)
      log_message(paste("Found", length(existing_files), "raster files for processing"))

      # Process raster files and create VRT
      vrt_list <- process_satellite_images(existing_files, field_boundaries, merged_final, daily_vrt)

      # Process and combine CI values
      process_ci_values(dates, field_boundaries, merged_final,
                       field_boundaries_sf, daily_CI_vals_dir, cumulative_CI_vals_dir)
    }

  }, error = function(e) {
    log_message(paste("Error in main processing:", e$message), level = "ERROR")
    stop(e$message)
  })

  # 6. REBUILD combined_CI_data.rds from ALL daily extracted files
  # -----------------------------------------------
  # This ensures the combined file is complete and up-to-date
  # even if extraction was skipped (because dates already existed)
  # NOTE: Only rebuild if new dates were successfully extracted
  # If all dates were missing from source, skip this step to avoid corrupting the file
  log_message("\n===== HANDLING combined_CI_data.rds =====")

  if (length(dates_with_source) == 0 && length(missing_extraction) > 0) {
    # All missing dates had no source data - skip combined_CI_data.rds update
    log_message("⚠ No new dates extracted (all source data missing) - skipping combined_CI_data.rds update")
  } else if (skip_extraction) {
    # All dates already extracted - optionally rebuild for consistency
    log_message("✓ All dates already extracted - combined_CI_data.rds is up-to-date")
  } else {
    # New dates were extracted - rebuild combined_CI_data.rds from ALL daily files
    log_message("Rebuilding combined_CI_data.rds from all daily extracted files...")

    tryCatch({
      if (!dir.exists(daily_CI_vals_dir)) {
        log_message("Daily CI directory does not exist yet", level = "WARNING")
      } else {
        # List ALL daily CI files (not just new ones)
        all_daily_files <- list.files(path = daily_CI_vals_dir, pattern = "^extracted_.*\\.rds$", full.names = TRUE)

        if (length(all_daily_files) == 0) {
          log_message("No daily CI files found to combine", level = "WARNING")
        } else {
          log_message(paste("Combining all", length(all_daily_files), "daily CI files into combined_CI_data.rds"))

          # Load and combine ALL daily files (creates complete dataset)
          combined_ci_path <- file.path(paths$cumulative_ci_vals_dir, "combined_CI_data.rds")

          combined_data <- all_daily_files %>%
            purrr::map(readRDS) %>%
            purrr::list_rbind() %>%
            dplyr::group_by(sub_field)

          # Save the rebuilt combined data
          saveRDS(combined_data, combined_ci_path)

          log_message(paste("✓ Rebuilt combined_CI_data.rds with", nrow(combined_data), "total rows"))
        }
      }
    }, error = function(e) {
      log_message(paste("⚠ Error rebuilding combined_CI_data.rds (will skip):", e$message), level = "WARNING")
      log_message("  Note: This is OK - Script 30 will use growth model RDS instead", level = "WARNING")
    })
  }
}

if (sys.nframe() == 0) {
  main()
}