SmartCane/r_app/20_ci_extraction_utils.R

1190 lines
44 KiB
R

# CI_EXTRACTION_UTILS.R
# =====================
# Utility functions for the SmartCane CI (Chlorophyll Index) extraction workflow.
# These functions support date handling, raster processing, and data extraction.
# Includes parallel tile processing using furrr for memory efficiency.
#
# Parallel Processing: Tile-based extraction uses furrr::future_map to process
# multiple tiles simultaneously (typically 2-4 tiles in parallel depending on CPU cores)
#
# Per-Field Functions (Script 20):
# - calc_ci_from_raster(): Calculate CI from 4-band raster (Chlorophyll Index formula: NIR/Green - 1)
# - extract_ci_by_subfield(): Extract per-sub_field CI statistics from raster
#' Generate a sequence of dates for processing
#'
#' @param end_date The end date for the sequence (Date object, or a string
#'   convertible to Date)
#' @param offset Number of days to look back from end_date, inclusive of
#'   end_date itself (must be >= 1)
#' @return A list with the week number and year of the range start, the daily
#'   sequence formatted as "YYYY-MM-DD" strings (days_filter), and the
#'   start/end Date objects
#'
date_list <- function(end_date, offset) {
  # Input validation: coerce to Date, turning conversion errors into NA so the
  # single stop() below covers both cases. (as.Date() *errors* on malformed
  # strings rather than returning NA, so the previous bare is.na() check could
  # never fire for unparseable input.)
  if (!lubridate::is.Date(end_date)) {
    end_date <- tryCatch(as.Date(end_date), error = function(e) as.Date(NA))
    if (is.na(end_date)) {
      stop("Invalid end_date provided. Expected a Date object or a string convertible to Date.")
    }
  }
  # suppressWarnings: as.numeric("abc") warns before yielding NA; the stop()
  # below is the single, explicit failure path
  offset <- suppressWarnings(as.numeric(offset))
  if (is.na(offset) || offset < 1) {
    stop("Invalid offset provided. Expected a positive number.")
  }
  # Calculate date range; offset counts days inclusive of end_date
  offset <- offset - 1
  start_date <- end_date - lubridate::days(offset)
  # Week and year are taken from the START of the range
  week <- lubridate::week(start_date)
  year <- lubridate::year(start_date)
  # Daily sequence, formatted as strings for consistent filename filtering
  days_filter <- seq(from = start_date, to = end_date, by = "day")
  days_filter <- format(days_filter, "%Y-%m-%d")
  # Log the date range
  safe_log(paste("Date range generated from", start_date, "to", end_date))
  return(list(
    "week" = week,
    "year" = year,
    "days_filter" = days_filter,
    "start_date" = start_date,
    "end_date" = end_date
  ))
}
#' Detect band count and structure (4-band vs 8-band with optional UDM)
#'
#' Inspects the layer count of a loaded raster: 4 or 5 layers are treated as
#' the Planet 4-band product (RGB + NIR, optionally with an alpha layer), and
#' 8 or 9 layers as PlanetScope 8-band (the 9th layer, when present, is the
#' UDM1 Usable Data Mask). Any other layer count is an error.
#'
#' @param loaded_raster Loaded raster object
#' @return List with structure info: $type ("4b" or "8b"), $has_udm (logical),
#'   $band_names, and the Red/Green/Blue/NIR/UDM band indices
#'
detect_raster_structure <- function(loaded_raster) {
  n_bands <- terra::nlyr(loaded_raster)
  if (n_bands == 4 || n_bands == 5) {
    # 4-band optimized product: RGB + NIR (cloud-masked server-side by Planet).
    # A 5th alpha layer (sometimes added by GDAL merge) is named but ignored;
    # only the first four bands are ever indexed.
    band_labels <- c("Red", "Green", "Blue", "NIR")
    if (n_bands == 5) {
      band_labels <- c(band_labels, "Alpha")
    }
    list(
      type = "4b",
      has_udm = FALSE,
      band_names = band_labels,
      red_idx = 1, green_idx = 2, blue_idx = 3, nir_idx = 4, udm_idx = NA
    )
  } else if (n_bands == 8 || n_bands == 9) {
    # PlanetScope 8-band layout:
    # 1=Coastal Blue, 2=Blue, 3=Green I, 4=Green, 5=Yellow, 6=Red, 7=Red Edge, 8=NIR
    # A 9th layer, when present, is UDM1 (Usable Data Mask).
    udm_present <- n_bands == 9
    band_labels <- c("CoastalBlue", "Blue", "GreenI", "Green",
                     "Yellow", "Red", "RedEdge", "NIR")
    if (udm_present) {
      band_labels <- c(band_labels, "UDM1")
    }
    list(
      type = "8b",
      has_udm = udm_present,
      band_names = band_labels,
      red_idx = 6, green_idx = 4, blue_idx = 2, nir_idx = 8,
      udm_idx = if (udm_present) 9 else NA
    )
  } else {
    stop(paste("Unexpected number of bands:", n_bands,
               "Expected 4-band, 4-band+alpha, 8-band, or 9-band data"))
  }
}
#' Apply cloud masking for 8-band data with UDM layer
#'
#' Sets cloud pixels (UDM value 2) to NA in every spectral band, leaving clear
#' sky, shadow, snow and water pixels intact. This matches the UDM handling in
#' create_mask_and_crop() and process_single_tile(), which both mask UDM=2 only.
#'
#' @param loaded_raster Raster whose final layer is the UDM band
#' @param udm_idx Index of the UDM band; NA returns the raster unchanged
#' @return Raster with cloud pixels set to NA in all bands except the UDM itself
#'
apply_udm_masking <- function(loaded_raster, udm_idx) {
  if (is.na(udm_idx)) {
    return(loaded_raster)
  }
  # UDM values: 0 = clear sky, 1 = shadow, 2 = cloud, 3 = snow, 4 = water.
  # FIX: mask only UDM == 2 (clouds). The previous implementation kept only
  # UDM == 0 pixels, which contradicted its own comment ("We only mask pixels
  # where UDM = 2") and discarded usable shadow/snow/water pixels, unlike the
  # `udm_band != 2` handling used elsewhere in this file.
  udm_band <- loaded_raster[[udm_idx]]
  keep_mask <- udm_band != 2
  # Apply the mask to every band except the UDM band itself (the last layer)
  for (i in seq_len(terra::nlyr(loaded_raster) - 1)) {
    loaded_raster[[i]][!keep_mask] <- NA
  }
  safe_log(paste("Applied UDM cloud masking to raster (masking UDM=2 clouds only)"))
  return(loaded_raster)
}
#' Create a Chlorophyll Index (CI) mask from satellite imagery and crop to field boundaries
#'
#' Supports both 4-band and 8-band (with optional UDM) Planet Scope data.
#' For 8-band data, automatically applies cloud masking using the UDM layer if
#' present (masking UDM=2 cloud pixels only). Writes a 5-band raster
#' (Red, Green, Blue, NIR, CI) cropped/masked to the field boundaries.
#'
#' @param file Path to the satellite image file
#' @param field_boundaries Field boundaries vector object (sf or SpatVector)
#' @param merged_final_dir Directory to save the processed raster
#' @return Processed raster object with CI band, or NULL on failure
#'
create_mask_and_crop <- function(file, field_boundaries, merged_final_dir) {
  # Validate inputs
  if (!file.exists(file)) {
    stop(paste("File not found:", file))
  }
  if (is.null(field_boundaries)) {
    stop("Field boundaries are required but were not provided")
  }
  # Note: No conversion needed - sf::st_bbox() works with both sf and terra objects
  # field_boundaries is used only for spatial reference (bounding box)
  # Establish output file name. (FIX: removed a dead `vrt_file` computation
  # that referenced an undefined global `daily_vrt` and was never used.)
  basename_no_ext <- tools::file_path_sans_ext(basename(file))
  new_file <- here::here(merged_final_dir, paste0(basename_no_ext, ".tif"))
  # Process with error handling
  tryCatch({
    # Log processing start
    safe_log(paste("Processing", basename(file)))
    # Load and prepare raster
    loaded_raster <- terra::rast(file)
    # Validate raster has necessary bands
    if (terra::nlyr(loaded_raster) < 4) {
      stop("Raster must have at least 4 bands (Red, Green, Blue, NIR)")
    }
    # Detect raster structure (4b vs 8b with optional UDM)
    structure_info <- detect_raster_structure(loaded_raster)
    safe_log(paste("Detected", structure_info$type, "data",
                   if (structure_info$has_udm) "with UDM cloud masking" else "without cloud masking"))
    # Extract the bands we need FIRST so subsequent work touches minimal data
    red_band <- loaded_raster[[structure_info$red_idx]]
    green_band <- loaded_raster[[structure_info$green_idx]]
    blue_band <- loaded_raster[[structure_info$blue_idx]]
    nir_band <- loaded_raster[[structure_info$nir_idx]]
    # Apply cloud masking to the selected bands if a UDM layer exists
    if (structure_info$has_udm) {
      udm_band <- loaded_raster[[structure_info$udm_idx]]
      # Mask only UDM == 2 (clouds); keep clear sky and shadows
      cloud_mask <- udm_band != 2
      red_band[!cloud_mask] <- NA
      green_band[!cloud_mask] <- NA
      blue_band[!cloud_mask] <- NA
      nir_band[!cloud_mask] <- NA
      safe_log("Applied UDM cloud masking to selected bands (masking UDM=2 clouds only)")
    }
    # Name the bands
    names(red_band) <- "Red"
    names(green_band) <- "Green"
    names(blue_band) <- "Blue"
    names(nir_band) <- "NIR"
    # Calculate the index from Green and NIR
    # *** CRITICAL: Use CHLOROPHYLL INDEX formula ONLY ***
    # CORRECT: CI = NIR / Green - 1 (ranges ~1-7, sensitive to active chlorophyll)
    # WRONG: Do NOT use (NIR-Red)/(NIR+Red) - that is NDVI, ranges -1 to 1, different scale
    CI <- nir_band / green_band - 1
    names(CI) <- "CI"
    # Assemble output raster with essential bands: Red, Green, Blue, NIR, CI
    output_raster <- c(red_band, green_band, blue_band, nir_band, CI)
    names(output_raster) <- c("Red", "Green", "Blue", "NIR", "CI")
    # Ensure CRS compatibility before cropping (best-effort; failures only warn)
    tryCatch({
      raster_crs <- terra::crs(output_raster, proj = TRUE)
      raster_crs_char <- as.character(raster_crs)
      # Handle boundaries CRS - works for both terra and sf objects
      if (inherits(field_boundaries, "sf")) {
        boundaries_crs <- sf::st_crs(field_boundaries)
        boundaries_crs_char <- if (!is.na(boundaries_crs)) as.character(boundaries_crs$wkt) else ""
      } else {
        boundaries_crs <- terra::crs(field_boundaries, proj = TRUE)
        boundaries_crs_char <- as.character(boundaries_crs)
      }
      if (length(raster_crs_char) > 0 && length(boundaries_crs_char) > 0 &&
          nchar(raster_crs_char) > 0 && nchar(boundaries_crs_char) > 0) {
        if (raster_crs_char != boundaries_crs_char) {
          # Transform field boundaries to match raster CRS only for terra objects
          if (inherits(field_boundaries, "SpatVector")) {
            field_boundaries <- terra::project(field_boundaries, raster_crs_char)
            safe_log("Transformed field boundaries CRS to match raster CRS")
          } else {
            safe_log("Field boundaries is sf object - CRS transformation skipped")
          }
        }
      } else {
        # If CRS is missing, try to assign a default WGS84 CRS
        if (length(raster_crs_char) == 0 || nchar(raster_crs_char) == 0) {
          terra::crs(output_raster) <- "EPSG:4326"
          safe_log("Assigned default WGS84 CRS to raster")
        }
        if (length(boundaries_crs_char) == 0 || nchar(boundaries_crs_char) == 0) {
          if (inherits(field_boundaries, "SpatVector")) {
            terra::crs(field_boundaries) <- "EPSG:4326"
          } else {
            sf::st_crs(field_boundaries) <- 4326
          }
          safe_log("Assigned default WGS84 CRS to field boundaries")
        }
      }
    }, error = function(e) {
      safe_log(paste("CRS handling warning:", e$message), "WARNING")
    })
    output_raster <- tryCatch({
      # terra::crop can work with both terra and sf objects, but if it fails with sf, try conversion
      terra::crop(output_raster, field_boundaries, mask = TRUE)
    }, error = function(e) {
      # If crop fails (common with certain sf geometries), fall back to a rough
      # rectangular crop using the sf object's bounding box (no polygon mask)
      if (inherits(field_boundaries, "sf")) {
        safe_log(paste("Crop with sf failed, attempting alternative approach:", e$message), "WARNING")
        bbox <- sf::st_bbox(field_boundaries)
        output_raster_cropped <- terra::crop(output_raster,
                                             terra::ext(bbox[1], bbox[3], bbox[2], bbox[4]))
        return(output_raster_cropped)
      } else {
        stop(e)
      }
    })
    # Note: Do NOT replace zeros with NA here - Red/Green/Blue/NIR reflectance can be near zero
    # Only CI can go negative (if NIR < Green), but that's valid vegetation index behavior
    # Write output file
    terra::writeRaster(output_raster, new_file, overwrite = TRUE)
    # Count valid CI pixels for the completion log
    valid_pixels <- terra::global(output_raster$CI, "notNA", na.rm=TRUE)
    # Log completion
    safe_log(paste("Completed processing", basename(file),
                   "- Valid pixels:", valid_pixels[1,]))
    return(output_raster)
  }, error = function(e) {
    err_msg <- paste("Error processing", basename(file), "-", e$message)
    safe_log(err_msg, "ERROR")
    return(NULL)
  }, finally = {
    # Clean up memory
    gc()
  })
}
#' Process a batch of satellite images and create VRT files
#'
#' Runs create_mask_and_crop() on each input file and keeps a VRT entry only
#' for outputs whose CI band has more than min_valid_pixels valid cells; VRT
#' files for rejected outputs are deleted if present.
#'
#' @param files Vector of file paths to process
#' @param field_boundaries Field boundaries vector object for cropping
#' @param merged_final_dir Directory to save processed rasters
#' @param daily_vrt_dir Directory containing VRT files
#' @param min_valid_pixels Minimum number of valid CI pixels for a raster to be kept (default: 100)
#' @return List of valid VRT file paths
#'
process_satellite_images <- function(files, field_boundaries, merged_final_dir, daily_vrt_dir, min_valid_pixels = 100) {
  vrt_list <- list()
  safe_log(paste("Starting batch processing of", length(files), "files"))
  # Process each file
  for (file in files) {
    # Process each raster file
    v_crop <- create_mask_and_crop(file, field_boundaries, merged_final_dir)
    # Skip if processing failed
    if (is.null(v_crop)) {
      next
    }
    # Check whether the CI band has enough valid data.
    # FIX: terra::global() over the full multi-layer raster returns one row per
    # layer, so `valid_data[1,]` previously read the FIRST layer's (Red) count
    # rather than CI. Judge validity on the CI band itself, consistent with the
    # valid-pixel check in create_mask_and_crop().
    valid_data <- terra::global(v_crop$CI, "notNA")
    vrt_file <- here::here(daily_vrt_dir, paste0(tools::file_path_sans_ext(basename(file)), ".vrt"))
    if (valid_data[1,] > min_valid_pixels) {
      vrt_list[[vrt_file]] <- vrt_file
    } else {
      # Remove VRT files with insufficient data
      if (file.exists(vrt_file)) {
        file.remove(vrt_file)
      }
      safe_log(paste("Skipping", basename(file), "- insufficient valid data"), "WARNING")
    }
    # Clean up memory between large rasters
    rm(v_crop)
    gc()
  }
  safe_log(paste("Completed processing", length(vrt_list), "raster files"))
  return(vrt_list)
}
#' Find satellite image files filtered by date
#'
#' Lists every .tif in the folder and keeps those whose path matches any of the
#' supplied date strings, in date-filter order, dropping paths that no longer
#' exist on disk.
#'
#' @param tif_folder Directory containing satellite imagery files
#' @param dates_filter Character vector of dates in YYYY-MM-DD format
#' @return Vector of existing file paths matching the date filter
#'
find_satellite_images <- function(tif_folder, dates_filter) {
  # Collect candidate raster files
  all_tifs <- list.files(tif_folder, full.names = TRUE, pattern = "\\.tif$")
  if (length(all_tifs) == 0) {
    stop("No raster files found in directory: ", tif_folder)
  }
  # For each date, keep the files whose path contains it; concatenating the
  # per-date results preserves the order of dates_filter
  matches_per_date <- lapply(dates_filter, function(one_date) {
    all_tifs[grepl(pattern = one_date, x = all_tifs)]
  })
  candidate_files <- unlist(matches_per_date, use.names = FALSE)
  if (is.null(candidate_files)) {
    candidate_files <- character(0)
  }
  # Drop any paths that have disappeared since listing
  existing_files <- candidate_files[file.exists(candidate_files)]
  # Fail loudly if nothing matched the requested dates
  if (length(existing_files) == 0) {
    stop("No files found matching the date filter: ", paste(dates_filter, collapse = ", "))
  }
  return(existing_files)
}
#' Extract date from file path
#'
#' Pulls the first YYYY-MM-DD substring out of a file path, warning when the
#' path contains no such pattern.
#'
#' @param file_path Path to the file
#' @return Extracted date string in YYYY-MM-DD format, or NA if none was found
#'
date_extract <- function(file_path) {
  extracted_date <- stringr::str_extract(file_path, "\\d{4}-\\d{2}-\\d{2}")
  if (is.na(extracted_date)) {
    warning(paste("Could not extract date from file path: ", file_path))
  }
  extracted_date
}
#' Extract CI values from a raster for each field or subfield
#'
#' Loads a processed raster (must contain a "CI" band), crops it to the
#' bounding box of the field boundaries to limit memory use, computes the
#' per-polygon mean CI with terra::extract(), and saves the result as an RDS
#' file named extracted_<date>_<suffix>.rds.
#'
#' @param file Path to the raster file; its name must contain a YYYY-MM-DD date
#' @param field_geojson Field boundaries as SF object (coerced via st_as_sf if not)
#' @param quadrants Boolean; only changes the output file-name suffix
#'   ("quadrant" vs "whole_field") -- the extraction itself is per polygon either way
#' @param save_dir Directory to save the extracted values
#' @return Path to the saved RDS file, or NULL if extraction failed
#'
extract_rasters_daily <- function(file, field_geojson, quadrants = TRUE, save_dir) {
# Validate inputs
if (!file.exists(file)) {
stop(paste("File not found: ", file))
}
if (!inherits(field_geojson, "sf") && !inherits(field_geojson, "sfc")) {
field_geojson <- sf::st_as_sf(field_geojson)
}
# Extract date from file path (used for logging, the value column name, and the output file name)
date <- date_extract(file)
if (is.na(date)) {
stop(paste("Could not extract date from file path:", file))
}
# Log extraction start
safe_log(paste("Extracting CI values for", date, "- Using quadrants:", quadrants))
# Process with error handling
tryCatch({
# Load raster
x <- terra::rast(file)
# Check if CI band exists
if (!"CI" %in% names(x)) {
stop("CI band not found in raster")
}
# Get raster info for logging (dimensions and memory scale)
raster_cells <- terra::ncell(x$CI)
raster_size_mb <- (raster_cells * 8) / (1024 * 1024) # approximate MB for double precision
safe_log(paste(" Raster size:", format(raster_cells, big.mark=","), "cells (~",
round(raster_size_mb, 1), "MB)"))
# Crop raster to field boundaries extent BEFORE extraction
# This reduces memory usage by working with a smaller spatial subset;
# snap = "out" expands the crop window so boundary pixels are not lost
field_bbox <- sf::st_bbox(field_geojson)
x_cropped <- terra::crop(x, terra::ext(field_bbox), snap = "out")
cropped_cells <- terra::ncell(x_cropped$CI)
cropped_mb <- (cropped_cells * 8) / (1024 * 1024)
safe_log(paste(" After crop:", format(cropped_cells, big.mark=","),
"cells (~", round(cropped_mb, 1), "MB)"))
# Extract statistics using terra::extract (memory-efficient, works with sf directly)
# Returns a data.frame: column 1 = polygon ID, column 2 = mean CI per polygon
extracted_vals <- terra::extract(x_cropped$CI, field_geojson, fun = "mean", na.rm = TRUE)
# Build result: boundary attribute columns plus one value column named after
# the date (rename uses rlang glue syntax, so the column is "YYYY-MM-DD")
pivot_stats <- field_geojson %>%
sf::st_drop_geometry() %>%
mutate(mean_CI = round(extracted_vals[, 2], 2)) %>%
dplyr::rename("{date}" := mean_CI)
# Determine save path (suffix distinguishes quadrant vs whole-field runs)
save_suffix <- if (quadrants) {"quadrant"} else {"whole_field"}
save_path <- here::here(save_dir, paste0("extracted_", date, "_", save_suffix, ".rds"))
# Save extracted data
saveRDS(pivot_stats, save_path)
# Log success
safe_log(paste("Successfully extracted and saved CI values for", date))
return(save_path)
}, error = function(e) {
# On failure, log and return NULL so batch callers can skip this file
err_msg <- paste("Error extracting CI values for", date, "-", e$message)
safe_log(err_msg, "ERROR")
return(NULL)
})
}
#' Combine daily CI values into a single dataset
#'
#' Reads every extracted_*.rds file in the daily values directory, row-binds
#' them (columns may differ across dates; list_rbind fills missing cells with
#' NA), and groups the result by sub_field.
#'
#' @param daily_CI_vals_dir Directory containing daily CI value RDS files
#' @param output_file Optional path; when non-NULL the combined data is saved there
#' @return Combined dataset as a tibble, grouped by sub_field
#'
combine_ci_values <- function(daily_CI_vals_dir, output_file = NULL) {
  # List all RDS files in the daily CI values directory
  files <- list.files(path = daily_CI_vals_dir, pattern = "^extracted_.*\\.rds$", full.names = TRUE)
  if (length(files) == 0) {
    # FIX: stop() concatenates its arguments with no separator, so a space is
    # needed after the colon (consistent with find_satellite_images())
    stop("No extracted CI values found in directory: ", daily_CI_vals_dir)
  }
  # Log process start
  safe_log(paste("Combining", length(files), "CI value files"))
  # Load and combine all files
  combined_data <- files %>%
    purrr::map(readRDS) %>%
    purrr::list_rbind() %>%
    dplyr::group_by(sub_field)
  # Save if output file is specified
  if (!is.null(output_file)) {
    saveRDS(combined_data, output_file)
    safe_log(paste("Combined CI values saved to", output_file))
  }
  return(combined_data)
}
#' Update existing CI data with new values
#'
#' If the target file does not exist it is created from new_data; otherwise the
#' existing data is loaded, row-bound after new_data, de-duplicated with
#' distinct(), regrouped by sub_field, and re-saved.
#'
#' @param new_data New CI data to be added
#' @param existing_data_file Path to the existing data file (RDS)
#' @return Updated combined dataset, grouped by sub_field
#'
update_ci_data <- function(new_data, existing_data_file) {
if (!file.exists(existing_data_file)) {
# File doesn't exist - create it with new data
safe_log(paste("Creating new CI data file:", existing_data_file))
# Ensure directory exists
dir.create(dirname(existing_data_file), recursive = TRUE, showWarnings = FALSE)
# Save new data
saveRDS(new_data, existing_data_file)
safe_log(paste("New CI data file created:", existing_data_file))
return(new_data)
}
# File exists - load existing data and combine
existing_data <- readRDS(existing_data_file)
# new_data is bound FIRST and distinct() keeps the first occurrence, so rows
# that are byte-identical duplicates resolve in favor of the newer data.
# NOTE(review): distinct() only removes exact duplicate rows -- two rows with
# the same sub_field but any differing value are BOTH retained; confirm
# whether key-based deduplication ("keep newer values per key") is intended.
combined_data <- dplyr::bind_rows(new_data, existing_data) %>%
dplyr::distinct() %>%
dplyr::group_by(sub_field)
# Save updated data
saveRDS(combined_data, existing_data_file)
safe_log(paste("Updated CI data saved to", existing_data_file))
return(combined_data)
}
#' Process and combine CI values from raster files
#'
#' Orchestrates per-field CI extraction in one of two modes:
#' - First run (combined_CI_data.rds absent): extracts from ALL rasters in
#'   merged_final_dir with quadrants = FALSE, then creates the combined file.
#' - Incremental run (file present): extracts only rasters whose file names
#'   match the current date range with quadrants = TRUE, then appends them to
#'   the combined file via update_ci_data().
#'
#' @param dates List of dates from date_list()
#' @param field_boundaries Field boundaries as vector object.
#'   NOTE(review): not referenced anywhere in this function body -- confirm
#'   whether it is reserved for future use or can be dropped by callers.
#' @param merged_final_dir Directory with processed raster files
#' @param field_boundaries_sf Field boundaries as SF object
#' @param daily_CI_vals_dir Directory to save daily CI values
#' @param cumulative_CI_vals_dir Directory to save cumulative CI values
#' @return NULL (used for side effects)
#'
process_ci_values <- function(dates, field_boundaries, merged_final_dir,
field_boundaries_sf, daily_CI_vals_dir,
cumulative_CI_vals_dir) {
# Find processed raster files
raster_files <- list.files(merged_final_dir, full.names = TRUE, pattern = "\\.tif$")
# Define path for combined CI data
combined_ci_path <- here::here(cumulative_CI_vals_dir, "combined_CI_data.rds")
# Check if the combined CI data file exists
if (!file.exists(combined_ci_path)) {
# Process all available data if file doesn't exist
safe_log("combined_CI_data.rds does not exist. Creating new file with all available data.")
safe_log(paste("Processing", length(raster_files), "raster files"))
# Extract data from all raster files with error handling; each file becomes
# an individual extracted_<date>_whole_field.rds in daily_CI_vals_dir
tryCatch({
purrr::walk(
raster_files,
extract_rasters_daily,
field_geojson = field_boundaries_sf,
quadrants = FALSE,
save_dir = daily_CI_vals_dir
)
safe_log("Extraction complete for all raster files")
}, error = function(e) {
safe_log(paste("Error during extraction walk:", e$message), "ERROR")
})
# Combine all extracted data into the cumulative RDS file
tryCatch({
pivot_stats <- combine_ci_values(daily_CI_vals_dir, combined_ci_path)
safe_log("All CI values extracted from historic images and saved.")
}, error = function(e) {
safe_log(paste("Error combining CI values:", e$message), "ERROR")
stop(e$message)
})
} else {
# Process only the latest data and add to existing file
safe_log("combined_CI_data.rds exists, adding the latest image data.")
# Keep only files whose names contain a date in the current range
filtered_files <- purrr::map(dates$days_filter, ~ raster_files[grepl(pattern = .x, x = raster_files)]) %>%
purrr::compact() %>%
purrr::flatten_chr()
safe_log(paste("Processing", length(filtered_files), "new raster files"))
# Extract data for the new files with error handling
tryCatch({
purrr::walk(
filtered_files,
extract_rasters_daily,
field_geojson = field_boundaries_sf,
quadrants = TRUE,
save_dir = daily_CI_vals_dir
)
safe_log("Extraction complete for new files")
}, error = function(e) {
safe_log(paste("Error during extraction walk:", e$message), "ERROR")
})
# Filter extracted values files by the current date range
extracted_values <- list.files(daily_CI_vals_dir, full.names = TRUE)
extracted_values <- purrr::map(dates$days_filter, ~ extracted_values[grepl(pattern = .x, x = extracted_values)]) %>%
purrr::compact() %>%
purrr::flatten_chr()
safe_log(paste("Found", length(extracted_values), "extracted value files to combine"))
# Combine new values (grouped by sub_field, matching combine_ci_values())
new_pivot_stats <- extracted_values %>%
purrr::map(readRDS) %>%
purrr::list_rbind() %>%
dplyr::group_by(sub_field)
# Update the combined data file
update_ci_data(new_data = new_pivot_stats, existing_data_file =combined_ci_path)
safe_log("CI values from latest images added to combined_CI_data.rds")
}
}
#' Process CI values from pre-split tiles (Script 01 output)
#'
#' This function processes CI values from tiles instead of full-extent rasters.
#' Tiles are created by Script 01 and stored in daily_tiles_split/[GRID_SIZE]/[DATE]/ folders.
#' For each field, it aggregates CI statistics from all tiles that intersect that field.
#'
#' NOTE: Processes dates SEQUENTIALLY but tiles WITHIN EACH DATE in parallel (furrr)
#' This avoids worker process communication issues while still getting good speedup.
#'
#' Two modes, mirroring process_ci_values():
#' - First run (combined_CI_data.rds absent): back-fills ALL discovered tile dates.
#' - Incremental run (file present): processes only dates in the current range.
#' In both modes a date is skipped if merged_final_dir already contains output
#' tiles for it.
#'
#' @param dates List of dates from date_list()
#' @param tile_folder Path to the tile folder (daily_tiles_split/[GRID_SIZE])
#' @param field_boundaries Field boundaries as vector object.
#'   NOTE(review): not referenced in this function body -- confirm intent.
#' @param field_boundaries_sf Field boundaries as SF object
#' @param daily_CI_vals_dir Directory to save daily CI values
#' @param cumulative_CI_vals_dir Directory to save cumulative CI values
#' @param merged_final_dir Base directory checked for already-processed output tiles
#' @param grid_size Grid size label (e.g., "5x5", "10x10") for output path structure
#' @return NULL (used for side effects)
#'
process_ci_values_from_tiles <- function(dates, tile_folder, field_boundaries,
field_boundaries_sf, daily_CI_vals_dir,
cumulative_CI_vals_dir, merged_final_dir,
grid_size = NA) {
# Define path for combined CI data
combined_ci_path <- here::here(cumulative_CI_vals_dir, "combined_CI_data.rds")
# Discover all dates with tiles (one sub-directory per date)
tile_dates <- list.dirs(tile_folder, full.names = FALSE, recursive = FALSE)
# NOTE(review): this exclusion is hard-coded to the 5x5 grid file name and
# would not catch e.g. "master_grid_10x10.geojson"; it is likely defensive
# only, since list.dirs() returns directories -- confirm.
tile_dates <- tile_dates[tile_dates != "master_grid_5x5.geojson"] # Remove non-date entries
tile_dates <- sort(tile_dates)
# Filter to dates in current processing range
dates_to_process <- tile_dates[tile_dates %in% dates$days_filter]
if (length(dates_to_process) == 0) {
safe_log("No tile dates found in processing date range", "WARNING")
return(invisible(NULL))
}
safe_log(paste("Found", length(dates_to_process), "date(s) with tiles"))
# Check if the combined CI data file exists
if (!file.exists(combined_ci_path)) {
safe_log("combined_CI_data.rds does not exist. Creating new file with all available tile data.")
# Process all tile dates SEQUENTIALLY but with parallel tile processing.
# Tiles within each date are processed in parallel via extract_ci_from_tiles().
# NOTE(review): this branch deliberately iterates ALL tile_dates (back-fill),
# not just dates_to_process -- but the early return above still fires when
# the current range matches nothing; confirm that interaction is intended.
all_pivot_stats <- list()
for (i in seq_along(tile_dates)) {
date <- tile_dates[i]
# SKIP: Check if this date already has processed output tiles
if (!is.na(grid_size)) {
output_date_folder <- file.path(merged_final_dir, grid_size, date)
} else {
output_date_folder <- file.path(merged_final_dir, date)
}
if (dir.exists(output_date_folder)) {
existing_tiles <- list.files(output_date_folder, pattern = "\\.tif$")
if (length(existing_tiles) > 0) {
safe_log(paste("[", i, "/", length(tile_dates), "] SKIP:", date, "- already has", length(existing_tiles), "tiles"))
next
}
}
safe_log(paste("[", i, "/", length(tile_dates), "] Processing tiles for date:", date))
date_tile_dir <- file.path(tile_folder, date)
tile_files <- list.files(date_tile_dir, pattern = "\\.tif$", full.names = TRUE)
if (length(tile_files) == 0) {
safe_log(paste(" No tile files found for", date), "WARNING")
next
}
safe_log(paste(" Found", length(tile_files), "tiles - processing in parallel"))
# Process all tiles for this date and aggregate to fields
# Tiles are processed in parallel via furrr::future_map() inside extract_ci_from_tiles()
date_stats <- extract_ci_from_tiles(
tile_files = tile_files,
date = date,
field_boundaries_sf = field_boundaries_sf,
daily_CI_vals_dir = daily_CI_vals_dir,
grid_size = grid_size
)
if (!is.null(date_stats)) {
all_pivot_stats[[date]] <- date_stats
}
}
# Combine all dates
if (length(all_pivot_stats) > 0) {
# Use bind_rows() to handle varying column names across dates gracefully
combined_stats <- dplyr::bind_rows(all_pivot_stats, .id = NULL)
rownames(combined_stats) <- NULL
# Save combined data
saveRDS(combined_stats, combined_ci_path)
safe_log("All tile CI values extracted and combined_CI_data.rds created")
} else {
safe_log("No tile data was processed", "WARNING")
}
} else {
# Process only new dates SEQUENTIALLY but with parallel tile processing
safe_log("combined_CI_data.rds exists, adding new tile data.")
# NOTE(review): this check is unreachable -- the identical condition already
# returned before the branch split above.
if (length(dates_to_process) == 0) {
safe_log("No new dates to process", "WARNING")
return(invisible(NULL))
}
safe_log(paste("Processing", length(dates_to_process), "new dates..."))
new_pivot_stats_list <- list()
for (i in seq_along(dates_to_process)) {
date <- dates_to_process[i]
# SKIP: Check if this date already has processed output tiles
if (!is.na(grid_size)) {
output_date_folder <- file.path(merged_final_dir, grid_size, date)
} else {
output_date_folder <- file.path(merged_final_dir, date)
}
if (dir.exists(output_date_folder)) {
existing_tiles <- list.files(output_date_folder, pattern = "\\.tif$")
if (length(existing_tiles) > 0) {
safe_log(paste("[", i, "/", length(dates_to_process), "] SKIP:", date, "- already has", length(existing_tiles), "tiles"))
next
}
}
safe_log(paste("[", i, "/", length(dates_to_process), "] Processing tiles for date:", date))
date_tile_dir <- file.path(tile_folder, date)
tile_files <- list.files(date_tile_dir, pattern = "\\.tif$", full.names = TRUE)
if (length(tile_files) == 0) {
safe_log(paste(" No tile files found for", date), "WARNING")
next
}
safe_log(paste(" Found", length(tile_files), "tiles - processing in parallel"))
# Extract CI from tiles for this date
date_stats <- extract_ci_from_tiles(
tile_files = tile_files,
date = date,
field_boundaries_sf = field_boundaries_sf,
daily_CI_vals_dir = daily_CI_vals_dir,
grid_size = grid_size
)
if (!is.null(date_stats)) {
new_pivot_stats_list[[date]] <- date_stats
}
}
# Combine new data
if (length(new_pivot_stats_list) > 0) {
# Use bind_rows() to handle varying column names across dates gracefully
new_pivot_stats <- dplyr::bind_rows(new_pivot_stats_list, .id = NULL)
rownames(new_pivot_stats) <- NULL
# Update combined file
update_ci_data(new_pivot_stats, combined_ci_path)
safe_log("Tile CI values from new dates added to combined_CI_data.rds")
} else {
safe_log("No new tile dates had valid data", "WARNING")
}
}
}
#' Process a single tile file, extract CI, and compute field-level statistics
#'
#' Helper function for parallel processing of tiles. For each tile:
#' 1. Loads the tile and detects its band structure (4b vs 8b)
#' 2. Applies UDM cloud masking (UDM=2 clouds only) when a UDM band is present
#' 3. Computes the CI band inline (NIR / Green - 1)
#' 4. Extracts the per-field mean CI from the CI band
#' The processed tile raster is intentionally NOT written to disk (see NOTE in
#' the body); earlier roxygen describing a merged_final_tif_dir save step was
#' out of date and has been removed.
#'
#' @param tile_file Path to a single tile TIF file
#' @param field_boundaries_sf Field boundaries as SF object
#' @param date Character string of the date (YYYY-MM-DD format).
#'   NOTE(review): not used inside the body -- kept for call-site compatibility.
#' @param grid_size Grid size label (e.g., "5x5", "10x10").
#'   NOTE(review): not used inside the body -- kept for call-site compatibility.
#' @return Data frame with field CI statistics for this tile, or NULL if processing failed
#'
process_single_tile <- function(tile_file, field_boundaries_sf, date, grid_size = NA) {
tryCatch({
tile_filename <- basename(tile_file)
safe_log(paste(" [TILE] Loading:", tile_filename))
# Load tile
tile_rast <- terra::rast(tile_file)
# Determine if this is 4-band or 8-band data
raster_info <- detect_raster_structure(tile_rast)
# Extract the bands we need
red_band <- tile_rast[[raster_info$red_idx]]
green_band <- tile_rast[[raster_info$green_idx]]
blue_band <- tile_rast[[raster_info$blue_idx]]
nir_band <- tile_rast[[raster_info$nir_idx]]
# Apply cloud masking if UDM exists
if (raster_info$has_udm) {
udm_band <- tile_rast[[raster_info$udm_idx]]
cloud_mask <- udm_band != 2 # Mask only UDM=2 (clouds)
red_band[!cloud_mask] <- NA
green_band[!cloud_mask] <- NA
blue_band[!cloud_mask] <- NA
nir_band[!cloud_mask] <- NA
}
# Name the bands
names(red_band) <- "Red"
names(green_band) <- "Green"
names(blue_band) <- "Blue"
names(nir_band) <- "NIR"
# Create CI band inline: NIR/Green - 1 (Chlorophyll Index)
ci_band <- nir_band / green_band - 1
names(ci_band) <- "CI"
# Create output raster with Red, Green, Blue, NIR, CI
# NOTE(review): output_raster is assembled but never used below since the
# tile-saving step was removed -- candidate for deletion.
output_raster <- c(red_band, green_band, blue_band, nir_band, ci_band)
names(output_raster) <- c("Red", "Green", "Blue", "NIR", "CI")
# NOTE: Do NOT save processed tile - it's an intermediate only
# The purpose is to calculate field-level CI statistics, not to create permanent tile files
# This prevents bloat in merged_final_tif/ directory (would unnecessarily duplicate
# daily_tiles_split data with an extra CI band added)
# Extract statistics per field from CI band (crop to the field bbox first;
# snap = "out" keeps boundary pixels)
field_bbox <- sf::st_bbox(field_boundaries_sf)
ci_cropped <- terra::crop(ci_band, terra::ext(field_bbox), snap = "out")
extracted_vals <- terra::extract(ci_cropped, field_boundaries_sf, fun = "mean", na.rm = TRUE)
# Build statistics data frame for this tile: boundary attributes plus the
# per-field mean CI and the source tile name (used by the aggregation step)
tile_stats <- field_boundaries_sf %>%
sf::st_drop_geometry() %>%
mutate(mean_CI = round(extracted_vals[, 2], 2)) %>%
mutate(tile_file = basename(tile_file))
return(tile_stats)
}, error = function(e) {
# Log as a warning and return NULL so the caller can drop this tile
safe_log(paste(" Error processing tile", basename(tile_file), "-", e$message), "WARNING")
return(NULL)
})
}
#' Extract CI values from multiple tiles and aggregate to fields
#'
#' Given a set of tile files for a single date, this function:
#' 1. Processes each tile IN PARALLEL via furrr (see process_single_tile())
#' 2. Collects per-field mean CI from each tile's CI band
#' 3. Aggregates field statistics across tiles (mean of the tile means)
#' 4. Saves an individual date file (matching the legacy workflow format)
#'
#' Parallel processing: a multisession plan with 3 workers is installed for the
#' duration of the call and the caller's previous plan is restored on exit.
#' Falls back to sequential lapply() if the parallel run fails.
#'
#' @param tile_files Character vector of full paths to tile TIF files
#' @param date Character string of the date (YYYY-MM-DD format)
#' @param field_boundaries_sf Field boundaries as SF object (coerced if not)
#' @param daily_CI_vals_dir Directory to save individual date RDS files
#'   (NULL = do not save a per-date file)
#' @param grid_size Grid size label (e.g., "5x5"), forwarded to process_single_tile()
#' @return Data frame with field CI statistics for the date, or NULL if no tile succeeded
#'
extract_ci_from_tiles <- function(tile_files, date, field_boundaries_sf, daily_CI_vals_dir = NULL, grid_size = NA) {
  if (!inherits(field_boundaries_sf, "sf")) {
    field_boundaries_sf <- sf::st_as_sf(field_boundaries_sf)
  }
  safe_log(paste(" Processing", length(tile_files), "tiles for date", date, "(3-tile parallel batch)"))
  # Windows-compatible parallelization: multisession with 3 workers is stable
  # and efficient here.
  # FIX: capture the caller's plan and restore it on exit -- previously the
  # global future topology was silently clobbered for the rest of the session.
  old_plan <- future::plan(future::multisession, workers = 3)
  on.exit(future::plan(old_plan), add = TRUE)
  # Process tiles in parallel; seed = TRUE keeps furrr's RNG bookkeeping happy
  stats_list <- tryCatch({
    furrr::future_map(
      tile_files,
      ~ process_single_tile(.x, field_boundaries_sf, date, grid_size = grid_size),
      .progress = FALSE,
      .options = furrr::furrr_options(seed = TRUE)
    )
  }, error = function(e) {
    safe_log(paste("Parallel processing failed:", e$message, "- falling back to sequential"), "WARNING")
    # Fallback to sequential if parallel fails
    lapply(
      tile_files,
      function(tile_file) {
        process_single_tile(tile_file, field_boundaries_sf, date, grid_size = grid_size)
      }
    )
  })
  # Drop failed tiles (NULL results). Compute the mask once (was computed twice
  # before) and use type-stable vapply instead of sapply.
  failed <- vapply(stats_list, is.null, logical(1))
  all_stats <- stats_list[!failed]
  names(all_stats) <- basename(tile_files)[!failed]
  if (length(all_stats) == 0) {
    return(NULL)
  }
  # Combine all tiles; bind_rows() handles column-name inconsistencies gracefully
  combined_tiles <- dplyr::bind_rows(all_stats)
  rownames(combined_tiles) <- NULL
  # Aggregate: for each field (grouping on every column except the per-tile
  # ones), average the per-tile mean CI. The value column is named after the date.
  aggregated <- combined_tiles %>%
    group_by(across(-c(mean_CI, tile_file))) %>%
    summarise(!!date := round(mean(mean_CI, na.rm = TRUE), 2), .groups = "drop")
  # Save individual date file (legacy format: extracted_YYYY-MM-DD_quadrant.rds)
  if (!is.null(daily_CI_vals_dir)) {
    save_path <- file.path(daily_CI_vals_dir, paste0("extracted_", date, "_quadrant.rds"))
    saveRDS(aggregated, save_path)
    # FIX: paste0 so the logged file name matches the real one, without the
    # spurious spaces paste() inserted around the date
    safe_log(paste0("[RDS SAVED] Date: ", date, " -> File: extracted_", date, "_quadrant.rds"))
  }
  return(aggregated)
}
# =============================================================================
# Script 20 (Per-Field) Specific Functions
# =============================================================================
#' Calculate Chlorophyll Index (CI) from a 4-band raster
#'
#' *** CRITICAL FORMULA: CI = NIR / Green - 1 ***
#' This is the CHLOROPHYLL INDEX formula (ranges ~1-7 for vegetation).
#' NOT NDVI! Do NOT use (NIR-Red)/(NIR+Red) - that produces -1 to 1 range.
#' (Title previously said "Canopy Index"; the formula, this file's header,
#' and the rest of the workflow consistently mean Chlorophyll Index.)
#'
#' Expects band order: Red (band 1), Green (band 2), Blue (band 3), NIR (band 4)
#'
#' @param raster_obj Loaded raster object with at least 4 bands
#' @return Raster object containing CI values (Chlorophyll Index, ranges ~1-7)
#'
calc_ci_from_raster <- function(raster_obj) {
  # Expected band order: Red (1), Green (2), Blue (3), NIR (4)
  if (terra::nlyr(raster_obj) < 4) {
    stop("Raster has fewer than 4 bands. Cannot calculate CI.")
  }
  green <- terra::subset(raster_obj, 2) # Green band (CI denominator)
  nir <- terra::subset(raster_obj, 4) # NIR band (CI numerator)
  # *** CHLOROPHYLL INDEX = NIR / Green - 1 ***
  # Sensitive to active chlorophyll content; typical vegetation range ~1-7.
  # DO NOT use (NIR-Red)/(NIR+Red): that is NDVI (range -1 to 1), a
  # different index entirely.
  ci <- nir / green - 1
  return(ci)
}
#' Extract CI statistics from a per-field CI raster
#'
#' Computes CI statistics (mean, median, sd, min, max, count) over all valid
#' pixels of a CI raster. NOTE: per-field TIFFs are already cropped to field
#' boundaries by Script 10, so NO polygon masking is performed here;
#' field_boundaries_sf is accepted for interface compatibility but is not
#' used. (The previous doc incorrectly claimed per-sub_field masking.)
#'
#' @param ci_raster Raster object containing CI values
#' @param field_boundaries_sf SF object containing field/sub_field polygons
#'   (currently unused; kept for backward compatibility)
#' @param field_name Character string of the field name to process
#' @return One-row data frame with field, sub_field, and CI statistics;
#'   statistics are NA (and ci_count is 0) when no valid pixels exist
#'
extract_ci_by_subfield <- function(ci_raster, field_boundaries_sf, field_name) {
  # Per-field TIFFs are already cropped to field boundaries by Script 10,
  # so just extract all pixel values from the raster.
  ci_values <- as.vector(terra::values(ci_raster, na.rm = TRUE))
  # BUGFIX: explicitly drop NA cells so ci_count reflects valid pixels only
  # and min()/max() cannot return +/-Inf (with warnings) when every cell is
  # NA but the vector is non-empty.
  ci_values <- ci_values[!is.na(ci_values)]
  n_valid <- length(ci_values)
  if (n_valid > 0) {
    result_row <- data.frame(
      field = field_name,
      sub_field = field_name, # TIFF is already field-specific
      ci_mean = mean(ci_values),
      ci_median = median(ci_values),
      ci_sd = sd(ci_values),
      ci_min = min(ci_values),
      ci_max = max(ci_values),
      ci_count = n_valid,
      stringsAsFactors = FALSE
    )
  } else {
    # No valid pixels: return an all-NA row so downstream binds still work
    result_row <- data.frame(
      field = field_name,
      sub_field = field_name,
      ci_mean = NA_real_,
      ci_median = NA_real_,
      ci_sd = NA_real_,
      ci_min = NA_real_,
      ci_max = NA_real_,
      ci_count = 0,
      stringsAsFactors = FALSE
    )
  }
  return(result_row)
}
#' Extract RDS from existing CI TIFF (Migration/Regeneration Mode)
#'
#' Extracts CI statistics from an already-calculated CI TIFF without
#' recalculating from raw 4-band imagery. Used during migration when
#' field_tiles_CI/ exists but daily_vals/{FIELD}/ is missing.
#'
#' Expected band structure: [1]=R, [2]=G, [3]=B, [4]=NIR, [5]=CI
#'
#' @param ci_tiff_path Path to the 5-band TIFF containing CI as the 5th band
#' @param output_rds_path Path where to save the output RDS file
#' @param field_boundaries_sf SF object with field/sub_field polygons
#' @param field_name Name of the field to extract
#' @return The statistics data frame (invisibly) after saving to disk, or
#'   invisible(NULL) on failure (errors are logged, not re-thrown)
#'
extract_rds_from_ci_tiff <- function(ci_tiff_path, output_rds_path, field_boundaries_sf, field_name) {
  tryCatch({
    # Load the TIFF and validate the band structure BEFORE indexing band 5;
    # previously a malformed (e.g. 4-band) file failed with an opaque
    # subscript error instead of a diagnosable message.
    raster_5band <- terra::rast(ci_tiff_path)
    n_bands <- terra::nlyr(raster_5band)
    if (n_bands < 5) {
      stop(sprintf("Expected >= 5 bands (CI in band 5) but found %d in %s",
                   n_bands, ci_tiff_path))
    }
    # CI is the 5th band: [1]=R, [2]=G, [3]=B, [4]=NIR, [5]=CI
    ci_raster <- raster_5band[[5]]
    # Extract CI statistics by sub_field
    ci_stats <- extract_ci_by_subfield(ci_raster, field_boundaries_sf, field_name)
    # Save RDS only when statistics were actually produced
    if (!is.null(ci_stats) && nrow(ci_stats) > 0) {
      saveRDS(ci_stats, output_rds_path)
      return(invisible(ci_stats))
    } else {
      safe_log(sprintf("No CI statistics extracted from %s", ci_tiff_path), "WARNING")
      return(invisible(NULL))
    }
  }, error = function(e) {
    safe_log(sprintf("Error extracting RDS from CI TIFF: %s", e$message), "ERROR")
    return(invisible(NULL))
  })
}
#' Regenerate ALL missing RDS files from existing CI TIFFs (Comprehensive Migration Mode)
#'
#' Processes ALL dates in field_tiles_CI/ and extracts RDS for any missing
#' daily_vals/ files. No date window filtering - the entire historical
#' archive is covered.
#'
#' Used for one-time migration of old projects where field_tiles_CI/ is
#' populated but daily_vals/ RDS files are missing or incomplete.
#'
#' @param field_tiles_ci_dir Path to field_tiles_CI/ (input: pre-calculated CI TIFFs)
#' @param daily_vals_dir Path to daily_vals/ (output: RDS statistics files)
#' @param field_boundaries_sf SF object with field/sub_field polygons
#' @param fields Vector of field names to process
#'
#' @return List with summary stats: list(total_processed=N, total_skipped=M, total_errors=K)
#'
regenerate_all_missing_rds <- function(field_tiles_ci_dir, daily_vals_dir, field_boundaries_sf, fields) {
  safe_log("\n========================================")
  safe_log("MIGRATION MODE: REGENERATING ALL MISSING RDS")
  safe_log("Processing ALL dates in field_tiles_CI/")
  safe_log("========================================")
  total_processed <- 0
  total_skipped <- 0
  total_errors <- 0
  # Iterate through each field
  for (field in fields) {
    field_ci_path <- file.path(field_tiles_ci_dir, field)
    field_daily_vals_path <- file.path(daily_vals_dir, field)
    # Skip if field directory doesn't exist.
    # BUGFIX: R's loop-skip keyword is `next`; the previous `continue`
    # raised "object 'continue' not found" at runtime.
    if (!dir.exists(field_ci_path)) {
      safe_log(sprintf(" Field %s: field_tiles_CI not found (skipping)", field))
      next
    }
    # Create output directory for RDS
    dir.create(field_daily_vals_path, showWarnings = FALSE, recursive = TRUE)
    # Find ALL CI TIFFs named YYYY-MM-DD.tif (no date filtering)
    ci_tiff_files <- list.files(
      path = field_ci_path,
      pattern = "^\\d{4}-\\d{2}-\\d{2}\\.tif$",
      full.names = TRUE
    )
    if (length(ci_tiff_files) == 0) {
      safe_log(sprintf(" Field %s: No CI TIFFs found (skipping)", field))
      next
    }
    safe_log(sprintf(" Field %s: Found %d CI TIFFs to process", field, length(ci_tiff_files)))
    # Process each CI TIFF
    for (ci_tiff in ci_tiff_files) {
      date_str <- tools::file_path_sans_ext(basename(ci_tiff))
      output_rds <- file.path(field_daily_vals_path, sprintf("%s.rds", date_str))
      # Skip if RDS already exists
      if (file.exists(output_rds)) {
        total_skipped <- total_skipped + 1
        next
      }
      # Extract RDS from the CI TIFF.
      # BUGFIX: extract_rds_from_ci_tiff() catches its own errors and
      # returns invisible(NULL), so failures previously logged "RDS
      # extracted" and inflated total_processed. Count outcomes from the
      # return value; keep a tryCatch as a belt-and-braces guard and drop
      # the `<<-` from inside the handler.
      result <- tryCatch(
        extract_rds_from_ci_tiff(ci_tiff, output_rds, field_boundaries_sf, field),
        error = function(e) {
          safe_log(sprintf(" %s: ✗ Error - %s", date_str, e$message), "ERROR")
          NULL
        }
      )
      if (is.null(result)) {
        total_errors <- total_errors + 1
      } else {
        safe_log(sprintf(" %s: ✓ RDS extracted", date_str))
        total_processed <- total_processed + 1
      }
    }
  }
  safe_log(sprintf("\nMigration complete: processed %d, skipped %d, errors %d",
                   total_processed, total_skipped, total_errors))
  return(list(
    total_processed = total_processed,
    total_skipped = total_skipped,
    total_errors = total_errors
  ))
}