```r
# Cloud detection analysis script

# Load necessary libraries
library(terra)
library(exactextractr)
library(sf)
library(dplyr)
library(ggplot2)
library(tidyr)
library(reshape2)

# Define file paths (these should be set to your actual file locations)
classes_file <- "path/to/classes.geojson"
rasters_dir <- "path/to/rasters"
diagnostic_dir <- "path/to/diagnostics"

# Helper function for logging
safe_log <- function(message, level = "INFO") {
  timestamp <- format(Sys.time(), "%Y-%m-%d %H:%M:%S")
  cat(paste0("[", timestamp, "] [", level, "] ", message, "\n"))
}

# Main processing block
# Load classification polygons
safe_log(paste("Loading classification polygons from:", classes_file))
classifications <- sf::st_read(classes_file, quiet = TRUE)
# Ensure the CRS is set (assuming WGS84 here, adjust if necessary)
safe_log("No CRS found for the classifications. Setting to WGS84 (EPSG:4326).", "WARNING")
sf::st_crs(classifications) <- 4326
# List all raster files in the directory
raster_files <- list.files(rasters_dir, pattern = "\\.tif$", full.names = TRUE)
# Create a named vector for extraction_rasters based on base names
extraction_rasters <- setNames(raster_files, tools::file_path_sans_ext(basename(raster_files)))
# Create a stack of all rasters
extraction_stack <- terra::rast(extraction_rasters)
# User-provided simplified extraction for mean statistics per polygon
safe_log("Extracting mean statistics per polygon using exactextractr...")
all_stats <- cbind(
  classifications, 
  round(exactextractr::exact_extract(extraction_stack, classifications, fun = "mean", progress = FALSE), 2)
) %>%
  sf::st_drop_geometry() # Ensures all_stats is a data frame
# Ensure 'class_name' column exists, if not, use 'class' as 'class_name'
all_stats$class_name <- all_stats$class

# Save the extracted statistics to a CSV file
stats_file <- file.path(diagnostic_dir, "polygon_mean_spectral_stats.csv")
write.csv(all_stats, stats_file, row.names = FALSE)


safe_log(paste("Saved mean spectral statistics per polygon to:", stats_file))
# Calculate optimized thresholds for cloud/shadow detection
threshold_results <- data.frame(
  parameter = character(),
  best_threshold = numeric(),
  direction = character(),
  target_class = character(),
  vs_class = character(),
  accuracy = numeric(),
  stringsAsFactors = FALSE
)
class_pairs <- list(
  c("cloud", "crop"),
  c("cloud", "bare_soil_dry"),
  c("cloud", "bare_soil_wet"),
  c("shadow_over_crop", "crop"),
  c("shadow_over_bare_soil", "bare_soil_dry"),
  c("shadow_over_bare_soil", "bare_soil_wet")
)
cloud_detection_params_for_threshold <- intersect(
  c("mean.brightness", "mean.very_bright_pixels", "mean.blue_dominant", "mean.low_ndvi", "mean.green_dominant_nir", "mean.high_ndwi", "mean.blue_ratio", "mean.ndvi"),
  colnames(all_stats)
)
shadow_detection_params_for_threshold <- intersect(
  c("mean.brightness", "mean.dark_pixels", "mean.very_dark_pixels", "mean.low_nir", "mean.shadow_ndvi", "mean.low_red_to_blue", "mean.high_blue_to_nir_ratio", "mean.blue_nir_ratio_raw", "mean.red_blue_ratio_raw"),
  colnames(all_stats)
)
for (pair in class_pairs) {
  target_class <- pair[1]
  vs_class <- pair[2]
  params_to_check <- c(cloud_detection_params_for_threshold, shadow_detection_params_for_threshold)
  for (param in params_to_check) {
    target_values <- all_stats[all_stats$class_name == target_class, param]
    vs_values <- all_stats[all_stats$class_name == vs_class, param]
    target_values <- target_values[!is.na(target_values)]
    vs_values <- vs_values[!is.na(vs_values)]
    # Only proceed if both groups have at least one value
    if (length(target_values) > 0 && length(vs_values) > 0) {
      target_mean <- mean(target_values)
      target_sd <- sd(target_values)
      vs_mean <- mean(vs_values)
      vs_sd <- sd(vs_values)
      target_sd[is.na(target_sd)] <- 0
      vs_sd[is.na(vs_sd)] <- 0
      direction <- ifelse(target_mean > vs_mean, ">", "<")
      all_values <- c(target_values, vs_values)
      min_val <- min(all_values)
      max_val <- max(all_values)
      # Only proceed if min and max are finite and not equal
      if (is.finite(min_val) && is.finite(max_val) && min_val != max_val) {
        potential_thresholds <- seq(min_val, max_val, length.out = 20)
        best_accuracy <- -1
        best_threshold <- ifelse(direction == ">", min(potential_thresholds), max(potential_thresholds))
        for (threshold in potential_thresholds) {
          if (direction == ">") {
            correct_target <- sum(target_values > threshold)
            correct_vs <- sum(vs_values <= threshold)
          } else {
            correct_target <- sum(target_values < threshold)
            correct_vs <- sum(vs_values >= threshold)
          }
          accuracy <- (correct_target + correct_vs) / (length(target_values) + length(vs_values))
          if (accuracy > best_accuracy) {
            best_accuracy <- accuracy
            best_threshold <- threshold
          }
        }
        threshold_results <- rbind(threshold_results, data.frame(
          parameter = param,
          best_threshold = best_threshold,
          direction = direction,
          target_class = target_class,
          vs_class = vs_class,
          accuracy = best_accuracy,
          stringsAsFactors = FALSE
        ))
      }
    }
  }
}

thresholds_file <- file.path(diagnostic_dir, "optimal_thresholds.csv")
write.csv(threshold_results, thresholds_file, row.names = FALSE)

safe_log(paste("Saved optimal threshold recommendations to:", thresholds_file))

# Fix: get plot_measure_cols by matching raster base names to all_stats columns with 'mean.' prefix
plot_measure_cols <- intersect(names(extraction_rasters), gsub('^mean\\.', '', colnames(all_stats)))
plot_data <- reshape2::melt(
  all_stats,
  id.vars = c("class", "class_name"),
  measure.vars = paste0("mean.", plot_measure_cols),
  variable.name = "parameter",
  value.name = "value"
)
# Remove 'mean.' prefix from parameter column for clarity
plot_data$parameter <- sub("^mean\\.", "", plot_data$parameter)

plots_dir <- file.path(diagnostic_dir, "class_plots")
dir.create(plots_dir, showWarnings = FALSE, recursive = TRUE)
key_params_for_plot_list <- c("brightness", "ndvi", "blue_ratio", "ndwi", 
                              "blue_nir_ratio_raw", "red_blue_ratio_raw")
key_params_to_plot <- intersect(key_params_for_plot_list, plot_measure_cols)
for (param_to_plot in key_params_to_plot) {
  param_data_subset <- plot_data[plot_data$parameter == param_to_plot, ]
  p <- ggplot2::ggplot(param_data_subset, ggplot2::aes(x = class_name, y = value, fill = class_name)) +
    ggplot2::geom_boxplot() +
    ggplot2::theme_minimal() +
    ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1)) +
    ggplot2::labs(
      title = paste("Distribution of", param_to_plot, "by Land Cover Class"),
      x = "Class",
      y = param_to_plot,
      fill = "Class"
    )
  plot_file <- file.path(plots_dir, paste0("boxplot_", param_to_plot, ".png"))
  ggplot2::ggsave(plot_file, p, width = 10, height = 6, dpi = 150)
}
summary_params_for_plot_list <- c("brightness", "ndvi", 
                                  "blue_nir_ratio_raw", "red_blue_ratio_raw")
summary_params_to_plot <- intersect(summary_params_for_plot_list, plot_measure_cols)
summary_data_subset <- plot_data[plot_data$parameter %in% summary_params_to_plot,]
p_summary <- ggplot2::ggplot(summary_data_subset, ggplot2::aes(x = class_name, y = value, fill = class_name)) +
  ggplot2::geom_boxplot() +
  ggplot2::facet_wrap(~parameter, scales = "free_y") +
  ggplot2::theme_minimal() +
  ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1),
                 strip.text = ggplot2::element_text(size = 8)) +
  ggplot2::labs(
    title = "Summary of Key Spectral Parameters by Land Cover Class",
    x = "Class",
    y = "Value",
    fill = "Class"
  )
summary_file <- file.path(plots_dir, "spectral_parameters_summary.png")
ggplot2::ggsave(summary_file, p_summary, width = 12, height = 8, dpi = 150)
safe_log(paste("Generated spectral parameter plots in:", plots_dir))
safe_log("Cloud detection analysis script finished.")
```