# SmartCane/r_app/old_scripts/20_generate_kpi_excel.R
# (729 lines, 24 KiB, R)

# 20_GENERATE_KPI_EXCEL.R
# =======================
# This script generates an Excel file with KPIs for client delivery:
# - Field-level data with CI, changes, crop status, alerts, and cloud interference
# - Alerts summary sheet
# - Farm-wide overview statistics
#
# Usage: Rscript 20_generate_kpi_excel.R [end_date] [project_dir]
# - end_date: End date for KPI calculation (YYYY-MM-DD format), default: most recent week
# - project_dir: Project directory name (e.g., "aura", "esa"), default: "esa"
# 1. Load required libraries
# -------------------------
suppressPackageStartupMessages({
library(here)
library(sf)
library(terra)
library(dplyr)
library(tidyr)
library(lubridate)
library(readr)
library(writexl)
library(stringr)
})
# 2. Main function
# --------------
#' Entry point: build the weekly KPI Excel workbook for one project.
#'
#' Reads optional CLI arguments (end_date, project_dir), loads the project
#' configuration, picks the week to report on, extracts per-field KPIs, and
#' writes a three-sheet .xlsx file (Field_Data, Alerts_Summary, Farm_Overview).
#'
#' Side effects: assigns `project_dir` into the global environment, sources
#' parameters_project.R (which is assumed to define `weekly_CI_mosaic`,
#' `reports_dir` and `field_boundaries_sf` -- TODO confirm), prints progress
#' to stdout, and writes the Excel file under `reports_dir`.
#'
#' @return Path of the written Excel file.
main <- function() {
# Process command line arguments
args <- commandArgs(trailingOnly = TRUE)
# Process end_date argument: invalid dates fall back to "most recent week"
if (length(args) >= 1 && !is.na(args[1])) {
end_date <- as.Date(args[1])
if (is.na(end_date)) {
warning("Invalid end_date provided. Using default (most recent available).")
end_date <- NULL
}
} else {
end_date <- NULL
}
# Process project_dir argument
if (length(args) >= 2 && !is.na(args[2])) {
project_dir <- as.character(args[2])
} else {
project_dir <- "esa" # Default project
}
# Make project_dir available globally so parameters_project.R can use it
# NOTE(review): deliberate global side effect required by the sourced config.
assign("project_dir", project_dir, envir = .GlobalEnv)
# 3. Load project configuration
# ---------------------------
# Load project parameters (this sets up all directory paths and field boundaries)
tryCatch({
source(here("r_app", "parameters_project.R"))
}, error = function(e) {
stop("Error loading parameters_project.R: ", e$message)
})
# Check if required variables exist
if (!exists("project_dir")) {
stop("project_dir must be set before running this script")
}
if (!exists("field_boundaries_sf") || is.null(field_boundaries_sf)) {
stop("Field boundaries not loaded. Check parameters_project.R initialization.")
}
cat("\n=== STARTING KPI EXCEL GENERATION ===\n")
cat("Project:", project_dir, "\n")
# 4. Determine which week to analyze
# --------------------------------
if (is.null(end_date)) {
# Find most recent weekly mosaic (filenames: week_<W>_<YYYY>.tif)
mosaic_files <- list.files(weekly_CI_mosaic, pattern = "^week_\\d+_\\d{4}\\.tif$", full.names = TRUE)
if (length(mosaic_files) == 0) {
stop("No weekly mosaic files found in: ", weekly_CI_mosaic)
}
# Extract week numbers and years, newest first
mosaic_info <- data.frame(
file = mosaic_files,
week = as.numeric(gsub(".*week_(\\d+)_\\d{4}\\.tif", "\\1", basename(mosaic_files))),
year = as.numeric(gsub(".*week_\\d+_(\\d{4})\\.tif", "\\1", basename(mosaic_files)))
) %>%
arrange(desc(year), desc(week))
current_week <- mosaic_info$week[1]
current_year <- mosaic_info$year[1]
cat("Using most recent week:", current_week, "of", current_year, "\n")
} else {
current_week <- isoweek(end_date)
current_year <- year(end_date)
cat("Using specified date:", as.character(end_date), "(Week", current_week, ")\n")
}
previous_week <- current_week - 1
previous_year <- current_year
# Handle year boundary
# NOTE(review): assumes 52-week years; ISO years with 53 weeks would make
# week 1 compare against week 52 of the prior year -- confirm acceptable.
if (previous_week < 1) {
previous_week <- 52
previous_year <- current_year - 1
}
# 5. Load weekly mosaics
# --------------------
current_mosaic_path <- file.path(weekly_CI_mosaic, paste0("week_", current_week, "_", current_year, ".tif"))
previous_mosaic_path <- file.path(weekly_CI_mosaic, paste0("week_", previous_week, "_", previous_year, ".tif"))
if (!file.exists(current_mosaic_path)) {
stop("Current week mosaic not found: ", current_mosaic_path)
}
current_mosaic <- rast(current_mosaic_path)
cat("Loaded current week mosaic:", basename(current_mosaic_path), "\n")
# Previous week is optional: weekly change columns become NA without it
previous_mosaic <- NULL
if (file.exists(previous_mosaic_path)) {
previous_mosaic <- rast(previous_mosaic_path)
cat("Loaded previous week mosaic:", basename(previous_mosaic_path), "\n")
} else {
warning("Previous week mosaic not found: ", basename(previous_mosaic_path))
}
# 6. Load 8-band data for cloud information
# ----------------------------------------
# NOTE(review): this flag is passed through to extract_field_kpis but the
# actual cloud stats are loaded from the script-09 RDS -- confirm intent.
cloud_data_available <- check_cloud_data_availability(project_dir, current_week, current_year)
# 7. Build time series for harvest detection
# -----------------------------------------
cat("\nBuilding time series for harvest detection...\n")
time_series_data <- build_time_series_from_weekly_mosaics(
weekly_mosaic_dir = weekly_CI_mosaic,
field_boundaries_sf = field_boundaries_sf,
current_week = current_week,
current_year = current_year
)
# 8. Detect harvest events
# -----------------------
cat("Detecting harvest events...\n")
harvest_events <- detect_harvest_events(
time_series_data = time_series_data,
ci_threshold = 2,
consecutive_weeks = 2
)
# 9. Calculate crop status
# -----------------------
cat("Calculating crop status...\n")
crop_status_data <- calculate_crop_status(
time_series_data = time_series_data,
harvest_events = harvest_events,
current_week = current_week,
current_year = current_year
)
# 10. Extract field-level data
# --------------------------
cat("\nExtracting field-level data...\n")
field_data <- extract_field_kpis(
field_boundaries_sf = field_boundaries_sf,
current_mosaic = current_mosaic,
previous_mosaic = previous_mosaic,
crop_status_data = crop_status_data,
cloud_data_available = cloud_data_available,
project_dir = project_dir,
current_week = current_week,
current_year = current_year
)
# 11. Generate alerts
# -----------------
cat("Generating alerts...\n")
alerts_data <- generate_alerts(field_data, crop_status_data)
# 12. Create farm overview
# ----------------------
cat("Creating farm overview...\n")
farm_overview <- create_farm_overview(field_data, alerts_data)
# 13. Write Excel file
# ------------------
# reports_dir is expected from parameters_project.R -- TODO confirm
output_file <- file.path(
reports_dir,
paste0("kpi_excel_report_week", current_week, "_", project_dir, ".xlsx")
)
cat("\nWriting Excel file...\n")
write_xlsx(
list(
Field_Data = field_data,
Alerts_Summary = alerts_data,
Farm_Overview = farm_overview
),
path = output_file
)
cat("\n=== KPI EXCEL GENERATION COMPLETED ===\n")
cat("Output file:", output_file, "\n")
cat("Total fields:", nrow(field_data), "\n")
cat("Total alerts:", nrow(alerts_data), "\n\n")
return(output_file)
}
# ============================================================================
# HELPER FUNCTIONS
# ============================================================================
#' Check whether any 8-band imagery exists for cloud-interference stats
#'
#' Looks for .tif files in the project's merged_tif_8b directory and reports
#' (via cat) whether cloud statistics can be computed at all.
#' NOTE(review): `current_week` / `current_year` are accepted but not used --
#' the check is directory-wide, not week-specific.
#'
#' @param project_dir Project directory name under laravel_app/storage/app.
#' @param current_week,current_year Currently unused; kept for interface
#'   stability with callers.
#' @return TRUE when at least one 8-band .tif is present, otherwise FALSE.
check_cloud_data_availability <- function(project_dir, current_week, current_year) {
  dir_8b <- here("laravel_app/storage/app", project_dir, "merged_tif_8b")
  if (!dir.exists(dir_8b)) {
    cat("Cloud data directory not found, cloud interference will not be calculated\n")
    return(FALSE)
  }
  tif_files <- list.files(dir_8b, pattern = "\\.tif$", full.names = TRUE)
  if (length(tif_files) == 0) {
    cat("No 8-band files found, cloud interference will not be calculated\n")
    return(FALSE)
  }
  cat("Cloud data available from", length(tif_files), "images\n")
  TRUE
}
#' Build a per-field CI time series from all available weekly mosaics
#'
#' Iterates over every "week_<W>_<YYYY>.tif" mosaic in chronological order and
#' extracts the mean CI per field polygon.
#'
#' @param weekly_mosaic_dir Directory containing the weekly mosaic rasters.
#' @param field_boundaries_sf sf object with one polygon per field/sub-field;
#'   must carry `field` and `sub_field` columns.
#' @param current_week,current_year Currently unused; kept for interface
#'   stability with callers.
#' @return data.frame with columns field_id, sub_field, week, year, date,
#'   mean_ci (one row per field per available week).
build_time_series_from_weekly_mosaics <- function(weekly_mosaic_dir, field_boundaries_sf, current_week, current_year) {
  mosaic_files <- list.files(weekly_mosaic_dir, pattern = "^week_\\d+_\\d{4}\\.tif$", full.names = TRUE)
  if (length(mosaic_files) == 0) {
    stop("No weekly mosaic files found")
  }
  # Parse week/year out of the file names and process oldest-first
  mosaic_info <- data.frame(
    file = mosaic_files,
    week = as.numeric(gsub(".*week_(\\d+)_\\d{4}\\.tif", "\\1", basename(mosaic_files))),
    year = as.numeric(gsub(".*week_\\d+_(\\d{4})\\.tif", "\\1", basename(mosaic_files)))
  ) %>%
    arrange(year, week)
  # Pre-allocate; a failed week leaves a NULL slot, which bind_rows() drops
  time_series_list <- vector("list", nrow(mosaic_info))
  for (i in seq_len(nrow(mosaic_info))) {
    week_num <- mosaic_info$week[i]
    year_num <- mosaic_info$year[i]
    tryCatch({
      mosaic <- rast(mosaic_info$file[i])
      # Mean CI per polygon; assumes the mosaic has a layer named "CI"
      # -- TODO confirm the layer name is stable across projects.
      field_stats <- terra::extract(mosaic$CI, vect(field_boundaries_sf), fun = mean, na.rm = TRUE)
      # Approximate start-of-week date: Jan 1 + 7*(week-1) (not strict ISO)
      jan1 <- as.Date(paste0(year_num, "-01-01"))
      week_date <- jan1 + (week_num - 1) * 7
      time_series_list[[i]] <- data.frame(
        field_id = field_boundaries_sf$field,
        sub_field = field_boundaries_sf$sub_field,
        week = week_num,
        year = year_num,
        date = week_date,
        mean_ci = field_stats$CI,
        stringsAsFactors = FALSE
      )
    }, error = function(e) {
      warning(paste("Error processing week", week_num, ":", e$message))
    })
  }
  # Stack all successfully processed weeks into one long data frame
  bind_rows(time_series_list)
}
#' Detect harvest events based on CI time series
#'
#' Classifies each week of each field's CI series into harvest / growing /
#' recovered states, then extracts continuous low-CI runs of plausible
#' harvest length.
#'
#' @param time_series_data Long data frame with field_id, sub_field, week,
#'   year, date, mean_ci (output of build_time_series_from_weekly_mosaics).
#' @param ci_threshold CI below which a week counts as harvested/bare.
#' @param consecutive_weeks Minimum run length to accept as a harvest.
#' @param recovery_threshold CI at or above which the crop counts as recovered.
#' @param max_harvest_duration Maximum plausible harvest run length in weeks.
#' @return data.frame of harvest runs: harvest_event_id, field_id, sub_field,
#'   harvest_week/year (first week of the run), duration_weeks, start/end
#'   dates, and mean CI during the run.
detect_harvest_events <- function(time_series_data, ci_threshold = 2.0, consecutive_weeks = 2,
recovery_threshold = 4.0, max_harvest_duration = 12) {
# For each field, find ALL periods where CI drops below threshold
# Key insight: A harvest event should be SHORT (few weeks), not 60+ weeks!
# After harvest, CI should recover (go above recovery_threshold)
harvest_events <- time_series_data %>%
arrange(field_id, sub_field, date) %>%
group_by(field_id, sub_field) %>%
mutate(
# Classify each week's state
state = case_when(
mean_ci < ci_threshold ~ "harvest", # Very low CI = harvested/bare
mean_ci >= recovery_threshold ~ "recovered", # High CI = fully grown
TRUE ~ "growing" # Medium CI = growing back
),
# Detect state changes to identify new harvest cycles.
# default = "recovered" makes the very first week start a new run.
state_change = state != lag(state, default = "recovered"),
# Create run IDs based on state changes (classic lag + cumsum run-length trick)
run_id = cumsum(state_change)
) %>%
ungroup() %>%
# Group by run to identify continuous periods in each state
group_by(field_id, sub_field, run_id, state) %>%
summarize(
harvest_week = first(week), # first week of the run
harvest_year = first(year),
duration_weeks = n(),
mean_ci_during = mean(mean_ci, na.rm = TRUE),
start_date = first(date),
end_date = last(date),
.groups = "drop"
) %>%
# Only keep "harvest" state periods
filter(state == "harvest") %>%
# Filter for reasonable harvest duration (not multi-year periods!)
filter(
duration_weeks >= consecutive_weeks,
duration_weeks <= max_harvest_duration # Harvest shouldn't last more than ~3 months
) %>%
# Sort and add unique IDs
arrange(field_id, sub_field, start_date) %>%
mutate(harvest_event_id = row_number()) %>%
select(harvest_event_id, field_id, sub_field, harvest_week, harvest_year,
duration_weeks, start_date, end_date, mean_ci_during)
return(harvest_events)
}
#' Classify crop development stage for the current week
#'
#' Combines the current week's mean CI with the time elapsed since the most
#' recent detected harvest to label each field/sub-field with a growth stage.
#'
#' @param time_series_data Per-field weekly CI table (field_id, sub_field,
#'   week, year, mean_ci, ...).
#' @param harvest_events Output of detect_harvest_events().
#' @param current_week,current_year Week under analysis.
#' @return data.frame: field_id, sub_field, weeks_since_harvest, crop_status,
#'   harvest_week, harvest_year.
calculate_crop_status <- function(time_series_data, harvest_events, current_week, current_year) {
  # Restrict the time series to the week being reported
  week_obs <- time_series_data %>%
    filter(week == current_week, year == current_year)

  # Most recent harvest per field/sub-field: sort newest-first, keep one row
  latest_harvest <- harvest_events %>%
    arrange(desc(harvest_year), desc(harvest_week)) %>%
    group_by(field_id, sub_field) %>%
    slice(1) %>%
    ungroup()

  week_obs %>%
    left_join(latest_harvest, by = c("field_id", "sub_field")) %>%
    mutate(
      # Approximate age in weeks (52-week years); NA when no harvest was seen
      weeks_since_harvest = ifelse(
        !is.na(harvest_week),
        (current_year - harvest_year) * 52 + (current_week - harvest_week),
        NA
      ),
      # Stage rules combine crop age with the current CI level
      crop_status = case_when(
        is.na(weeks_since_harvest) ~ "Unknown",
        weeks_since_harvest <= 8 & mean_ci < 3.0 ~ "Germination",
        weeks_since_harvest <= 20 & mean_ci >= 3.0 & mean_ci <= 6.0 ~ "Tillering",
        weeks_since_harvest > 20 & weeks_since_harvest <= 52 & mean_ci > 6.0 ~ "Maturity",
        weeks_since_harvest > 52 ~ "Over Maturity",
        TRUE ~ "Transitional"
      )
    ) %>%
    select(field_id, sub_field, weeks_since_harvest, crop_status, harvest_week, harvest_year)
}
#' Load per-field cloud coverage data from script 09 output
#'
#' Reads the cloud-coverage RDS written by script 09 for the given week; when
#' the file is missing, unreadable, or lacks the expected columns, returns a
#' default table with NA coverage so downstream joins still work.
#'
#' @param project_dir Project directory name (used in the RDS file name)
#' @param current_week Current week number (used in the RDS file name)
#' @param reports_dir Reports directory where the RDS is saved
#' @param field_boundaries_sf Field boundaries, used only for the fallback rows
#' @return Data frame keyed on `field`/`sub_field` with pct_clear and
#'   cloud_category columns
#'
load_per_field_cloud_data <- function(project_dir, current_week, reports_dir, field_boundaries_sf) {
# Try to load cloud coverage RDS file from script 09
cloud_rds_file <- file.path(reports_dir,
paste0(project_dir, "_cloud_coverage_week", current_week, ".rds"))
if (file.exists(cloud_rds_file)) {
tryCatch({
cloud_data <- readRDS(cloud_rds_file)
if (!is.null(cloud_data) && nrow(cloud_data) > 0) {
# Normalize the key column name: RDS may use field_id, callers join on field
if ("field_id" %in% names(cloud_data)) {
cloud_data <- cloud_data %>%
rename(field = field_id)
}
# Return with just the columns we need. A return() here exits the
# whole function; any missing column falls through to the fallback.
if ("field" %in% names(cloud_data) && "sub_field" %in% names(cloud_data)) {
return(cloud_data %>%
select(field, sub_field, pct_clear, cloud_category))
}
}
}, error = function(e) {
message(paste("Warning: Could not load cloud RDS file:", e$message))
})
}
# Fallback: return default values if file not found or error
message("Warning: Cloud coverage RDS file not found, using default values")
return(data.frame(
field = field_boundaries_sf$field,
sub_field = field_boundaries_sf$sub_field,
pct_clear = NA_real_,
cloud_category = "Not calculated",
stringsAsFactors = FALSE
))
}
#' Extract field-level KPIs for the weekly Excel report
#'
#' Combines field geometry (area), current/previous-week CI statistics, crop
#' status, and per-field cloud coverage into one table with Excel-ready
#' column names.
#'
#' Fixes vs the previous version: removes a stray "mc" token that made the
#' previous-week data.frame() call a syntax error, joins cloud_stats on its
#' actual key columns (`field`/`sub_field`), and replaces the `<<-` error
#' handler with tryCatch return values.
#'
#' @param field_boundaries_sf sf polygons with `field` and `sub_field` columns.
#' @param current_mosaic SpatRaster for the current week (required).
#' @param previous_mosaic SpatRaster for the previous week, or NULL.
#' @param crop_status_data Output of calculate_crop_status().
#' @param cloud_data_available Logical flag; currently informational only.
#' @param project_dir Project directory name (locates the cloud RDS data).
#' @param current_week,current_year Week being reported.
#' @return data.frame, one row per field/sub-field, with formatted KPI columns.
extract_field_kpis <- function(field_boundaries_sf, current_mosaic, previous_mosaic,
                               crop_status_data, cloud_data_available,
                               project_dir, current_week, current_year) {
  # --- Field areas (ha / acres). tryCatch yields the fallback directly on
  # geometry errors, instead of the previous fragile `<<-` assignment.
  field_areas <- tryCatch({
    field_boundaries_sf %>%
      mutate(
        area_ha = as.numeric(st_area(geometry)) / 10000, # m2 to hectares
        area_acres = area_ha * 2.47105
      ) %>%
      st_drop_geometry() %>%
      select(field, sub_field, area_ha, area_acres)
  }, error = function(e) {
    message(paste("Warning: Could not calculate field areas:", e$message))
    field_boundaries_sf %>%
      st_drop_geometry() %>%
      select(field, sub_field) %>%
      mutate(area_ha = NA_real_, area_acres = NA_real_)
  })

  # Helper: per-field mean/sd CI for one mosaic, NA fallback on error/NULL.
  extract_ci_stats <- function(mosaic, mean_col, sd_col) {
    na_result <- data.frame(
      field_id = field_boundaries_sf$field,
      sub_field = field_boundaries_sf$sub_field,
      m = NA_real_,
      s = NA_real_
    )
    names(na_result)[3:4] <- c(mean_col, sd_col)
    if (is.null(mosaic)) {
      return(na_result)
    }
    tryCatch({
      stats <- terra::extract(
        mosaic,
        vect(field_boundaries_sf),
        fun = function(x, ...) {
          c(mean = mean(x, na.rm = TRUE),
            sd = sd(x, na.rm = TRUE))
        },
        na.rm = TRUE
      )
      out <- data.frame(
        field_id = field_boundaries_sf$field,
        sub_field = field_boundaries_sf$sub_field,
        m = stats[, 2], # column 1 is the polygon ID; 2 = mean, 3 = sd
        s = stats[, 3]
      )
      names(out)[3:4] <- c(mean_col, sd_col)
      out
    }, error = function(e) {
      message(paste("Warning: Could not extract CI data:", e$message))
      na_result
    })
  }

  current_ci_data <- extract_ci_stats(current_mosaic, "ci_current", "ci_current_sd")
  previous_ci_data <- extract_ci_stats(previous_mosaic, "ci_previous", "ci_previous_sd")

  # Weekly change plus an interpretation of magnitude vs spatial heterogeneity
  change_data <- current_ci_data %>%
    left_join(previous_ci_data, by = c("field_id", "sub_field")) %>%
    mutate(
      weekly_ci_change = ci_current - ci_previous,
      # Combined SD shows if change was uniform (low) or patchy (high)
      weekly_change_heterogeneity = sqrt(ci_current_sd^2 + ci_previous_sd^2) / 2,
      change_interpretation = case_when(
        is.na(weekly_ci_change) ~ "No previous data",
        abs(weekly_ci_change) < 0.3 & weekly_change_heterogeneity < 0.5 ~ "Stable (uniform)",
        abs(weekly_ci_change) < 0.3 & weekly_change_heterogeneity >= 0.5 ~ "Stable (patchy)",
        weekly_ci_change >= 0.3 & weekly_change_heterogeneity < 0.5 ~ "Increasing (uniform)",
        weekly_ci_change >= 0.3 & weekly_change_heterogeneity >= 0.5 ~ "Increasing (patchy)",
        weekly_ci_change <= -0.3 & weekly_change_heterogeneity < 0.5 ~ "Decreasing (uniform)",
        weekly_ci_change <= -0.3 & weekly_change_heterogeneity >= 0.5 ~ "Decreasing (patchy)",
        TRUE ~ "Mixed patterns"
      )
    )

  # Per-field cloud coverage produced by script 09 (NA defaults on failure)
  cloud_stats <- load_per_field_cloud_data(
    project_dir = project_dir,
    current_week = current_week,
    reports_dir = file.path(here("laravel_app", "storage", "app", project_dir, "reports", "kpis")),
    field_boundaries_sf = field_boundaries_sf
  )

  # Combine all sources. cloud_stats is keyed on `field`/`sub_field`
  # (load_per_field_cloud_data renames field_id -> field), unlike
  # change_data / crop_status_data which use `field_id`.
  field_kpi_data <- field_areas %>%
    left_join(change_data, by = c("field" = "field_id", "sub_field")) %>%
    left_join(crop_status_data, by = c("field" = "field_id", "sub_field")) %>%
    left_join(cloud_stats, by = c("field", "sub_field")) %>%
    mutate(
      # Calculate clear acres based on pct_clear
      clear_acres = round(area_acres * (pct_clear / 100), 2),
      clear_acres_pct = paste0(round(pct_clear, 1), "%"),
      # Format for Excel
      Field_ID = paste(field, sub_field, sep = "_"),
      Acreage_ha = round(area_ha, 2),
      Acreage_acres = round(area_acres, 1),
      Clear_Acres = paste0(clear_acres, " (", clear_acres_pct, ")"),
      Chlorophyll_Index = round(ci_current, 2),
      Weekly_Change = round(weekly_ci_change, 2),
      Change_Uniformity = round(weekly_change_heterogeneity, 2),
      Change_Pattern = change_interpretation,
      Crop_Status = crop_status,
      Weeks_Since_Harvest = weeks_since_harvest,
      Cloud_Category = cloud_category,
      Alerts = "999 - test weed" # Placeholder
    ) %>%
    select(Field_ID, Acreage_ha, Acreage_acres, Clear_Acres, Chlorophyll_Index,
           Weekly_Change, Change_Uniformity, Change_Pattern,
           Crop_Status, Weeks_Since_Harvest, Cloud_Category, Alerts)

  return(field_kpi_data)
}
#' Calculate per-field cloud interference from 8-band imagery
#'
#' Averages the cloud-free pixel percentage over every 8-band image whose
#' filename date (YYYY-MM-DD.tif) falls inside the requested week, then
#' buckets the result into a quality label.
#' NOTE(review): this helper is not called by the current pipeline (cloud
#' stats come from load_per_field_cloud_data) -- kept for future use.
#'
#' @param field_boundaries_sf sf polygons with `field` / `sub_field` columns.
#' @param project_dir Project directory under laravel_app/storage/app.
#' @param current_week,current_year Week to analyse.
#' @return data.frame: field_id, sub_field, cloud_free_pct, cloud_quality.
calculate_cloud_interference <- function(field_boundaries_sf, project_dir, current_week, current_year) {
  merged_8b_dir <- here("laravel_app/storage/app", project_dir, "merged_tif_8b")

  # Approximate the week's date range: Jan 1 + 7*(week-1) (not strict ISO)
  week_start <- as.Date(paste0(current_year, "-01-01")) + (current_week - 1) * 7
  week_end <- week_start + 6

  # Uniform NA-valued result used by every early-exit path
  no_data <- function(reason) {
    data.frame(
      field_id = field_boundaries_sf$field,
      sub_field = field_boundaries_sf$sub_field,
      cloud_free_pct = NA,
      cloud_quality = reason
    )
  }

  files_8b <- list.files(merged_8b_dir, pattern = "\\.tif$", full.names = TRUE)
  if (length(files_8b) == 0) {
    return(no_data("No data"))
  }

  # Dates come from the filenames; a file that does not parse as a date used
  # to produce NA comparisons and leak NA entries into the subset -- drop it.
  file_dates <- as.Date(gsub("\\.tif$", "", basename(files_8b)))
  in_week <- !is.na(file_dates) & file_dates >= week_start & file_dates <= week_end
  week_files <- files_8b[in_week]
  if (length(week_files) == 0) {
    return(no_data("No data for week"))
  }

  # Per-image cloud-free percentage for each field polygon
  cloud_results_list <- list()
  for (file in week_files) {
    tryCatch({
      r <- rast(file)
      # Band 9 is udm1 (cloud mask): 0 = clear, 1 = cloudy
      if (nlyr(r) >= 9) {
        cloud_band <- r[[9]]
        cloud_stats <- terra::extract(
          cloud_band,
          vect(field_boundaries_sf),
          fun = function(x, ...) {
            clear_pixels <- sum(x == 0, na.rm = TRUE)
            total_pixels <- sum(!is.na(x))
            if (total_pixels > 0) {
              clear_pixels / total_pixels * 100
            } else {
              NA
            }
          },
          na.rm = TRUE
        )
        cloud_results_list[[basename(file)]] <- data.frame(
          field_id = field_boundaries_sf$field,
          sub_field = field_boundaries_sf$sub_field,
          cloud_free_pct = cloud_stats[, 2] # column 1 is the polygon ID
        )
      }
    }, error = function(e) {
      warning(paste("Error processing cloud data from", basename(file), ":", e$message))
    })
  }

  if (length(cloud_results_list) == 0) {
    return(no_data("Processing error"))
  }

  # Average across all usable images in the week, then bucket into a label
  bind_rows(cloud_results_list) %>%
    group_by(field_id, sub_field) %>%
    summarize(
      cloud_free_pct = mean(cloud_free_pct, na.rm = TRUE),
      .groups = "drop"
    ) %>%
    mutate(
      cloud_quality = case_when(
        is.na(cloud_free_pct) ~ "No data",
        cloud_free_pct >= 90 ~ "Excellent",
        cloud_free_pct >= 75 ~ "Good",
        cloud_free_pct >= 50 ~ "Moderate",
        TRUE ~ "Poor"
      )
    )
}
#' Build the alerts sheet from field-level KPI rows
#'
#' Placeholder implementation: surfaces every row whose Alerts column is
#' non-empty. Real alert rules will replace this logic later.
#'
#' @param field_data Field-level KPI table (needs Field_ID, Crop_Status,
#'   Chlorophyll_Index, Weekly_Change, Alerts columns).
#' @param crop_status_data Currently unused; kept for interface stability.
#' @return data.frame of alert rows with an Alert_Type column appended.
generate_alerts <- function(field_data, crop_status_data) {
  field_data %>%
    filter(!is.na(Alerts), Alerts != "") %>%
    select(Field_ID, Crop_Status, Chlorophyll_Index, Weekly_Change, Alerts) %>%
    mutate(Alert_Type = "Placeholder")
}
#' Create farm-wide overview statistics
#'
#' Builds a two-column (Metric / Value) summary table from the field-level
#' KPI table and the alerts table.
#'
#' Fix: the previous version read `field_data$Cloud_Free_Percent`, a column
#' extract_field_kpis never produces, so mean(NULL) raised a warning; the
#' column is now optional and missing data yields a clean NA.
#'
#' @param field_data Field-level KPI table (one row per field/sub-field).
#' @param alerts_data Alerts table; only its row count is used.
#' @return data.frame with Metric and Value columns.
create_farm_overview <- function(field_data, alerts_data) {
  # Cloud-free percentage is only present when cloud stats were computed
  avg_cloud_free <- if ("Cloud_Free_Percent" %in% names(field_data)) {
    round(mean(field_data$Cloud_Free_Percent, na.rm = TRUE), 1)
  } else {
    NA_real_
  }
  overview <- data.frame(
    Metric = c(
      "Total Fields",
      "Total Area (ha)",
      "Total Area (acres)",
      "Average CI",
      "Fields with Increasing CI",
      "Fields with Decreasing CI",
      "Fields with Stable CI",
      "Average Cloud Free %",
      "Total Alerts"
    ),
    Value = c(
      nrow(field_data),
      round(sum(field_data$Acreage_ha, na.rm = TRUE), 1),
      round(sum(field_data$Acreage_acres, na.rm = TRUE), 0),
      round(mean(field_data$Chlorophyll_Index, na.rm = TRUE), 2),
      # Pattern labels look like "Increasing (uniform)" etc., so grepl works
      sum(grepl("Increasing", field_data$Change_Pattern), na.rm = TRUE),
      sum(grepl("Decreasing", field_data$Change_Pattern), na.rm = TRUE),
      sum(grepl("Stable", field_data$Change_Pattern), na.rm = TRUE),
      avg_cloud_free,
      nrow(alerts_data)
    )
  )
  return(overview)
}
# 14. Script execution
# ------------------
# Run main() only at the top level (sys.nframe() == 0), i.e. when executed
# via Rscript. NOTE(review): a top-level source() also has nframe 0, so
# sourcing this file interactively will still run main() -- confirm intended.
if (sys.nframe() == 0) {
main()
}