# SmartCane/analyze_image_availability.R

# R script to analyze image dates and missing weeks
library(dplyr)
library(lubridate)
library(ggplot2)

# Folder holding the merged GeoTIFFs; only the bare file names are listed.
folder <- "laravel_app/storage/app/esa/merged_final_tif"
files <- list.files(folder, pattern = "\\.tif$", full.names = FALSE)
if (length(files) == 0) {
  stop("No .tif files found in '", folder, "'", call. = FALSE)
}

# Extract dates and file sizes.
# The file name without its extension is parsed as YYYY-MM-DD. Giving the
# format explicitly makes an unparseable name yield NA instead of aborting
# the whole conversion.
dates <- as.Date(sub("\\.tif$", "", files), format = "%Y-%m-%d")
sizes_kb <- file.info(file.path(folder, files))$size / 1024

unparsed <- is.na(dates)
if (all(unparsed)) {
  stop("None of the .tif file names in '", folder,
       "' could be parsed as a date", call. = FALSE)
}
if (any(unparsed)) {
  warning("Ignoring files without a parseable date: ",
          paste(files[unparsed], collapse = ", "), call. = FALSE)
}

# One row per image.
# `year` is the ISO week-based year so that it always pairs correctly with the
# ISO week number in `week` (e.g. 2021-01-01 belongs to week 53 of 2020).
# Files smaller than 9000 KB are labelled "Incomplete".
df <- data.frame(date = dates, size_kb = sizes_kb, file = files) %>%
  filter(!is.na(date)) %>%
  mutate(year = isoyear(date),
         week = isoweek(date),
         completeness = ifelse(size_kb >= 9000, "Complete", "Incomplete"))

# Distinct years present in the data, in ascending order
years <- sort(unique(df$year))

# Empty per-year summary table; one row per year is appended further down
output <- data.frame(
  year = integer(),
  n_images = integer(),
  n_weeks_missing = integer(),
  max_consec_weeks_missing = integer(),
  avg_images_per_week = numeric(),
  stringsAsFactors = FALSE
)

# Missing week numbers per year, keyed by the year as a string; filled later
missing_weeks_list <- list()

# "Current year" is the ISO week-based year of today, because it is compared
# against data that is organised by ISO week numbers (isoweek()). Around New
# Year the calendar year and the ISO year differ, which would otherwise pair
# e.g. ISO week 53 with the wrong year.
current_year <- as.integer(isoyear(Sys.Date()))
# For plotting: count complete/incomplete images for every year/week cell.
# The grid covers weeks 1-52 and is extended to week 53 when the data contains
# an image in week 53, so such images are no longer dropped from the plot.
week_range <- seq_len(max(52, df$week, na.rm = TRUE))
plot_weeks <- expand.grid(year = years, week = week_range,
                          completeness = c("Complete", "Incomplete"))

# Tabulate all images in one pass: every image and every grid cell gets a
# "year_week_completeness" key, and table() over a factor whose levels are the
# grid keys returns the counts in grid order (0 for cells without images).
cell_key <- function(d) paste(d$year, d$week, d$completeness, sep = "_")
plot_weeks$n_images <- as.numeric(
  table(factor(cell_key(df), levels = cell_key(plot_weeks)))
)

# Plot: X = week, Y = number of images, fill = completeness,
# one facet per year (stacked bar chart)
gg <- ggplot(plot_weeks, aes(x = week, y = n_images, fill = completeness)) +
  geom_col(position = "stack") +
  facet_wrap(~ year, ncol = 1) +
  scale_x_continuous(breaks = week_range) +
  scale_y_continuous(breaks = 0:max(plot_weeks$n_images)) +
  labs(x = "Week number", y = "Number of images", fill = "Completeness",
       title = "Complete vs Incomplete Images per Week (by Year)") +
  theme_minimal()

ggsave("images_per_week_by_year_stacked.png", gg, width = 12, height = 10)
cat("Plot saved as images_per_week_by_year_stacked.png\n")
# ISO week number of today; caps the expected weeks of the current year
current_week <- isoweek(Sys.Date())

# Per-year availability summary: image count, number of weeks without any
# image, longest streak of such weeks, and average images per covered week.
# Rows are collected in a list and bound once instead of growing `output`
# with rbind() on every iteration.
year_rows <- vector("list", length(years))
for (i in seq_along(years)) {
  y <- years[i]

  # 28 December always lies in the last ISO week of its year, so this is
  # 52 or 53 depending on the year.
  weeks_in_year <- isoweek(as.Date(paste0(y, "-12-28")))

  # For the current year only weeks up to today are expected to have images;
  # past years are expected to cover every week.
  if (y == current_year) {
    all_weeks <- seq_len(current_week)
  } else {
    all_weeks <- seq_len(weeks_in_year)
  }

  weeks_with_images <- unique(df$week[df$year == y])
  weeks_missing <- setdiff(all_weeks, weeks_with_images)
  n_weeks_missing <- length(weeks_missing)
  n_images <- sum(df$year == y)

  # Average over the expected weeks that actually have at least one image;
  # NA when no expected week has an image.
  n_weeks_covered <- length(all_weeks) - n_weeks_missing
  if (n_weeks_covered > 0) {
    avg_images_per_week <- n_images / n_weeks_covered
  } else {
    avg_images_per_week <- NA_real_
  }

  # Find longest run of consecutive missing weeks.
  # Subtracting each week's position from the sorted week numbers gives the
  # same value for all members of a consecutive run, so the longest run of
  # equal values is the longest gap. (Flagging diff(w) == 1 instead would
  # miss the first week of every run after the first one.)
  if (n_weeks_missing == 0) {
    max_consec <- 0
  } else {
    w <- sort(weeks_missing)
    max_consec <- max(rle(w - seq_along(w))$lengths)
  }

  year_rows[[i]] <- data.frame(
    year = y,
    n_images = n_images,
    n_weeks_missing = n_weeks_missing,
    max_consec_weeks_missing = max_consec,
    avg_images_per_week = round(avg_images_per_week, 2)
  )
  if (n_weeks_missing > 0) {
    missing_weeks_list[[as.character(y)]] <- weeks_missing
  }
}
output <- do.call(rbind, c(list(output), year_rows))


# Show the per-year summary on the console and persist it as CSV
print(output)
write.csv(output, file = "image_availability_by_year.csv", row.names = FALSE)

# List the missing week numbers for every year that has at least one gap
for (yr in names(missing_weeks_list)) {
  week_str <- paste(missing_weeks_list[[yr]], collapse = ", ")
  cat("Year ", yr, " missing weeks: ", week_str, "\n", sep = "")
}

# Calculate and print max consecutive weeks with only incomplete data per year
cat("\nMax consecutive weeks with only incomplete images per year:\n")
for (y in years) {
  in_year <- df$year == y

  # Weeks that have at least one image but no complete image. Weeks without
  # any image are not counted here. The weeks are taken directly from the
  # image table rather than from the plotting grid, so no week number that
  # occurs in the data is excluded.
  weeks_with_any_image <- unique(df$week[in_year])
  weeks_with_complete <- unique(
    df$week[in_year & df$completeness %in% "Complete"]
  )
  weeks_incomplete <- setdiff(weeks_with_any_image, weeks_with_complete)

  # Longest run of consecutive week numbers: sorted weeks minus their position
  # are constant within a consecutive run, so the longest run of equal values
  # is the answer. (Flagging diff(w) == 1 instead would miss the first week of
  # every run after the first one.)
  if (length(weeks_incomplete) == 0) {
    max_consec_incomplete <- 0
  } else {
    w <- sort(weeks_incomplete)
    max_consec_incomplete <- max(rle(w - seq_along(w))$lengths)
  }
  cat(sprintf("Year %d: %d\n", y, max_consec_incomplete))
}