From 506af5079fbd93e5879d409cc7fb4f237b93fdd9 Mon Sep 17 00:00:00 2001 From: Timon Date: Tue, 24 Mar 2026 11:23:59 +0100 Subject: [PATCH] feat: add functionality to skip empty tiles in TIFF processing and clean up orphaned files --- python_app/clean_empty_tiles.py | 154 ++++++++++++++++++++++++ r_app/10_create_per_field_tiffs_utils.R | 14 ++- r_app/20_ci_extraction_per_field.R | 40 +++--- r_app/MANUAL_PIPELINE_RUNNER.R | 4 +- r_app/parameters_project.R | 2 +- 5 files changed, 194 insertions(+), 20 deletions(-) create mode 100644 python_app/clean_empty_tiles.py diff --git a/python_app/clean_empty_tiles.py b/python_app/clean_empty_tiles.py new file mode 100644 index 0000000..26021cc --- /dev/null +++ b/python_app/clean_empty_tiles.py @@ -0,0 +1,154 @@ +""" +Clean empty field-tile TIFFs and orphaned RDS files +==================================================== +Scans field_tiles/ and/or field_tiles_CI/ directories and identifies TIF files +where ALL pixels have RGBNIR == 0 (no satellite data collected). + +Partially-covered tiles (some valid pixels present) are kept. + +When deleting from field_tiles_CI/, also deletes the paired +daily_ci_vals/{FIELD}/{DATE}.rds file if it exists. + +USAGE: + # Dry run — list empty files (default, scans both dirs): + & "C:\\Users\\timon\\anaconda3\\envs\\pytorch_gpu\\python.exe" python_app/clean_empty_tiles.py + + # Actually delete: + & "...\\python.exe" python_app/clean_empty_tiles.py --delete + + # Only one directory type: + & "...\\python.exe" python_app/clean_empty_tiles.py --dirs field_tiles + & "...\\python.exe" python_app/clean_empty_tiles.py --dirs field_tiles_CI + + # Specific projects or fields: + & "...\\python.exe" python_app/clean_empty_tiles.py --projects angata aura --delete + & "...\\python.exe" python_app/clean_empty_tiles.py --fields 544 301 --delete +""" + +import argparse +from pathlib import Path + +import numpy as np +import rasterio + +ROOT = Path(__file__).resolve().parent.parent +DEFAULT_DIRS = ["field_tiles", "field_tiles_CI"] + + +def is_empty_tif(path: Path) -> bool: + """Return True if ALL pixels in RGBNIR bands are 0 or NaN (no satellite data). + + Cloud-masked pixels are stored as 0 in uint16 (NaN is not representable). + A tile is considered empty only when every pixel across bands 1-4 is 0 or NaN, + meaning no valid satellite data was captured for that field on that date. + Partially-covered tiles (some pixels valid) return False and are left alone. + """ + try: + with rasterio.open(path) as src: + if src.count < 4: + return False # unexpected band count — leave it alone + rgbnir = src.read([1, 2, 3, 4]).astype(np.float32) + except Exception as e: + print(f" WARNING: could not open {path.name}: {e}") + return False + + return bool(np.all((rgbnir == 0) | np.isnan(rgbnir))) + + +def scan_directory(storage_root: Path, dir_name: str, delete: bool, fields: list = None) -> dict: + """Scan one tile directory within a project storage root. + + When dir_name == 'field_tiles_CI' and delete=True, also removes the paired + daily_ci_vals/{FIELD}/{DATE}.rds file for each deleted TIF. + + Returns: + dict mapping field_id -> list of empty Path objects + """ + tiff_root = storage_root / dir_name + # Paired RDS files only exist for field_tiles_CI output + rds_root = storage_root / "daily_ci_vals" if dir_name == "field_tiles_CI" else None + + if not tiff_root.exists(): + print(f" [{dir_name}] Directory not found: {tiff_root}") + return {} + + field_dirs = sorted(d for d in tiff_root.iterdir() if d.is_dir()) + if fields: + field_dirs = [d for d in field_dirs if d.name in fields] + + print(f"\n [{dir_name}] Scanning {len(field_dirs)} fields ...") + + results = {} + for field_dir in field_dirs: + tif_files = sorted(field_dir.glob("*.tif")) + empty = [f for f in tif_files if is_empty_tif(f)] + if empty: + results[field_dir.name] = empty + print(f" Field {field_dir.name:>6}: {len(empty)}/{len(tif_files)} empty" + f" ({', '.join(f.stem for f in empty)})") + + total_empty = sum(len(v) for v in results.values()) + total_tifs = sum(len(list(d.glob("*.tif"))) for d in field_dirs) + print(f"\n [{dir_name}] Summary: {total_empty} empty / {total_tifs} total TIFs" + f" across {len(results)} fields") + + if delete and total_empty > 0: + print(f"\n [{dir_name}] Deleting {total_empty} empty TIFs ...") + rds_deleted = 0 + for field_id, files in results.items(): + for f in files: + f.unlink() + print(f" Deleted TIF: {f.relative_to(ROOT)}") + # Also remove the paired RDS from daily_ci_vals/ (Script 20 output) + if rds_root is not None: + paired_rds = rds_root / field_id / f"{f.stem}.rds" + if paired_rds.exists(): + paired_rds.unlink() + print(f" Deleted RDS: {paired_rds.relative_to(ROOT)}") + rds_deleted += 1 + print(f" [{dir_name}] Done. ({rds_deleted} paired RDS files also removed)") + elif not delete and total_empty > 0: + print(f"\n [{dir_name}] Dry run — pass --delete to remove these files.") + + return results + + +def scan_project(project: str, delete: bool, fields: list = None, dirs: list = None) -> None: + storage_root = ROOT / "laravel_app" / "storage" / "app" / project + if not storage_root.exists(): + print(f"[{project}] Project directory not found: {storage_root}") + return + print(f"\n[{project}] ========================================") + for dir_name in (dirs or DEFAULT_DIRS): + scan_directory(storage_root, dir_name, delete, fields) + + +def main(): + parser = argparse.ArgumentParser( + description="Remove empty field-tile TIFFs and paired RDS files" + ) + parser.add_argument( + "--delete", action="store_true", + help="Actually delete empty files (default: dry run)" + ) + parser.add_argument( + "--projects", nargs="+", default=["angata"], + help="Project names to scan (default: angata)" + ) + parser.add_argument( + "--fields", nargs="+", default=None, + help="Limit to specific field IDs, e.g. --fields 544 301" + ) + parser.add_argument( + "--dirs", nargs="+", default=None, choices=DEFAULT_DIRS, + help=f"Which subdirs to scan (default: both {DEFAULT_DIRS})" + ) + args = parser.parse_args() + + print("=== Mode: DELETE ===" if args.delete else "=== Mode: DRY RUN (use --delete to remove) ===") + for project in args.projects: + scan_project(project, args.delete, args.fields, args.dirs) + + +if __name__ == "__main__": + main() diff --git a/r_app/10_create_per_field_tiffs_utils.R b/r_app/10_create_per_field_tiffs_utils.R index af490e0..b600115 100644 --- a/r_app/10_create_per_field_tiffs_utils.R +++ b/r_app/10_create_per_field_tiffs_utils.R @@ -129,8 +129,18 @@ crop_tiff_to_fields <- function(tif_path, tif_date, fields, output_base_dir) { # Crop raster to field boundary tryCatch({ field_rast <- crop(rast, field_geom) - writeRaster(field_rast, output_path, overwrite = TRUE) - created <- created + 1 + + # Skip empty tiles: cloud-masked pixels are stored as 0 in uint16 + # (NaN cannot be represented in that format). A band sum of 0 means + # no satellite data was captured for this field on this date. + band_sums <- terra::global(field_rast, fun = "sum", na.rm = TRUE) + if (sum(band_sums$sum, na.rm = TRUE) == 0) { + safe_log(paste("SKIP (no data):", field_name, tif_date), "WARNING") + skipped <- skipped + 1 + } else { + writeRaster(field_rast, output_path, overwrite = TRUE) + created <- created + 1 + } }, error = function(e) { safe_log(paste("ERROR cropping field", field_name, ":", e$message), "ERROR") errors <<- errors + 1 diff --git a/r_app/20_ci_extraction_per_field.R b/r_app/20_ci_extraction_per_field.R index d0289ee..4a3e0d3 100644 --- a/r_app/20_ci_extraction_per_field.R +++ b/r_app/20_ci_extraction_per_field.R @@ -200,22 +200,32 @@ main <- function() { # Crop 5-band TIFF to field boundary field_geom <- field_boundaries_sf %>% filter(field == !!field) five_band_cropped <- terra::crop(five_band, field_geom, mask = TRUE) - - # Save 5-band field TIFF - terra::writeRaster(five_band_cropped, output_tif_path, overwrite = TRUE) - - # Extract CI statistics by sub_field (from cropped CI raster) - ci_cropped <- five_band_cropped[[5]] # 5th band is CI - ci_stats <- extract_ci_by_subfield(ci_cropped, field_boundaries_sf, field) - - # Save RDS - if (!is.null(ci_stats) && nrow(ci_stats) > 0) { - saveRDS(ci_stats, output_rds_path) + + # Skip empty tiles: cloud-masked pixels are stored as 0 in uint16 + # (NaN cannot be represented in that format). Sum of RGBNIR bands == 0 + # means no valid satellite data was captured for this field on this date. + rgbnir_sum <- sum( + terra::global(five_band_cropped[[1:4]], fun = "sum", na.rm = TRUE)$sum, + na.rm = TRUE + ) + if (rgbnir_sum == 0) { + safe_log(sprintf(" SKIP (no data): field %s on %s", field, date_str), "WARNING") + } else { + # Save 5-band field TIFF + terra::writeRaster(five_band_cropped, output_tif_path, overwrite = TRUE) + + # Extract CI statistics by sub_field (from cropped CI raster) + ci_cropped <- five_band_cropped[[5]] # 5th band is CI + ci_stats <- extract_ci_by_subfield(ci_cropped, field_boundaries_sf, field) + + # Save RDS + if (!is.null(ci_stats) && nrow(ci_stats) > 0) { + saveRDS(ci_stats, output_rds_path) + } + + fields_processed_this_date <- fields_processed_this_date + 1 + raster_processed_this_date <- raster_processed_this_date + 1 } - - fields_processed_this_date <- fields_processed_this_date + 1 - raster_processed_this_date <- raster_processed_this_date + 1 - }, error = function(e) { # Error in individual field, continue to next safe_log(sprintf(" Error processing field %s: %s", field, e$message), "WARNING") diff --git a/r_app/MANUAL_PIPELINE_RUNNER.R b/r_app/MANUAL_PIPELINE_RUNNER.R index 86483a6..0640616 100644 --- a/r_app/MANUAL_PIPELINE_RUNNER.R +++ b/r_app/MANUAL_PIPELINE_RUNNER.R @@ -438,8 +438,8 @@ # rmarkdown::render( rmarkdown::render( "r_app/90_CI_report_with_kpis_agronomic_support.Rmd", - params = list(data_dir = "aura", report_date = as.Date("2026-02-18"), language = "en" ), - output_file = "SmartCane_Report_agronomic_support_aura_2026-02-18_en_test.docx", + params = list(data_dir = "aura", report_date = as.Date("2026-03-23"), language = "en" ), + output_file = "SmartCane_Report_agronomic_support_aura_2026-03-23_en.docx", output_dir = "laravel_app/storage/app/aura/reports" ) diff --git a/r_app/parameters_project.R b/r_app/parameters_project.R index 12c7108..6fbf83b 100644 --- a/r_app/parameters_project.R +++ b/r_app/parameters_project.R @@ -57,7 +57,7 @@ AREA_UNIT_PREFERENCE <- tolower(Sys.getenv("AREA_UNIT", unset = "hectare")) # Validate area unit value if (!AREA_UNIT_PREFERENCE %in% c("hectare", "acre")) { warning(paste0("Invalid AREA_UNIT env var: '", AREA_UNIT_PREFERENCE, "'. Using 'hectare'.")) - AREA_UNIT_PREFERENCE <- "hectare" + AREA_UNIT_PREFERENCE <- "acre" } #' Get area unit label for display