feat: add functionality to skip empty tiles in TIFF processing and clean up orphaned files
This commit is contained in:
parent
32cbf5c0db
commit
506af5079f
154
python_app/clean_empty_tiles.py
Normal file
154
python_app/clean_empty_tiles.py
Normal file
|
|
@ -0,0 +1,154 @@
|
||||||
|
"""
|
||||||
|
Clean empty field-tile TIFFs and orphaned RDS files
|
||||||
|
====================================================
|
||||||
|
Scans field_tiles/ and/or field_tiles_CI/ directories and identifies TIF files
|
||||||
|
where ALL pixels have RGBNIR == 0 (no satellite data collected).
|
||||||
|
|
||||||
|
Partially-covered tiles (some valid pixels present) are kept.
|
||||||
|
|
||||||
|
When deleting from field_tiles_CI/, also deletes the paired
|
||||||
|
daily_ci_vals/{FIELD}/{DATE}.rds file if it exists.
|
||||||
|
|
||||||
|
USAGE:
|
||||||
|
# Dry run — list empty files (default, scans both dirs):
|
||||||
|
& "C:\\Users\\timon\\anaconda3\\envs\\pytorch_gpu\\python.exe" python_app/clean_empty_tiles.py
|
||||||
|
|
||||||
|
# Actually delete:
|
||||||
|
& "...\\python.exe" python_app/clean_empty_tiles.py --delete
|
||||||
|
|
||||||
|
# Only one directory type:
|
||||||
|
& "...\\python.exe" python_app/clean_empty_tiles.py --dirs field_tiles
|
||||||
|
& "...\\python.exe" python_app/clean_empty_tiles.py --dirs field_tiles_CI
|
||||||
|
|
||||||
|
# Specific projects or fields:
|
||||||
|
& "...\\python.exe" python_app/clean_empty_tiles.py --projects angata aura --delete
|
||||||
|
& "...\\python.exe" python_app/clean_empty_tiles.py --fields 544 301 --delete
|
||||||
|
"""
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
import rasterio
|
||||||
|
|
||||||
|
ROOT = Path(__file__).resolve().parent.parent
|
||||||
|
DEFAULT_DIRS = ["field_tiles", "field_tiles_CI"]
|
||||||
|
|
||||||
|
|
||||||
|
def is_empty_tif(path: Path) -> bool:
|
||||||
|
"""Return True if ALL pixels in RGBNIR bands are 0 or NaN (no satellite data).
|
||||||
|
|
||||||
|
Cloud-masked pixels are stored as 0 in uint16 (NaN is not representable).
|
||||||
|
A tile is considered empty only when every pixel across bands 1-4 is 0 or NaN,
|
||||||
|
meaning no valid satellite data was captured for that field on that date.
|
||||||
|
Partially-covered tiles (some pixels valid) return False and are left alone.
|
||||||
|
"""
|
||||||
|
try:
|
||||||
|
with rasterio.open(path) as src:
|
||||||
|
if src.count < 4:
|
||||||
|
return False # unexpected band count — leave it alone
|
||||||
|
rgbnir = src.read([1, 2, 3, 4]).astype(np.float32)
|
||||||
|
except Exception as e:
|
||||||
|
print(f" WARNING: could not open {path.name}: {e}")
|
||||||
|
return False
|
||||||
|
|
||||||
|
return bool(np.all((rgbnir == 0) | np.isnan(rgbnir)))
|
||||||
|
|
||||||
|
|
||||||
|
def scan_directory(storage_root: Path, dir_name: str, delete: bool, fields: list = None) -> dict:
|
||||||
|
"""Scan one tile directory within a project storage root.
|
||||||
|
|
||||||
|
When dir_name == 'field_tiles_CI' and delete=True, also removes the paired
|
||||||
|
daily_ci_vals/{FIELD}/{DATE}.rds file for each deleted TIF.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
dict mapping field_id -> list of empty Path objects
|
||||||
|
"""
|
||||||
|
tiff_root = storage_root / dir_name
|
||||||
|
# Paired RDS files only exist for field_tiles_CI output
|
||||||
|
rds_root = storage_root / "daily_ci_vals" if dir_name == "field_tiles_CI" else None
|
||||||
|
|
||||||
|
if not tiff_root.exists():
|
||||||
|
print(f" [{dir_name}] Directory not found: {tiff_root}")
|
||||||
|
return {}
|
||||||
|
|
||||||
|
field_dirs = sorted(d for d in tiff_root.iterdir() if d.is_dir())
|
||||||
|
if fields:
|
||||||
|
field_dirs = [d for d in field_dirs if d.name in fields]
|
||||||
|
|
||||||
|
print(f"\n [{dir_name}] Scanning {len(field_dirs)} fields ...")
|
||||||
|
|
||||||
|
results = {}
|
||||||
|
for field_dir in field_dirs:
|
||||||
|
tif_files = sorted(field_dir.glob("*.tif"))
|
||||||
|
empty = [f for f in tif_files if is_empty_tif(f)]
|
||||||
|
if empty:
|
||||||
|
results[field_dir.name] = empty
|
||||||
|
print(f" Field {field_dir.name:>6}: {len(empty)}/{len(tif_files)} empty"
|
||||||
|
f" ({', '.join(f.stem for f in empty)})")
|
||||||
|
|
||||||
|
total_empty = sum(len(v) for v in results.values())
|
||||||
|
total_tifs = sum(len(list(d.glob("*.tif"))) for d in field_dirs)
|
||||||
|
print(f"\n [{dir_name}] Summary: {total_empty} empty / {total_tifs} total TIFs"
|
||||||
|
f" across {len(results)} fields")
|
||||||
|
|
||||||
|
if delete and total_empty > 0:
|
||||||
|
print(f"\n [{dir_name}] Deleting {total_empty} empty TIFs ...")
|
||||||
|
rds_deleted = 0
|
||||||
|
for field_id, files in results.items():
|
||||||
|
for f in files:
|
||||||
|
f.unlink()
|
||||||
|
print(f" Deleted TIF: {f.relative_to(ROOT)}")
|
||||||
|
# Also remove the paired RDS from daily_ci_vals/ (Script 20 output)
|
||||||
|
if rds_root is not None:
|
||||||
|
paired_rds = rds_root / field_id / f"{f.stem}.rds"
|
||||||
|
if paired_rds.exists():
|
||||||
|
paired_rds.unlink()
|
||||||
|
print(f" Deleted RDS: {paired_rds.relative_to(ROOT)}")
|
||||||
|
rds_deleted += 1
|
||||||
|
print(f" [{dir_name}] Done. ({rds_deleted} paired RDS files also removed)")
|
||||||
|
elif not delete and total_empty > 0:
|
||||||
|
print(f"\n [{dir_name}] Dry run — pass --delete to remove these files.")
|
||||||
|
|
||||||
|
return results
|
||||||
|
|
||||||
|
|
||||||
|
def scan_project(project: str, delete: bool, fields: list = None, dirs: list = None) -> None:
|
||||||
|
storage_root = ROOT / "laravel_app" / "storage" / "app" / project
|
||||||
|
if not storage_root.exists():
|
||||||
|
print(f"[{project}] Project directory not found: {storage_root}")
|
||||||
|
return
|
||||||
|
print(f"\n[{project}] ========================================")
|
||||||
|
for dir_name in (dirs or DEFAULT_DIRS):
|
||||||
|
scan_directory(storage_root, dir_name, delete, fields)
|
||||||
|
|
||||||
|
|
||||||
|
def main():
|
||||||
|
parser = argparse.ArgumentParser(
|
||||||
|
description="Remove empty field-tile TIFFs and paired RDS files"
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--delete", action="store_true",
|
||||||
|
help="Actually delete empty files (default: dry run)"
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--projects", nargs="+", default=["angata"],
|
||||||
|
help="Project names to scan (default: angata)"
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--fields", nargs="+", default=None,
|
||||||
|
help="Limit to specific field IDs, e.g. --fields 544 301"
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--dirs", nargs="+", default=None, choices=DEFAULT_DIRS,
|
||||||
|
help=f"Which subdirs to scan (default: both {DEFAULT_DIRS})"
|
||||||
|
)
|
||||||
|
args = parser.parse_args()
|
||||||
|
|
||||||
|
print("=== Mode: DELETE ===" if args.delete else "=== Mode: DRY RUN (use --delete to remove) ===")
|
||||||
|
for project in args.projects:
|
||||||
|
scan_project(project, args.delete, args.fields, args.dirs)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
|
|
@ -129,8 +129,18 @@ crop_tiff_to_fields <- function(tif_path, tif_date, fields, output_base_dir) {
|
||||||
# Crop raster to field boundary
|
# Crop raster to field boundary
|
||||||
tryCatch({
|
tryCatch({
|
||||||
field_rast <- crop(rast, field_geom)
|
field_rast <- crop(rast, field_geom)
|
||||||
writeRaster(field_rast, output_path, overwrite = TRUE)
|
|
||||||
created <- created + 1
|
# Skip empty tiles: cloud-masked pixels are stored as 0 in uint16
|
||||||
|
# (NaN cannot be represented in that format). A band sum of 0 means
|
||||||
|
# no satellite data was captured for this field on this date.
|
||||||
|
band_sums <- terra::global(field_rast, fun = "sum", na.rm = TRUE)
|
||||||
|
if (sum(band_sums$sum, na.rm = TRUE) == 0) {
|
||||||
|
safe_log(paste("SKIP (no data):", field_name, tif_date), "WARNING")
|
||||||
|
skipped <- skipped + 1
|
||||||
|
} else {
|
||||||
|
writeRaster(field_rast, output_path, overwrite = TRUE)
|
||||||
|
created <- created + 1
|
||||||
|
}
|
||||||
}, error = function(e) {
|
}, error = function(e) {
|
||||||
safe_log(paste("ERROR cropping field", field_name, ":", e$message), "ERROR")
|
safe_log(paste("ERROR cropping field", field_name, ":", e$message), "ERROR")
|
||||||
errors <<- errors + 1
|
errors <<- errors + 1
|
||||||
|
|
|
||||||
|
|
@ -200,22 +200,32 @@ main <- function() {
|
||||||
# Crop 5-band TIFF to field boundary
|
# Crop 5-band TIFF to field boundary
|
||||||
field_geom <- field_boundaries_sf %>% filter(field == !!field)
|
field_geom <- field_boundaries_sf %>% filter(field == !!field)
|
||||||
five_band_cropped <- terra::crop(five_band, field_geom, mask = TRUE)
|
five_band_cropped <- terra::crop(five_band, field_geom, mask = TRUE)
|
||||||
|
|
||||||
# Save 5-band field TIFF
|
# Skip empty tiles: cloud-masked pixels are stored as 0 in uint16
|
||||||
terra::writeRaster(five_band_cropped, output_tif_path, overwrite = TRUE)
|
# (NaN cannot be represented in that format). Sum of RGBNIR bands == 0
|
||||||
|
# means no valid satellite data was captured for this field on this date.
|
||||||
# Extract CI statistics by sub_field (from cropped CI raster)
|
rgbnir_sum <- sum(
|
||||||
ci_cropped <- five_band_cropped[[5]] # 5th band is CI
|
terra::global(five_band_cropped[[1:4]], fun = "sum", na.rm = TRUE)$sum,
|
||||||
ci_stats <- extract_ci_by_subfield(ci_cropped, field_boundaries_sf, field)
|
na.rm = TRUE
|
||||||
|
)
|
||||||
# Save RDS
|
if (rgbnir_sum == 0) {
|
||||||
if (!is.null(ci_stats) && nrow(ci_stats) > 0) {
|
safe_log(sprintf(" SKIP (no data): field %s on %s", field, date_str), "WARNING")
|
||||||
saveRDS(ci_stats, output_rds_path)
|
} else {
|
||||||
|
# Save 5-band field TIFF
|
||||||
|
terra::writeRaster(five_band_cropped, output_tif_path, overwrite = TRUE)
|
||||||
|
|
||||||
|
# Extract CI statistics by sub_field (from cropped CI raster)
|
||||||
|
ci_cropped <- five_band_cropped[[5]] # 5th band is CI
|
||||||
|
ci_stats <- extract_ci_by_subfield(ci_cropped, field_boundaries_sf, field)
|
||||||
|
|
||||||
|
# Save RDS
|
||||||
|
if (!is.null(ci_stats) && nrow(ci_stats) > 0) {
|
||||||
|
saveRDS(ci_stats, output_rds_path)
|
||||||
|
}
|
||||||
|
|
||||||
|
fields_processed_this_date <- fields_processed_this_date + 1
|
||||||
|
raster_processed_this_date <- raster_processed_this_date + 1
|
||||||
}
|
}
|
||||||
|
|
||||||
fields_processed_this_date <- fields_processed_this_date + 1
|
|
||||||
raster_processed_this_date <- raster_processed_this_date + 1
|
|
||||||
|
|
||||||
}, error = function(e) {
|
}, error = function(e) {
|
||||||
# Error in individual field, continue to next
|
# Error in individual field, continue to next
|
||||||
safe_log(sprintf(" Error processing field %s: %s", field, e$message), "WARNING")
|
safe_log(sprintf(" Error processing field %s: %s", field, e$message), "WARNING")
|
||||||
|
|
|
||||||
|
|
@ -438,8 +438,8 @@
|
||||||
# rmarkdown::render(
|
# rmarkdown::render(
|
||||||
rmarkdown::render(
|
rmarkdown::render(
|
||||||
"r_app/90_CI_report_with_kpis_agronomic_support.Rmd",
|
"r_app/90_CI_report_with_kpis_agronomic_support.Rmd",
|
||||||
params = list(data_dir = "aura", report_date = as.Date("2026-02-18"), language = "en" ),
|
params = list(data_dir = "aura", report_date = as.Date("2026-03-23"), language = "en" ),
|
||||||
output_file = "SmartCane_Report_agronomic_support_aura_2026-02-18_en_test.docx",
|
output_file = "SmartCane_Report_agronomic_support_aura_2026-03-23_en.docx",
|
||||||
output_dir = "laravel_app/storage/app/aura/reports"
|
output_dir = "laravel_app/storage/app/aura/reports"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -57,7 +57,7 @@ AREA_UNIT_PREFERENCE <- tolower(Sys.getenv("AREA_UNIT", unset = "hectare"))
|
||||||
# Validate area unit value
|
# Validate area unit value
|
||||||
if (!AREA_UNIT_PREFERENCE %in% c("hectare", "acre")) {
|
if (!AREA_UNIT_PREFERENCE %in% c("hectare", "acre")) {
|
||||||
warning(paste0("Invalid AREA_UNIT env var: '", AREA_UNIT_PREFERENCE, "'. Using 'hectare'."))
|
warning(paste0("Invalid AREA_UNIT env var: '", AREA_UNIT_PREFERENCE, "'. Using 'hectare'."))
|
||||||
AREA_UNIT_PREFERENCE <- "hectare"
|
AREA_UNIT_PREFERENCE <- "acre"
|
||||||
}
|
}
|
||||||
|
|
||||||
#' Get area unit label for display
|
#' Get area unit label for display
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue