SmartCane/python_app/00_download_8band_pu_optimized.py
2026-01-12 16:40:51 +01:00

553 lines
20 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
"""
Planet 4-Band Download Script - PU-Optimized (RGB+NIR, Cloud-Masked, uint16)
============================================================================
Strategy: Minimize Processing Units using three techniques:
1. 4-band output (RGB+NIR) with cloud masking on server (uint16, not FLOAT32)
→ Cuts data transfer by ~60% (4 bands uint16 vs 9 bands FLOAT32)
2. Dynamically reduced bounding boxes (reduce_bbox_sizes=True)
→ Shrinks tiles to fit field geometry boundaries, reducing wasted pixels
3. Date availability filtering + geometry-aware grid
→ Skips empty dates and non-field areas
Usage:
python download_8band_pu_optimized.py [PROJECT] [OPTIONS]
Arguments:
PROJECT Project name (angata, chemba, xinavane, etc.) [required]
Options:
--date DATE Date to download (YYYY-MM-DD). Default: today
--resolution RES Resolution in meters (default: 3)
--skip-merge Skip merge step (download only, keep individual tiles)
--cleanup Delete intermediate single_images folder after merge
--clear-singles Clear single_images_8b folder before download
--clear-merged Clear merged_tif_8b and merged_virtual_8b folders before download
--clear-all Clear all output folders (singles, merged, virtual) before download
Examples:
python download_8band_pu_optimized.py xinavane --clear-singles --cleanup
cd python_app
python download_planet_missing_dates.py --start 2025-11-01 --end 2025-12-24 --project angata
Cost Model:
- 4-band uint16 with cloud masking: ~50% lower cost than 9-band FLOAT32
- Reduced bbox sizes: ~10-20% lower cost due to smaller average tile size
- Total expected PU: ~1,500-2,000 per date (vs 5,865 with 9-band approach)
- Requests: Slightly higher (~50-60 tiles) but within 700k budget
Expected result: ~75% PU savings with dynamic geometry-fitted grid
Example running it in powershell:
$startDate = [DateTime]::ParseExact("2025-11-01", "yyyy-MM-dd", $null)
$endDate = [DateTime]::ParseExact("2025-12-24", "yyyy-MM-dd", $null)
$current = $startDate
while ($current -le $endDate) {
$dateStr = $current.ToString("yyyy-MM-dd")
Write-Host "Downloading $dateStr..."
python download_8band_pu_optimized.py angata --date $dateStr
$current = $current.AddDays(1)
}
"""
import os
import sys
import json
import datetime
import argparse
from pathlib import Path
from typing import List, Tuple, Optional
import warnings
import numpy as np
import geopandas as gpd
from shapely.geometry import MultiPolygon, Polygon, box
from shapely.ops import unary_union
from osgeo import gdal
# Suppress GDAL TIFF metadata warnings
warnings.filterwarnings('ignore', category=RuntimeWarning, module='osgeo.gdal')
from sentinelhub import (
MimeType, CRS, BBox, SentinelHubRequest, SentinelHubDownloadClient,
DataCollection, bbox_to_dimensions, SHConfig, Geometry, SentinelHubCatalog, BBoxSplitter
)
import time
# ============================================================================
# CONFIGURATION
# ============================================================================
def setup_config():
    """Setup SentinelHub configuration and paths.

    Returns:
        tuple: (config, byoc, catalog) — the SHConfig carrying OAuth
        credentials, the BYOC DataCollection handle for the Planet data,
        and a SentinelHubCatalog client used for availability queries.
    """
    config = SHConfig()
    # SECURITY NOTE(review): real-looking OAuth client id/secret are committed
    # here as environment-variable fallbacks. These credentials should be
    # rotated and the hardcoded defaults removed so the script fails loudly
    # when SH_CLIENT_ID / SH_CLIENT_SECRET are not set in the environment.
    config.sh_client_id = os.environ.get('SH_CLIENT_ID', '1a72d811-4f0e-4447-8282-df09608cff44')
    config.sh_client_secret = os.environ.get('SH_CLIENT_SECRET', 'FcBlRL29i9ZmTzhmKTv1etSMFs5PxSos')
    # BYOC collection for Planet 8-band data
    collection_id = '4e56d0cb-c402-40ff-97bb-c2b9e6bfcf2a'
    # NOTE(review): is_timeless=True presumably means the collection has no
    # collection-level time extent and each request supplies its own
    # time_interval — confirm against the sentinelhub-py BYOC docs.
    byoc = DataCollection.define_byoc(collection_id, name='planet_data_8b', is_timeless=True)
    catalog = SentinelHubCatalog(config=config)
    return config, byoc, catalog
# ============================================================================
# EVALSCRIPT: 4 bands (RGB + NIR) with cloud masking, uint16 output
# ============================================================================
# Server-side evalscript attached to every SentinelHubRequest. It reads the
# four optical bands plus the usable-data mask (udm1), scales DN values by
# 2.5 / 10000, and returns NaN for every pixel flagged by udm1.
# NOTE(review): the output block declares only `bands: 4` with no explicit
# sampleType, so the service default applies — the "uint16" claim in the
# banner above is not enforced by this script, and NaN is only representable
# in a float output; confirm the actual dtype delivered by the service.
EVALSCRIPT_4BAND_MASKED = """
//VERSION=3
function setup() {
return {
input: [{
bands: ["red", "green", "blue", "nir", "udm1"],
units: "DN"
}],
output: {
bands: 4
}
};
}
function evaluatePixel(sample) {
// Cloud masking: return NaN for cloudy/bad pixels (udm1 != 0)
// This reduces output pixels and avoids NaN interpolation on client side
if (sample.udm1 == 0) {
// Scale reflectance: DN → [0, 1] range
var scaledRed = 2.5 * sample.red / 10000;
var scaledGreen = 2.5 * sample.green / 10000;
var scaledBlue = 2.5 * sample.blue / 10000;
var scaledNIR = 2.5 * sample.nir / 10000;
return [scaledRed, scaledGreen, scaledBlue, scaledNIR];
} else {
return [NaN, NaN, NaN, NaN];
}
}
"""
# ============================================================================
# GEOMETRY & GRID FUNCTIONS
# ============================================================================
def load_and_validate_geojson(geojson_path: Path) -> gpd.GeoDataFrame:
    """Read a GeoJSON file and return its features as a WGS84 GeoDataFrame.

    A file with no declared CRS is assumed to already be WGS84 and is tagged
    as EPSG:4326; any other CRS is reprojected to EPSG:4326.
    """
    fields = gpd.read_file(str(geojson_path))
    print(f"✓ Loaded {len(fields)} field(s)")
    print(f" CRS: {fields.crs}")
    print(f" Bounds (WGS84): {fields.total_bounds}")
    if fields.crs is None:
        # No CRS recorded in the file — tag it rather than reproject.
        print(" ⚠️ No CRS defined. Assuming WGS84.")
        return fields.set_crs('EPSG:4326')
    if fields.crs != 'EPSG:4326':
        # Reproject to the lon/lat CRS the downstream grid math expects.
        print(f" Converting to WGS84...")
        return fields.to_crs('EPSG:4326')
    return fields
def create_optimal_grid_with_filtering(
    gdf: gpd.GeoDataFrame,
    resolution: int = 3,
    max_pixels: int = 2500,
    fine_factor: int = 5
) -> Tuple[List[BBox], List[Polygon]]:
    """
    Create fine grid of bounding boxes using BBoxSplitter with reduce_bbox_sizes=True.

    Strategy: use a FINER grid (not coarser) with reduce_bbox_sizes=True to get
    many smaller tiles that hug field boundaries tightly. This reduces wasted
    pixel area while still respecting the max pixel limit per tile.

    Args:
        gdf: Field geometries in WGS84.
        resolution: Target resolution in meters per pixel.
        max_pixels: Max pixels per tile side used to size the base grid.
        fine_factor: Multiplier applied to the base grid to produce the fine
            grid. Default 5 — avoids tiles that shrink to 0 pixels (as
            happened with 7x). (The original code printed "6x" while
            actually using 5x; the factor is now a parameter and reported
            truthfully.)

    Returns:
        (bbox_list, geometry_list) where geometry_list contains the field
        geometry intersection for each bbox (reference only, not used to
        mask the download).
    """
    union_geom = gdf.geometry.union_all()
    bounds = gdf.total_bounds  # [minx, miny, maxx, maxy]
    minx, miny, maxx, maxy = bounds
    # Rough degrees → meters conversion (1 degree ≈ 111,320 m at the equator).
    width_m = (maxx - minx) * 111320
    height_m = (maxy - miny) * 111320
    max_size_m = max_pixels * resolution  # Max bbox size in meters
    # Base grid: smallest uniform grid whose tiles respect the pixel limit.
    nx_base = max(1, int(np.ceil(width_m / max_size_m)))
    ny_base = max(1, int(np.ceil(height_m / max_size_m)))
    # Fine grid: multiply the base grid so reduce_bbox_sizes can shrink many
    # small tiles onto the field geometry boundaries.
    nx_fine = nx_base * fine_factor
    ny_fine = ny_base * fine_factor
    print(f"\nGrid Calculation (fine grid with reduce_bbox_sizes=True):")
    print(f" Area extent: {width_m:.0f}m × {height_m:.0f}m")
    print(f" Max bbox size: {max_size_m:.0f}m ({max_pixels}px @ {resolution}m)")
    print(f" Base grid: {nx_base}×{ny_base} = {nx_base*ny_base} tiles")
    print(f" Fine grid ({fine_factor}x): {nx_fine}×{ny_fine} = {nx_fine*ny_fine} theoretical tiles")
    # Convert geometries to plain Shapely objects for BBoxSplitter
    shapely_geoms = list(gdf.geometry)
    # Finer grid + reduce_bbox_sizes=True → many small, geometry-fitted tiles
    bbox_splitter = BBoxSplitter(
        shapely_geoms,
        CRS.WGS84,
        (nx_fine, ny_fine),
        reduce_bbox_sizes=True  # Shrink tiles to fit geometry
    )
    bbox_list = bbox_splitter.get_bbox_list()
    print(f" BBoxSplitter returned: {len(bbox_list)} bbox(es) (after reduce_bbox_sizes)")
    # Report dimensions of the first few tiles to verify they shrank
    if bbox_list:
        sizes = [
            ((b.max_x - b.min_x) * 111320, (b.max_y - b.min_y) * 111320)
            for b in bbox_list[:5]
        ]
        avg_width = np.mean([s[0] for s in sizes])
        avg_height = np.mean([s[1] for s in sizes])
        print(f" Sample tiles (avg): {avg_width:.0f}m × {avg_height:.0f}m")
    # Keep only tiles that actually intersect a field geometry
    geometry_list = []
    filtered_bbox_list = []
    for bbox in bbox_list:
        tile_poly = box(bbox.min_x, bbox.min_y, bbox.max_x, bbox.max_y)
        intersection = tile_poly.intersection(union_geom)
        if not intersection.is_empty:
            filtered_bbox_list.append(bbox)
            geometry_list.append(intersection)
    print(f" ✓ Final active tiles: {len(filtered_bbox_list)}")
    return filtered_bbox_list, geometry_list
# ============================================================================
# DATA AVAILABILITY CHECK
# ============================================================================
def check_date_has_data(date_str: str, test_bbox: BBox, catalog, byoc) -> bool:
    """Query the catalog to see whether any Planet imagery exists on a date.

    Returns False when the catalog reports no tiles, which lets the caller
    skip the download entirely. A failed catalog query is treated
    optimistically (returns True) so a transient API error never skips a
    date that might actually have imagery.
    """
    try:
        found = list(
            catalog.search(
                collection=byoc,
                bbox=test_bbox,
                time=(date_str, date_str),
                filter=None
            )
        )
    except Exception as e:
        print(f" ⚠️ Date {date_str}: Check failed ({e}) — assuming data exists")
        return True  # Optimistic default
    if found:
        print(f" ✓ Date {date_str}: Found {len(found)} image tile(s)")
        return True
    print(f" ✗ Date {date_str}: No imagery available")
    return False
# ============================================================================
# DOWNLOAD FUNCTIONS
# ============================================================================
def download_tile(
    date_str: str,
    bbox: BBox,
    output_dir: Path,
    config,
    byoc,
    resolution: int = 3
) -> bool:
    """Download one full-bbox tile for a date, retrying on rate limits.

    No geometry masking is applied (full bbox = lower PU cost). Rate-limit
    errors trigger exponential backoff (1s → 2s → 4s, up to 3 attempts);
    any other error fails immediately. Returns True on success.
    """
    attempts_allowed = 3
    backoff = 1.0
    attempt = 0
    while attempt < attempts_allowed:
        try:
            tile_size = bbox_to_dimensions(bbox, resolution=resolution)
            # 4-band cloud-masked evalscript; the service writes the TIFF
            # under output_dir via the request's data_folder.
            request = SentinelHubRequest(
                evalscript=EVALSCRIPT_4BAND_MASKED,
                input_data=[
                    SentinelHubRequest.input_data(
                        data_collection=byoc,
                        time_interval=(date_str, date_str)
                    )
                ],
                responses=[
                    SentinelHubRequest.output_response('default', MimeType.TIFF)
                ],
                bbox=bbox,
                size=tile_size,
                config=config,
                data_folder=str(output_dir),
            )
            pending = request.download_list
            if not pending:
                print(f" ✗ No download requests generated for bbox {bbox}")
                return False
            # Sequential download (max_threads=1) to track PU consumption
            SentinelHubDownloadClient(config=config).download(pending, max_threads=1)
            print(f" ✓ Downloaded tile")
            return True
        except Exception as e:
            lowered = str(e).lower()
            throttled = any(marker in lowered for marker in ("rate", "429", "too many"))
            if throttled and attempt + 1 < attempts_allowed:
                print(f" ⚠️ Rate limited, retrying in {backoff}s...")
                time.sleep(backoff)
                backoff *= 2  # Exponential backoff: 1s → 2s → 4s
            else:
                print(f" ✗ Download failed: {e}")
                return False
        attempt += 1
    return False
def download_date(
    date_str: str,
    bbox_list: List[BBox],
    base_path: Path,
    config,
    byoc,
    resolution: int = 3
) -> int:
    """Download every tile in ``bbox_list`` for a single date.

    Tiles land under ``single_images_8b/<date>/``. Returns the number of
    tiles that downloaded successfully.
    """
    target_dir = base_path / 'single_images_8b' / date_str
    target_dir.mkdir(parents=True, exist_ok=True)
    print(f"\nDownloading {len(bbox_list)} tiles for {date_str}...")
    ok_count = 0
    for position, tile_bbox in enumerate(bbox_list, 1):
        print(f" [{position}/{len(bbox_list)}]", end=" ")
        if download_tile(date_str, tile_bbox, target_dir, config, byoc, resolution):
            ok_count += 1
        # Tiny pause between requests — small tiles tolerate an aggressive rate
        time.sleep(0.002)
    print(f"\n Result: {ok_count}/{len(bbox_list)} tiles downloaded")
    return ok_count
# ============================================================================
# MERGE FUNCTION
# ============================================================================
def merge_tiles(date_str: str, base_path: Path) -> bool:
    """Mosaic all downloaded tiles for a date into one compressed GeoTIFF.

    Builds a VRT over every ``response.tiff`` under
    ``single_images_8b/<date>/``, then translates it to an LZW-compressed,
    tiled Float32 GeoTIFF. Returns True on success, False when there is
    nothing to merge or a GDAL step fails.
    """
    tiles_dir = base_path / 'single_images_8b' / date_str
    tile_paths = [str(tif) for tif in tiles_dir.rglob('response.tiff')]
    if not tile_paths:
        print(f" ✗ No tiles found to merge")
        return False
    tif_dir = base_path / 'merged_tif_8b'
    vrt_dir = base_path / 'merged_virtual_8b'
    for out_dir in (tif_dir, vrt_dir):
        out_dir.mkdir(parents=True, exist_ok=True)
    out_tif = tif_dir / f"{date_str}.tif"
    out_vrt = vrt_dir / f"merged_{date_str}.vrt"
    try:
        print(f" Building VRT from {len(tile_paths)} tiles...")
        vrt_ds = gdal.BuildVRT(str(out_vrt), tile_paths)
        if vrt_ds is None:
            print(f" ✗ Failed to create VRT")
            return False
        # Drop the reference so GDAL flushes the VRT to disk before
        # Translate reads it back from the path.
        vrt_ds = None
        print(f" Converting to GeoTIFF...")
        translate_opts = gdal.TranslateOptions(
            outputType=gdal.GDT_Float32,
            creationOptions=[
                'COMPRESS=LZW',
                'TILED=YES',
                'BLOCKXSIZE=256',
                'BLOCKYSIZE=256',
                'NUM_THREADS=ALL_CPUS'
            ]
        )
        out_ds = gdal.Translate(str(out_tif), str(out_vrt), options=translate_opts)
        if out_ds is None:
            print(f" ✗ Failed to convert VRT to TIFF")
            return False
        out_ds = None  # Close the dataset so the file is fully written
        print(f" ✓ Merged to {out_tif.name}")
        return True
    except Exception as e:
        print(f" ✗ Merge failed: {e}")
        return False
# ============================================================================
# MAIN WORKFLOW
# ============================================================================
def main():
    """Main download and merge workflow.

    Steps: parse CLI args → optionally clear output folders → build a
    geometry-fitted tile grid → verify imagery exists for the date →
    download all tiles → merge into one GeoTIFF → optionally clean up
    intermediates. Exits with code 1 on missing paths, a malformed date,
    or a failed download/merge; code 0 when no imagery exists for the date.
    """
    import shutil  # local import: only needed for --clear-* / --cleanup

    # Parse arguments
    parser = argparse.ArgumentParser(
        description='Download Planet 8-band imagery with PU optimization'
    )
    parser.add_argument('project', help='Project name (angata, chemba, xinavane, etc.)')
    parser.add_argument('--date', default=None, help='Date to download (YYYY-MM-DD). Default: today')
    parser.add_argument('--resolution', type=int, default=3, help='Resolution in meters (default: 3)')
    parser.add_argument('--skip-merge', action='store_true', help='Skip merge step (download only)')
    parser.add_argument('--cleanup', action='store_true', help='Delete intermediate single_images after merge')
    # These three flags are documented in the module docstring but were
    # previously never registered with argparse; they are now implemented.
    parser.add_argument('--clear-singles', action='store_true',
                        help='Clear single_images_8b folder before download')
    parser.add_argument('--clear-merged', action='store_true',
                        help='Clear merged_tif_8b and merged_virtual_8b folders before download')
    parser.add_argument('--clear-all', action='store_true',
                        help='Clear all output folders (singles, merged, virtual) before download')
    args = parser.parse_args()
    # Setup paths
    base_path = Path('../laravel_app/storage/app') / args.project
    if not base_path.exists():
        print(f"✗ Project path not found: {base_path}")
        sys.exit(1)
    geojson_file = base_path / 'Data' / 'pivot.geojson'
    if not geojson_file.exists():
        print(f"✗ GeoJSON not found: {geojson_file}")
        sys.exit(1)
    # Determine date — validate user-supplied dates before hitting the API
    if args.date:
        try:
            datetime.datetime.strptime(args.date, '%Y-%m-%d')
        except ValueError:
            print(f"✗ Invalid date format: {args.date} (expected YYYY-MM-DD)")
            sys.exit(1)
        date_str = args.date
    else:
        date_str = datetime.date.today().strftime('%Y-%m-%d')
    # Optional pre-download clearing of output folders
    folders_to_clear = []
    if args.clear_singles or args.clear_all:
        folders_to_clear.append(base_path / 'single_images_8b')
    if args.clear_merged or args.clear_all:
        folders_to_clear.append(base_path / 'merged_tif_8b')
        folders_to_clear.append(base_path / 'merged_virtual_8b')
    for folder in folders_to_clear:
        if folder.exists():
            shutil.rmtree(folder)
            print(f"✓ Cleared {folder}")
    print(f"{'='*70}")
    print(f"Planet 8-Band Download - PU Optimized")
    print(f"{'='*70}")
    print(f"Project: {args.project}")
    print(f"Date: {date_str}")
    print(f"Resolution: {args.resolution}m")
    # Setup SentinelHub
    print(f"\nSetting up SentinelHub...")
    config, byoc, catalog = setup_config()
    print(f"✓ SentinelHub configured")
    # Load geometries
    print(f"\nLoading field geometries...")
    gdf = load_and_validate_geojson(geojson_file)
    # Create optimal grid
    print(f"\nCreating optimal grid...")
    bbox_list, _ = create_optimal_grid_with_filtering(gdf, resolution=args.resolution)
    if not bbox_list:
        print(f"\n✗ No tiles intersect field geometries. Exiting.")
        sys.exit(1)
    # Check date availability (cheap catalog query before any PU is spent)
    print(f"\nChecking data availability...")
    if not check_date_has_data(date_str, bbox_list[0], catalog, byoc):
        print(f"\n⚠️ No imagery found for {date_str}. Exiting without download.")
        sys.exit(0)
    # Download tiles
    print(f"\n{'='*70}")
    downloaded = download_date(date_str, bbox_list, base_path, config, byoc, args.resolution)
    if downloaded == 0:
        print(f"\n✗ No tiles downloaded. Exiting.")
        sys.exit(1)
    # Merge tiles
    if not args.skip_merge:
        print(f"\n{'='*70}")
        print(f"Merging tiles...")
        if merge_tiles(date_str, base_path):
            print(f"✓ Merge complete")
            # Cleanup intermediate files
            if args.cleanup:
                print(f"\nCleaning up intermediate files...")
                single_images_dir = base_path / 'single_images_8b' / date_str
                merged_vrt_dir = base_path / 'merged_virtual_8b'
                try:
                    if single_images_dir.exists():
                        shutil.rmtree(single_images_dir)
                        # Fixed message: .name is the date itself, which
                        # printed "<date>/<date>"; use the parent folder name.
                        print(f" ✓ Deleted {single_images_dir.parent.name}/{date_str}")
                    # Clean old VRT files
                    for vrt_file in merged_vrt_dir.glob(f"merged_{date_str}.vrt"):
                        vrt_file.unlink()
                        print(f" ✓ Deleted {vrt_file.name}")
                except Exception as e:
                    print(f" ⚠️ Cleanup error: {e}")
        else:
            print(f"✗ Merge failed")
            sys.exit(1)
    print(f"\n{'='*70}")
    print(f"✓ Done!")
    print(f"Output: {base_path / 'merged_tif_8b' / f'{date_str}.tif'}")
    print(f"{'='*70}")
# Script entry point: run the full download-and-merge workflow.
if __name__ == '__main__':
    main()