"""
Script: download_planet_missing_dates.py
Purpose: Download Planet satellite data for missing dates only (skip existing files).
         Can be called from batch scripts or other Python scripts.

Usage:
    python download_planet_missing_dates.py --start 2026-01-17 --end 2026-12-20 --project angata
    python download_planet_missing_dates.py --start 2023-06-01 --end 2023-06-30 --project angata --dry-run

Environment variables (alternative to CLI args):
    DAYS: Number of days to download (default: 365)
    DATE: End date in YYYY-MM-DD format (default: today)
    PROJECT_DIR: Project name (default: angata)
"""

import os
import sys
import json
import datetime
import argparse
import subprocess
from pathlib import Path
from osgeo import gdal
import time
import shutil
import warnings
import numpy as np
import geopandas as gpd
from shapely.geometry import MultiPolygon, Polygon, MultiLineString, box
from shapely.ops import unary_union

# Suppress GDAL TIFF metadata warnings (9-band files trigger false positives)
warnings.filterwarnings('ignore', message='.*TIFFReadDirectory.*SamplesPerPixel.*')

from sentinelhub import (
    MimeType, CRS, BBox, SentinelHubRequest, SentinelHubDownloadClient,
    DataCollection, bbox_to_dimensions, SHConfig, BBoxSplitter, Geometry, SentinelHubCatalog
)


# ============================================================================
# CONFIGURATION
# ============================================================================

def _parse_cli_date(value):
    """Convert a YYYY-MM-DD string to a date, raising an argparse-friendly error."""
    try:
        return datetime.datetime.strptime(value, "%Y-%m-%d").date()
    except ValueError:
        raise argparse.ArgumentTypeError(
            f"invalid date {value!r} (expected YYYY-MM-DD)"
        ) from None


def get_config():
    """Parse command line arguments and environment variables.

    CLI arguments take precedence. The environment variables DATE, DAYS and
    PROJECT_DIR supply the defaults for --end, --days and --project.

    Returns:
        Dict with keys 'start_date' and 'end_date' (datetime.date),
        'project' (str), 'resolution' (int) and 'dry_run' (bool).
    """
    parser = argparse.ArgumentParser(description="Download Planet satellite data for missing dates")
    parser.add_argument('--start', type=_parse_cli_date, help='Start date (YYYY-MM-DD)', default=None)
    # argparse runs string defaults through `type`, so env values are
    # validated exactly like values given on the command line.
    parser.add_argument('--end', type=_parse_cli_date, help='End date (YYYY-MM-DD)',
                        default=os.getenv('DATE') or None)
    parser.add_argument('--project', type=str, default=os.getenv('PROJECT_DIR', 'angata'),
                        help='Project name (default: angata)')
    parser.add_argument('--resolution', type=int, default=3, help='Resolution in meters')
    parser.add_argument('--days', type=int, default=os.getenv('DAYS') or 365,
                        help='Days to download (if --start not specified)')
    parser.add_argument('--dry-run', action='store_true',
                        help='Show what would be downloaded without downloading')
    args = parser.parse_args()

    # Determine date range
    end_date = args.end if args.end else datetime.date.today()
    if args.start:
        start_date = args.start
    else:
        start_date = end_date - datetime.timedelta(days=args.days - 1)

    # An inverted range would otherwise be reported as "all dates downloaded".
    if start_date > end_date:
        parser.error(f"start date {start_date} is after end date {end_date}")

    return {
        'start_date': start_date,
        'end_date': end_date,
        'project': args.project,
        'resolution': args.resolution,
        'dry_run': args.dry_run
    }


# ============================================================================
# SETUP
# ============================================================================

# NOTE(security): credentials are hard-coded in source. They should be moved to
# environment variables or a Sentinel Hub config profile and rotated; the
# values are kept here unchanged so existing deployments keep working.
config = SHConfig()
config.sh_client_id = '1a72d811-4f0e-4447-8282-df09608cff44'
config.sh_client_secret = 'FcBlRL29i9ZmTzhmKTv1etSMFs5PxSos'

catalog = SentinelHubCatalog(config=config)

collection_id = '4e56d0cb-c402-40ff-97bb-c2b9e6bfcf2a'
byoc = DataCollection.define_byoc(
    collection_id,
    name='planet_data_8b',
    is_timeless=True
)


# ============================================================================
# FUNCTIONS
# ============================================================================

def setup_paths(project):
    """Create and return folder paths.

    Paths are relative to the current working directory
    (../laravel_app/storage/app/<project>).

    Args:
        project: Project folder name.

    Returns:
        Dict with 'base', 'single_images' and 'geojson' as Path objects and
        'merged_tifs' / 'virtual_raster' as strings.
    """
    BASE_PATH = Path('../laravel_app/storage/app') / project
    BASE_PATH_SINGLE_IMAGES = BASE_PATH / 'single_images_8b'
    folder_for_merged_tifs = str(BASE_PATH / 'merged_tif_8b')
    folder_for_virtual_raster = str(BASE_PATH / 'merged_virtual_8b')
    geojson_file = BASE_PATH / 'Data' / 'pivot.geojson'

    # Create folders if missing
    for folder in [BASE_PATH_SINGLE_IMAGES, folder_for_merged_tifs, folder_for_virtual_raster]:
        Path(folder).mkdir(parents=True, exist_ok=True)

    return {
        'base': BASE_PATH,
        'single_images': BASE_PATH_SINGLE_IMAGES,
        'merged_tifs': folder_for_merged_tifs,
        'virtual_raster': folder_for_virtual_raster,
        'geojson': geojson_file
    }


def get_existing_dates(merged_tifs_folder):
    """Get the set of dates (YYYY-MM-DD strings) that already have merged TIF files.

    Args:
        merged_tifs_folder: Folder containing files named YYYY-MM-DD.tif.

    Returns:
        Set of date strings; empty if the folder does not exist.
    """
    merged_tifs_path = Path(merged_tifs_folder)
    if not merged_tifs_path.exists():
        return set()

    existing_dates = set()
    for tif_file in merged_tifs_path.glob('*.tif'):
        # Filename format: YYYY-MM-DD.tif
        date_str = tif_file.stem
        try:
            datetime.datetime.strptime(date_str, "%Y-%m-%d")
        except ValueError:
            continue  # Ignore files that don't match date format
        existing_dates.add(date_str)

    return existing_dates


def get_missing_dates(start_date, end_date, existing_dates):
    """Generate list of missing dates to download.

    Args:
        start_date: First date of the range (inclusive).
        end_date: Last date of the range (inclusive).
        existing_dates: Collection of YYYY-MM-DD strings already downloaded.

    Returns:
        Chronologically ordered list of YYYY-MM-DD strings in the range that
        are not in existing_dates (empty if start_date > end_date).
    """
    missing_dates = []
    current_date = start_date
    one_day = datetime.timedelta(days=1)
    while current_date <= end_date:
        date_str = current_date.strftime('%Y-%m-%d')
        if date_str not in existing_dates:
            missing_dates.append(date_str)
        current_date += one_day
    return missing_dates


def setup_bbox_list_clustered(geojson_file, resolution=3, max_pixels=2500):
    """
    Load field geometries and create clustered BBox list.

    Instead of a uniform grid over the entire area, this creates bboxes ONLY
    around field clusters, eliminating PU waste on empty space between
    scattered fields.

    Args:
        geojson_file: Path to pivot.geojson
        resolution: Resolution in meters
        max_pixels: Max image dimension (SentinelHub limit)

    Returns:
        List of BBox objects covering field clusters, or None if the GeoJSON
        could not be loaded.
    """
    try:
        geo_json = gpd.read_file(str(geojson_file))
    except Exception as e:
        print(f"ERROR: Failed to load GeoJSON: {e}")
        return None

    # Features without a usable geometry would break centroid/bounds math.
    geometries = [
        geom for geom in geo_json.geometry.tolist()
        if geom is not None and not geom.is_empty
    ]

    # Step 1: Cluster fields by proximity (tight threshold for small, efficient clusters)
    clusters = cluster_fields_by_proximity(geometries, threshold_km=1)
    print(f"\n✓ Detected {len(clusters)} field cluster(s)")

    # Step 2: Create bbox for each cluster (no buffer - will mosaic daily images anyway)
    bbox_list = []
    max_size_m = max_pixels * resolution

    for i, cluster_geoms in enumerate(clusters, 1):
        # Get cluster bounds (tight around actual fields)
        minx, miny, maxx, maxy = unary_union(cluster_geoms).bounds

        # Check size and split if needed (rough degrees -> meters conversion)
        width_m = (maxx - minx) * 111320
        height_m = (maxy - miny) * 111320

        if width_m <= max_size_m and height_m <= max_size_m:
            # Single bbox for this cluster
            bbox_list.append(BBox(bbox=[minx, miny, maxx, maxy], crs=CRS.WGS84))
            print(f" Cluster {i}: {len(cluster_geoms)} field(s) → 1 bbox ({width_m:.0f}m × {height_m:.0f}m)")
        else:
            # Need to split this large cluster; honour the caller's pixel limit
            sub_grid = calculate_dynamic_grid(
                cluster_geoms, resolution=resolution, max_pixels=max_pixels
            )
            sub_splitter = BBoxSplitter(cluster_geoms, CRS.WGS84, sub_grid, reduce_bbox_sizes=True)
            sub_bboxes = sub_splitter.get_bbox_list()
            bbox_list.extend(sub_bboxes)
            print(f" Cluster {i}: {len(cluster_geoms)} field(s) → {len(sub_bboxes)} bbox(es) (large cluster split)")

    return bbox_list


def cluster_fields_by_proximity(geometries, threshold_km=3.0):
    """
    Cluster field geometries by proximity.

    Fields whose centroid lies within `threshold_km` of a cluster's seed
    centroid are grouped into that cluster.

    Uses a simple greedy approach:
    - Start with first ungrouped field
    - Find all ungrouped fields within threshold of it
    - Repeat until all grouped

    Args:
        geometries: List of Shapely geometries
        threshold_km: Distance threshold in kilometers

    Returns:
        List of clusters, where each cluster is a list of geometries
    """
    # Get centroids
    centroids = [geom.centroid.coords[0] for geom in geometries]

    # Convert km to degrees (rough)
    threshold_deg = threshold_km / 111.0

    clusters = []
    used = set()

    for i, centroid in enumerate(centroids):
        if i in used:
            continue

        # Start new cluster with this field
        cluster_indices = [i]
        used.add(i)

        # Every index below i is already assigned, so only later fields can join.
        for j in range(i + 1, len(centroids)):
            if j in used:
                continue
            other_centroid = centroids[j]
            dist = np.sqrt((centroid[0] - other_centroid[0])**2 +
                           (centroid[1] - other_centroid[1])**2)
            if dist < threshold_deg:
                cluster_indices.append(j)
                used.add(j)

        clusters.append([geometries[idx] for idx in cluster_indices])

    return clusters


def setup_bbox_list(geojson_file, resolution=3):
    """Load field geometries and create BBox list (clustered approach)."""
    return setup_bbox_list_clustered(geojson_file, resolution=resolution)


def calculate_dynamic_grid(shapely_geometries, resolution=3, max_pixels=2500):
    """Calculate optimal grid size for BBox splitting.

    Args:
        shapely_geometries: Iterable of Shapely geometries (any type).
        resolution: Resolution in meters.
        max_pixels: Max image dimension in pixels per grid cell.

    Returns:
        Tuple (nx, ny) with the number of columns and rows, each >= 1.
        Returns (1, 1) when there is no non-empty geometry.
    """
    # Bounds are taken straight from each geometry, so any geometry type works
    # and no intermediate MultiPolygon has to be constructed.
    all_bounds = [
        geom.bounds for geom in shapely_geometries
        if geom is not None and not geom.is_empty
    ]
    if not all_bounds:
        return (1, 1)

    minx = min(b[0] for b in all_bounds)
    miny = min(b[1] for b in all_bounds)
    maxx = max(b[2] for b in all_bounds)
    maxy = max(b[3] for b in all_bounds)

    # Rough degrees -> meters conversion
    width_m = (maxx - minx) * 111320
    height_m = (maxy - miny) * 111320
    max_size_m = max_pixels * resolution

    nx = max(1, int(np.ceil(width_m / max_size_m)))
    ny = max(1, int(np.ceil(height_m / max_size_m)))

    return (nx, ny)


def is_image_available(slot, bbox_list, collection_id):
    """Check if Planet imagery is available for the given date.

    Only the first bbox in bbox_list is queried. The check fails open: it
    returns True when bbox_list is empty or when the catalog search raises.

    Args:
        slot: Date string used as both ends of the search interval.
        bbox_list: List of BBox objects.
        collection_id: BYOC collection id to search.

    Returns:
        True if at least one tile was found (or the check could not be made).
    """
    try:
        test_bbox = bbox_list[0] if bbox_list else None
        if test_bbox is None:
            return True

        search_results = catalog.search(
            collection=DataCollection.define_byoc(collection_id),
            bbox=test_bbox,
            time=(slot, slot),
            filter=None
        )

        tiles = list(search_results)
        available = len(tiles) > 0

        if available:
            print(f" ✓ Imagery available for {slot}")
        else:
            print(f" ✗ No imagery found for {slot}")

        return available

    except Exception as e:
        print(f" ⚠ Error checking availability for {slot}: {e}")
        return True


def download_function(slot, bbox, size, base_path_single_images, dry_run=False):
    """Download Planet imagery for a specific date and bbox.

    Errors are printed, not raised.

    Args:
        slot: Date string (also used as the output sub-folder name).
        bbox: BBox to request.
        size: Output image size passed to SentinelHubRequest.
        base_path_single_images: Base folder for per-date downloads.
        dry_run: If True, only print what would be downloaded.
    """
    if dry_run:
        print(f" [DRY-RUN] Would download {slot}")
        return

    try:
        request = SentinelHubRequest(
            evalscript=get_evalscript(),
            input_data=[
                SentinelHubRequest.input_data(
                    data_collection=byoc,
                    time_interval=(slot, slot)
                )
            ],
            responses=[
                SentinelHubRequest.output_response('default', MimeType.TIFF)
            ],
            bbox=bbox,
            size=size,
            config=config,
            data_folder=str(Path(base_path_single_images) / slot),
        )

        list_of_requests = [request.download_list[0]]
        # Use max_threads=1 to respect SentinelHub rate limits
        SentinelHubDownloadClient(config=config).download(list_of_requests, max_threads=1)
        print(f' ✓ Downloaded image for {slot}')

        # Pause 1.0s between requests to avoid rate limit warnings
        time.sleep(1.0)

    except Exception as e:
        print(f' ✗ Error downloading {slot}: {e}')


def merge_files(slot, base_path_single_images, merged_tifs_folder, virtual_raster_folder, dry_run=False):
    """Merge downloaded tiles for a specific date.

    Builds a VRT over all response.tiff files of the date, translates it to a
    compressed GeoTIFF named <slot>.tif, then removes the per-date tile folder.

    Args:
        slot: Date string (sub-folder name and output file stem).
        base_path_single_images: Base folder holding per-date downloads.
        merged_tifs_folder: Output folder for merged GeoTIFFs.
        virtual_raster_folder: Output folder for the intermediate VRT.
        dry_run: If True, only print what would be merged.

    Returns:
        True on success (or dry-run with tiles present), False otherwise.
    """
    slot_dir = Path(base_path_single_images) / slot
    # Sorted so the mosaic order does not depend on filesystem listing order.
    file_list = sorted(str(p) for p in slot_dir.rglob('response.tiff') if p.is_file())

    if not file_list:
        print(f" ✗ No response.tiff files found for {slot}")
        return False

    if dry_run:
        print(f" [DRY-RUN] Would merge {len(file_list)} tiles for {slot}")
        return True

    merged_tif_path = str(Path(merged_tifs_folder) / f"{slot}.tif")
    merged_vrt_path = str(Path(virtual_raster_folder) / f"merged{slot}.vrt")

    try:
        vrt_all = gdal.BuildVRT(merged_vrt_path, file_list)
        if vrt_all is None:
            print(f" ✗ Failed to create VRT for {slot}")
            return False
        vrt_all = None  # Dereference so GDAL flushes the VRT to disk

        options = gdal.TranslateOptions(
            outputType=gdal.GDT_Float32,
            creationOptions=[
                'COMPRESS=LZW',
                'TILED=YES',
                'BLOCKXSIZE=256',
                'BLOCKYSIZE=256',
                'NUM_THREADS=ALL_CPUS'
            ]
        )
        result = gdal.Translate(merged_tif_path, merged_vrt_path, options=options)
        if result is None:
            print(f" ✗ Failed to translate VRT to TIFF for {slot}")
            return False
        result = None  # Dereference so GDAL flushes and closes the GeoTIFF

        print(f" ✓ Merged {len(file_list)} tiles for {slot}")

        # Clean up single images folder for this date (best effort)
        try:
            shutil.rmtree(slot_dir)
            print(f" ✓ Cleaned up single images for {slot}")
        except Exception as e:
            print(f" ⚠ Could not clean up {slot_dir}: {e}")

        return True

    except Exception as e:
        print(f" ✗ Exception while processing {slot}: {e}")
        return False


def get_evalscript():
    """Return Planet Scope evalscript with 8 bands + UDM1."""
    return """
    //VERSION=3

    function setup() {
        return {
            input: [{
                bands: ["coastal_blue", "blue", "green_i", "green", "yellow", "red", "rededge", "nir", "udm1"],
                units: "DN"
            }],
            output: {
                bands: 9,
                sampleType: "FLOAT32"
            }
        };
    }

    function evaluatePixel(sample) {
        var scaledCoastalBlue = 2.5 * sample.coastal_blue / 10000;
        var scaledBlue = 2.5 * sample.blue / 10000;
        var scaledGreenI = 2.5 * sample.green_i / 10000;
        var scaledGreen = 2.5 * sample.green / 10000;
        var scaledYellow = 2.5 * sample.yellow / 10000;
        var scaledRed = 2.5 * sample.red / 10000;
        var scaledRedEdge = 2.5 * sample.rededge / 10000;
        var scaledNIR = 2.5 * sample.nir / 10000;
        var udm1 = sample.udm1;

        return [scaledCoastalBlue, scaledBlue, scaledGreenI, scaledGreen,
                scaledYellow, scaledRed, scaledRedEdge, scaledNIR, udm1];
    }
    """


# ============================================================================
# MAIN
# ============================================================================

def main():
    """Find dates without a merged TIF and download each via the optimized script.

    Each missing date is handled by running 00_download_8band_pu_optimized.py
    (resolved relative to the current working directory) as a subprocess.

    Returns:
        Process exit code: 0 if nothing was missing, in dry-run mode, or if
        every date succeeded; 1 if the GeoJSON is missing or any date failed.
    """
    print("="*80)
    print("PLANET SATELLITE DATA DOWNLOADER - MISSING DATES ONLY")
    print("Wrapper for 00_download_8band_pu_optimized.py")
    print("="*80)

    config_dict = get_config()
    print("\nConfiguration:")
    print(f" Start date: {config_dict['start_date']}")
    print(f" End date: {config_dict['end_date']}")
    print(f" Project: {config_dict['project']}")
    print(f" Resolution: {config_dict['resolution']}m")
    if config_dict['dry_run']:
        print(" Mode: DRY-RUN (no actual downloads)")

    # Setup paths
    paths = setup_paths(config_dict['project'])
    print("\nPaths:")
    print(f" Merged TIFs: {paths['merged_tifs']}")
    print(f" GeoJSON: {paths['geojson']}")

    # Check GeoJSON exists
    if not paths['geojson'].exists():
        print(f"\nERROR: GeoJSON not found at {paths['geojson']}")
        return 1

    # Get existing dates
    print("\nScanning existing dates...")
    existing_dates = get_existing_dates(paths['merged_tifs'])
    print(f" Found {len(existing_dates)} existing dates")

    # Get missing dates
    print("\nFinding missing dates...")
    missing_dates = get_missing_dates(
        config_dict['start_date'],
        config_dict['end_date'],
        existing_dates
    )
    print(f" {len(missing_dates)} dates to download")

    if not missing_dates:
        print("\n✓ All dates already downloaded!")
        return 0

    # Show missing date range (first/last three when the list is long)
    print(f"\n Date range: {missing_dates[0]} to {missing_dates[-1]}")
    if len(missing_dates) <= 10:
        for day in missing_dates:
            print(f" - {day}")
    else:
        for day in missing_dates[:3]:
            print(f" - {day}")
        print(f" ... ({len(missing_dates) - 6} more) ...")
        for day in missing_dates[-3:]:
            print(f" - {day}")

    if config_dict['dry_run']:
        print("\n[DRY-RUN] Would download above dates using 00_download_8band_pu_optimized.py")
        return 0

    # Download each missing date using the optimized downloader
    print(f"\n{'='*80}")
    print("Downloading missing dates using optimized script...")
    print(f"{'='*80}")

    success_count = 0
    for i, date_str in enumerate(missing_dates, 1):
        print(f"\n[{i}/{len(missing_dates)}] Downloading {date_str}...")

        # Call 00_download_8band_pu_optimized.py for this date
        cmd = [
            sys.executable,
            "00_download_8band_pu_optimized.py",
            config_dict['project'],
            "--date", date_str,
            "--resolution", str(config_dict['resolution']),
            "--cleanup"
        ]

        try:
            subprocess.run(cmd, check=True, capture_output=False)
        except subprocess.CalledProcessError as e:
            print(f" ✗ Failed to download {date_str}: {e}")
            # Continue with next date instead of stopping
            continue

        success_count += 1
        print(f" ✓ Successfully downloaded {date_str}")

    # Summary
    print(f"\n{'='*80}")
    print("SUMMARY:")
    print(f" Successfully processed: {success_count}/{len(missing_dates)} dates")
    print(f" Output folder: {paths['merged_tifs']}")
    print(f"{'='*80}")

    return 0 if success_count == len(missing_dates) else 1


if __name__ == "__main__":
    sys.exit(main())