{ "cells": [ { "cell_type": "markdown", "id": "5ea10771", "metadata": {}, "source": [ "# Cloud Detection - Step 1: Identify Cloudy Images\n", "\n", "This notebook downloads Planet imagery for the **Aura** project (last 3 weeks) and helps identify which images contain clouds.\n", "\n", "**Workflow:**\n", "1. Connect to SentinelHub\n", "2. Define Aura project area\n", "3. Download images from last 3 weeks\n", "4. Generate quick-look visualizations\n", "5. Identify cloudy images for testing with OmniCloudMask" ] }, { "cell_type": "markdown", "id": "4f43a8b9", "metadata": {}, "source": [ "## 1. Setup and Imports" ] }, { "cell_type": "code", "execution_count": null, "id": "1b300ebc", "metadata": {}, "outputs": [], "source": [ "# Install required packages (uncomment if needed)\n", "# !pip install sentinelhub\n", "# !pip install geopandas matplotlib pillow\n", "\n", "import os\n", "import json\n", "import datetime\n", "import numpy as np\n", "import matplotlib.pyplot as plt\n", "from pathlib import Path\n", "from osgeo import gdal\n", "\n", "from sentinelhub import (\n", " MimeType, CRS, BBox, SentinelHubRequest, SentinelHubDownloadClient,\n", " DataCollection, bbox_to_dimensions, SHConfig, BBoxSplitter, Geometry, SentinelHubCatalog\n", ")\n", "\n", "import time\n", "import shutil\n", "import geopandas as gpd\n", "from shapely.geometry import MultiLineString, MultiPolygon, Polygon\n", "from PIL import Image" ] }, { "cell_type": "markdown", "id": "6b0d9534", "metadata": {}, "source": [ "## 2. 
Configure SentinelHub" ] }, { "cell_type": "code", "execution_count": null, "id": "72a2d6ca", "metadata": {}, "outputs": [], "source": [ "# Credentials come from the environment -- never hardcode secrets in a notebook.\n", "# Export SH_CLIENT_ID and SH_CLIENT_SECRET before starting the kernel.\n", "config = SHConfig()\n", "config.sh_client_id = os.environ['SH_CLIENT_ID']\n", "config.sh_client_secret = os.environ['SH_CLIENT_SECRET']\n", "\n", "catalog = SentinelHubCatalog(config=config)\n", "\n", "# Define BYOC collection\n", "collection_id = 'c691479f-358c-46b1-b0f0-e12b70a9856c'\n", "byoc = DataCollection.define_byoc(\n", "    collection_id,\n", "    name='planet_data2',\n", "    is_timeless=True\n", ")\n", "\n", "print(\"✓ SentinelHub configured\")" ] }, { "cell_type": "markdown", "id": "b43e776d", "metadata": {}, "source": [ "## 3. Define Project and Paths" ] }, { "cell_type": "code", "execution_count": null, "id": "595021b5", "metadata": {}, "outputs": [], "source": [ "project = 'aura'\n", "resolution = 3  # 3m resolution for Planet\n", "\n", "# Define paths\n", "BASE_PATH = Path('../laravel_app/storage/app') / project\n", "BASE_PATH_SINGLE_IMAGES = BASE_PATH / 'cloud_test_single_images'\n", "folder_for_merged_tifs = BASE_PATH / 'cloud_test_merged_tif'\n", "folder_for_virtual_raster = BASE_PATH / 'cloud_test_merged_virtual'\n", "geojson_file = BASE_PATH / 'Data' / 'pivot.geojson'\n", "\n", "# Create folders if they don't exist\n", "for folder in [BASE_PATH_SINGLE_IMAGES, folder_for_merged_tifs, folder_for_virtual_raster]:\n", "    folder.mkdir(parents=True, exist_ok=True)\n", "\n", "print(f\"Project: {project}\")\n", "print(f\"Base path: {BASE_PATH}\")\n", "print(f\"GeoJSON: {geojson_file}\")\n", "print(f\"✓ Folders created/verified\")" ] }, { "cell_type": "markdown", "id": "ca46160a", "metadata": {}, "source": [ "## 4. 
Define Time Period (Last 3 Weeks)" ] }, { "cell_type": "code", "execution_count": null, "id": "1e6d4013", "metadata": {}, "outputs": [], "source": [ "# Calculate last 3 weeks (21 days)\n", "end_date = datetime.date.today()\n", "start_date = end_date - datetime.timedelta(days=21)\n", "\n", "# Generate daily slots\n", "days_needed = 21\n", "slots = [(start_date + datetime.timedelta(days=i)).strftime('%Y-%m-%d') for i in range(days_needed)]\n", "\n", "print(f\"Date range: {start_date} to {end_date}\")\n", "print(f\"Total days: {len(slots)}\")\n", "print(f\"\\nFirst 5 dates: {slots[:5]}\")\n", "print(f\"Last 5 dates: {slots[-5:]}\")" ] }, { "cell_type": "markdown", "id": "df16c395", "metadata": {}, "source": [ "## 5. Load Field Boundaries and Create BBox Grid" ] }, { "cell_type": "code", "execution_count": null, "id": "cf88f697", "metadata": {}, "outputs": [], "source": [ "# Load GeoJSON\n", "geo_json = gpd.read_file(str(geojson_file))\n", "print(f\"Loaded {len(geo_json)} field polygons\")\n", "\n", "# Create geometries\n", "geometries = [Geometry(geometry, crs=CRS.WGS84) for geometry in geo_json.geometry]\n", "shapely_geometries = [geometry.geometry for geometry in geometries]\n", "\n", "# Get total bounds\n", "from shapely.geometry import box\n", "total_bounds = geo_json.total_bounds # [minx, miny, maxx, maxy]\n", "print(f\"\\nTotal bounds: {total_bounds}\")\n", "\n", "# Calculate approximate image size for single bbox\n", "single_bbox_test = BBox(bbox=tuple(total_bounds), crs=CRS.WGS84)\n", "single_size = bbox_to_dimensions(single_bbox_test, resolution=resolution)\n", "print(f\"Single bbox would create image of: {single_size[0]} x {single_size[1]} pixels\")\n", "\n", "# SentinelHub limit is 2500x2500 pixels\n", "if single_size[0] > 2500 or single_size[1] > 2500:\n", " print(f\"⚠️ Image too large for single download (max 2500x2500)\")\n", " print(f\" Using 2x2 grid to split into smaller tiles...\")\n", " \n", " # Use BBoxSplitter with 2x2 grid\n", " bbox_splitter = 
BBoxSplitter(\n", " shapely_geometries, CRS.WGS84, (2, 2), reduce_bbox_sizes=True\n", " )\n", " bbox_list = bbox_splitter.get_bbox_list()\n", " print(f\" Split into {len(bbox_list)} tiles\")\n", "else:\n", " print(f\"✓ Single bbox works - using 1 tile per date\")\n", " bbox_list = [single_bbox_test]\n", "\n", "# Verify tile sizes\n", "print(f\"\\nVerifying tile sizes:\")\n", "for i, bbox in enumerate(bbox_list, 1):\n", " size = bbox_to_dimensions(bbox, resolution=resolution)\n", " status = \"✓\" if size[0] <= 2500 and size[1] <= 2500 else \"✗\"\n", " print(f\" Tile {i}: {size[0]} x {size[1]} pixels {status}\")\n" ] }, { "cell_type": "markdown", "id": "f78964df", "metadata": {}, "source": [ "## 6. Check Image Availability" ] }, { "cell_type": "markdown", "id": "09c2fcc6", "metadata": {}, "source": [ "## 5.5. Visualize Download Grid (Optional)" ] }, { "cell_type": "code", "execution_count": null, "id": "1e1a7660", "metadata": {}, "outputs": [], "source": [ "# Visualize the download grid to ensure good coverage\n", "fig, ax = plt.subplots(1, 1, figsize=(12, 12))\n", "\n", "# Plot field boundaries\n", "geo_json.boundary.plot(ax=ax, color='green', linewidth=2, label='Fields')\n", "\n", "# Plot bboxes\n", "for i, bbox in enumerate(bbox_list):\n", " bbox_geom = box(bbox[0], bbox[1], bbox[2], bbox[3])\n", " x, y = bbox_geom.exterior.xy\n", " ax.plot(x, y, 'r--', linewidth=1, alpha=0.7)\n", " # Add bbox number\n", " centroid = bbox_geom.centroid\n", " ax.text(centroid.x, centroid.y, str(i+1), fontsize=10, ha='center', \n", " bbox=dict(boxstyle='round', facecolor='yellow', alpha=0.5))\n", "\n", "ax.set_xlabel('Longitude')\n", "ax.set_ylabel('Latitude')\n", "ax.set_title('Download Grid (Red) vs Field Boundaries (Green)', fontsize=14, fontweight='bold')\n", "ax.legend()\n", "ax.grid(True, alpha=0.3)\n", "plt.tight_layout()\n", "plt.show()\n", "\n", "print(f\"✓ Visualization complete - verify that red boxes cover green field boundaries\")\n" ] }, { "cell_type": "code", 
"execution_count": null, "id": "2fcded08", "metadata": {}, "outputs": [], "source": [ "def is_image_available(date):\n", "    \"\"\"Return True if at least one Planet scene covers any tile on `date`.\n", "\n", "    Short-circuits on the first catalog hit instead of materialising the\n", "    full (paginated) result list for every tile, which is wasteful when\n", "    probing 21 dates x N tiles.\n", "    \"\"\"\n", "    for bbox in bbox_list:\n", "        search_iterator = catalog.search(\n", "            collection=byoc,\n", "            bbox=bbox,\n", "            time=(date, date)\n", "        )\n", "        # One match is enough to call the date available\n", "        if next(iter(search_iterator), None) is not None:\n", "            return True\n", "    return False\n", "\n", "# Filter to available dates only\n", "print(\"Checking image availability...\")\n", "available_slots = [slot for slot in slots if is_image_available(slot)]\n", "\n", "print(f\"\\n{'='*60}\")\n", "print(f\"Total requested dates: {len(slots)}\")\n", "print(f\"Available dates: {len(available_slots)}\")\n", "print(f\"Excluded (no data): {len(slots) - len(available_slots)}\")\n", "print(f\"{'='*60}\")\n", "print(f\"\\nAvailable dates:\")\n", "for slot in available_slots:\n", "    print(f\"  - {slot}\")" ] }, { "cell_type": "markdown", "id": "b67f5deb", "metadata": {}, "source": [ "## 7. 
Define Download Functions" ] }, { "cell_type": "code", "execution_count": null, "id": "26cd367f", "metadata": {}, "outputs": [], "source": [ "# Evalscript to get RGB + NIR + UDM1 mask\n", "# NOTE: Not specifying sampleType makes SentinelHub auto-convert 0-1 float to 0-255 byte\n", "# This matches the production script behavior\n", "evalscript_with_udm = \"\"\"\n", " //VERSION=3\n", "\n", " function setup() {\n", " return {\n", " input: [{\n", " bands: [\"red\", \"green\", \"blue\", \"nir\", \"udm1\"]\n", " }],\n", " output: {\n", " bands: 5\n", " // sampleType: \"FLOAT32\" - commented out to get 0-255 byte output like production\n", " }\n", " };\n", " }\n", "\n", " function evaluatePixel(sample) {\n", " // Return all bands including udm1 (last band)\n", " return [\n", " 2.5 * sample.red / 10000,\n", " 2.5 * sample.green / 10000,\n", " 2.5 * sample.blue / 10000,\n", " 2.5 * sample.nir / 10000,\n", " sample.udm1 // 0 = usable, 1 = unusable (clouds, shadows, etc.)\n", " ];\n", " }\n", "\"\"\"\n", "\n", "def get_download_request(time_interval, bbox, size):\n", " \"\"\"Create a SentinelHub request for a given date and bbox.\"\"\"\n", " return SentinelHubRequest(\n", " evalscript=evalscript_with_udm,\n", " input_data=[\n", " SentinelHubRequest.input_data(\n", " data_collection=DataCollection.planet_data2,\n", " time_interval=(time_interval, time_interval)\n", " )\n", " ],\n", " responses=[\n", " SentinelHubRequest.output_response('default', MimeType.TIFF)\n", " ],\n", " bbox=bbox,\n", " size=size,\n", " config=config,\n", " data_folder=str(BASE_PATH_SINGLE_IMAGES / time_interval),\n", " )\n", "\n", "def download_for_date_and_bbox(slot, bbox, size):\n", " \"\"\"Download image for a specific date and bounding box.\"\"\"\n", " list_of_requests = [get_download_request(slot, bbox, size)]\n", " list_of_requests = [request.download_list[0] for request in list_of_requests]\n", " \n", " data = SentinelHubDownloadClient(config=config).download(list_of_requests, max_threads=5)\n", 
" time.sleep(0.1)\n", " return data\n", "\n", "def merge_tiles_for_date(slot):\n", " \"\"\"Merge all tiles for a given date into one GeoTIFF.\"\"\"\n", " # List downloaded tiles\n", " file_list = [str(x / \"response.tiff\") for x in Path(BASE_PATH_SINGLE_IMAGES / slot).iterdir() if x.is_dir()]\n", " \n", " if not file_list:\n", " print(f\" No tiles found for {slot}\")\n", " return None\n", " \n", " vrt_path = str(folder_for_virtual_raster / f\"merged_{slot}.vrt\")\n", " output_path = str(folder_for_merged_tifs / f\"{slot}.tif\")\n", " \n", " # Create virtual raster with proper options\n", " vrt_options = gdal.BuildVRTOptions(\n", " resolution='highest',\n", " separate=False,\n", " addAlpha=False\n", " )\n", " vrt = gdal.BuildVRT(vrt_path, file_list, options=vrt_options)\n", " vrt = None # Close\n", " \n", " # Convert to GeoTIFF with proper options\n", " # Use COMPRESS=LZW to save space, TILED for better performance\n", " translate_options = gdal.TranslateOptions(\n", " creationOptions=['COMPRESS=LZW', 'TILED=YES', 'BIGTIFF=IF_SAFER']\n", " )\n", " gdal.Translate(output_path, vrt_path, options=translate_options)\n", " \n", " return output_path\n", "\n", "print(\"✓ Download functions defined\")" ] }, { "cell_type": "markdown", "id": "e9f17ba8", "metadata": {}, "source": [ "## 8. 
Download Images" ] }, { "cell_type": "code", "execution_count": null, "id": "e66173ea", "metadata": {}, "outputs": [], "source": [ "print(f\"Starting download for {len(available_slots)} dates...\\n\")\n", "\n", "for i, slot in enumerate(available_slots, 1):\n", " print(f\"[{i}/{len(available_slots)}] Downloading {slot}...\")\n", " \n", " for j, bbox in enumerate(bbox_list, 1):\n", " bbox_obj = BBox(bbox=bbox, crs=CRS.WGS84)\n", " size = bbox_to_dimensions(bbox_obj, resolution=resolution)\n", " \n", " try:\n", " download_for_date_and_bbox(slot, bbox_obj, size)\n", " print(f\" ✓ Tile {j}/{len(bbox_list)} downloaded\")\n", " except Exception as e:\n", " print(f\" ✗ Tile {j}/{len(bbox_list)} failed: {e}\")\n", " \n", " time.sleep(0.2)\n", " \n", " print()\n", "\n", "print(\"\\n✓ All downloads complete!\")" ] }, { "cell_type": "markdown", "id": "e4bec74c", "metadata": {}, "source": [ "## 9. Merge Tiles into Single Images" ] }, { "cell_type": "code", "execution_count": null, "id": "e9b270be", "metadata": {}, "outputs": [], "source": [ "print(\"Merging tiles for each date...\\n\")\n", "\n", "merged_files = {}\n", "for slot in available_slots:\n", " print(f\"Merging {slot}...\")\n", " output_path = merge_tiles_for_date(slot)\n", " if output_path:\n", " merged_files[slot] = output_path\n", " print(f\" ✓ Saved to: {output_path}\")\n", " else:\n", " print(f\" ✗ Failed to merge\")\n", "\n", "print(f\"\\n✓ Successfully merged {len(merged_files)} images\")" ] }, { "cell_type": "markdown", "id": "ec3f1a6d", "metadata": {}, "source": [ "## 10. 
Analyze Cloud Coverage Using UDM1" ] }, { "cell_type": "code", "execution_count": null, "id": "9f4047e5", "metadata": {}, "outputs": [], "source": [ "def analyze_cloud_coverage(tif_path):\n", "    \"\"\"Calculate cloud coverage percentage using the UDM1 band (band 5).\n", "\n", "    Returns (cloud_percentage, udm_array), or (None, None) if the file\n", "    cannot be opened.\n", "    \"\"\"\n", "    ds = gdal.Open(tif_path)\n", "    if ds is None:\n", "        return None, None\n", "\n", "    # Band 5 is UDM1. Because the evalscript omits sampleType, SentinelHub\n", "    # auto-scales the 0/1 mask to 0-255 bytes: 0 = clear, non-zero = unusable.\n", "    udm_band = ds.GetRasterBand(5).ReadAsArray()\n", "\n", "    total_pixels = udm_band.size\n", "    # Count any non-zero pixel as cloudy; `== 1` would miss the scaled 255 values\n", "    # and report ~0% clouds on every image.\n", "    cloudy_pixels = np.sum(udm_band > 0)\n", "    cloud_percentage = (cloudy_pixels / total_pixels) * 100\n", "\n", "    ds = None  # Close the GDAL dataset\n", "    return cloud_percentage, udm_band\n", "\n", "# Analyze all images\n", "cloud_stats = {}\n", "print(\"Analyzing cloud coverage...\\n\")\n", "print(f\"{'Date':<12} {'Cloud %':<10} {'Status'}\")\n", "print(\"-\" * 40)\n", "\n", "for date, path in sorted(merged_files.items()):\n", "    cloud_pct, _ = analyze_cloud_coverage(path)\n", "    if cloud_pct is not None:\n", "        cloud_stats[date] = cloud_pct\n", "\n", "        # Categorize\n", "        if cloud_pct < 5:\n", "            status = \"☀️ Clear\"\n", "        elif cloud_pct < 20:\n", "            status = \"🌤️ Mostly clear\"\n", "        elif cloud_pct < 50:\n", "            status = \"⛅ Partly cloudy\"\n", "        else:\n", "            status = \"☁️ Very cloudy\"\n", "\n", "        print(f\"{date:<12} {cloud_pct:>6.2f}% {status}\")\n", "\n", "print(f\"\\n✓ Analysis complete for {len(cloud_stats)} images\")" ] }, { "cell_type": "markdown", "id": "3d966858", "metadata": {}, "source": [ "## 11. 
Visualize Images with Cloud Coverage" ] }, { "cell_type": "code", "execution_count": null, "id": "f8b2b2fc", "metadata": {}, "outputs": [], "source": [ "def create_quicklook(tif_path, date, cloud_pct):\n", "    \"\"\"Create an RGB quicklook figure with a UDM1 cloud overlay.\n", "\n", "    Returns the matplotlib Figure, or None if the file cannot be opened.\n", "    \"\"\"\n", "    ds = gdal.Open(tif_path)\n", "    if ds is None:\n", "        return None\n", "\n", "    # Read RGB bands (1=R, 2=G, 3=B) and the UDM1 mask (band 5)\n", "    red = ds.GetRasterBand(1).ReadAsArray()\n", "    green = ds.GetRasterBand(2).ReadAsArray()\n", "    blue = ds.GetRasterBand(3).ReadAsArray()\n", "    udm = ds.GetRasterBand(5).ReadAsArray()\n", "\n", "    # Bands arrive as 0-255 bytes (evalscript omits sampleType), so normalize\n", "    # to 0-1 floats for imshow; clipping raw bytes to [0, 1] would render an\n", "    # almost entirely white image.\n", "    rgb = np.dstack([red, green, blue]).astype(np.float32) / 255.0\n", "    rgb = np.clip(rgb, 0, 1)\n", "\n", "    # Create figure\n", "    fig, axes = plt.subplots(1, 2, figsize=(14, 6))\n", "\n", "    # RGB image\n", "    axes[0].imshow(rgb)\n", "    axes[0].set_title(f\"RGB - {date}\", fontsize=14, fontweight='bold')\n", "    axes[0].axis('off')\n", "\n", "    # UDM1 mask (clouds in red); the auto-scaled mask is 0 or 255, so test non-zero\n", "    cloud_overlay = rgb.copy()\n", "    cloud_overlay[udm > 0] = [1, 0, 0]  # Red for clouds\n", "    axes[1].imshow(cloud_overlay)\n", "    axes[1].set_title(f\"Cloud Mask (UDM1) - {cloud_pct:.1f}% cloudy\", fontsize=14, fontweight='bold')\n", "    axes[1].axis('off')\n", "\n", "    plt.tight_layout()\n", "    ds = None  # Close the GDAL dataset\n", "    return fig\n", "\n", "# Display images sorted by cloud coverage (most cloudy first)\n", "sorted_by_clouds = sorted(cloud_stats.items(), key=lambda x: x[1], reverse=True)\n", "\n", "print(\"Generating visualizations...\\n\")\n", "for date, cloud_pct in sorted_by_clouds[:5]:  # Show top 5 cloudiest\n", "    if date in merged_files:\n", "        fig = create_quicklook(merged_files[date], date, cloud_pct)\n", "        if fig:\n", "            plt.show()\n", "            plt.close()\n", "\n", "print(\"✓ Visualizations complete\")" ] }, { "cell_type": "markdown", "id": "94de1b4b", "metadata": {}, "source": [ "## 12. 
Select Candidate Images for OmniCloudMask Testing" ] }, { "cell_type": "code", "execution_count": null, "id": "4ae8c727", "metadata": {}, "outputs": [], "source": [ "# Select images with moderate to high cloud coverage (20-70%)\n", "# These are good candidates for testing cloud detection\n", "test_candidates = [\n", " (date, cloud_pct, merged_files[date]) \n", " for date, cloud_pct in cloud_stats.items() \n", " if 20 <= cloud_pct <= 70\n", "]\n", "\n", "test_candidates.sort(key=lambda x: x[1], reverse=True)\n", "\n", "print(\"\\n\" + \"=\"*60)\n", "print(\"RECOMMENDED IMAGES FOR OMNICLOUDMASK TESTING\")\n", "print(\"=\"*60)\n", "print(f\"\\n{'Rank':<6} {'Date':<12} {'Cloud %':<10} {'Path'}\")\n", "print(\"-\" * 80)\n", "\n", "for i, (date, cloud_pct, path) in enumerate(test_candidates[:5], 1):\n", " print(f\"{i:<6} {date:<12} {cloud_pct:>6.2f}% {path}\")\n", "\n", "if test_candidates:\n", " print(f\"\\n✓ Top candidate: {test_candidates[0][0]} ({test_candidates[0][1]:.1f}% cloudy)\")\n", " print(f\" Path: {test_candidates[0][2]}\")\n", " print(\"\\n👉 Use this image in Step 2 (cloud_detection_step2_test_omnicloudmask.ipynb)\")\n", "else:\n", " print(\"\\n⚠️ No suitable cloudy images found in this period.\")\n", " print(\" Try extending the date range or select any available image.\")" ] }, { "cell_type": "markdown", "id": "ea103951", "metadata": {}, "source": [ "## 13. 
Export Summary" ] }, { "cell_type": "code", "execution_count": null, "id": "b5c78310", "metadata": {}, "outputs": [], "source": [ "# Save summary to JSON for Step 2\n", "summary = {\n", " \"project\": project,\n", " \"date_range\": f\"{start_date} to {end_date}\",\n", " \"total_dates\": len(slots),\n", " \"available_dates\": len(available_slots),\n", " \"cloud_statistics\": cloud_stats,\n", " \"test_candidates\": [\n", " {\"date\": date, \"cloud_percentage\": cloud_pct, \"path\": path}\n", " for date, cloud_pct, path in test_candidates[:5]\n", " ],\n", " \"merged_files\": merged_files\n", "}\n", "\n", "summary_path = BASE_PATH / 'cloud_detection_summary.json'\n", "with open(summary_path, 'w') as f:\n", " json.dump(summary, f, indent=2)\n", "\n", "print(f\"✓ Summary saved to: {summary_path}\")\n", "print(\"\\n\" + \"=\"*60)\n", "print(\"NEXT STEP: Open cloud_detection_step2_test_omnicloudmask.ipynb\")\n", "print(\"=\"*60)" ] }, { "cell_type": "markdown", "id": "f6f6d142", "metadata": {}, "source": [ "## 14. Cleanup (Optional)" ] }, { "cell_type": "code", "execution_count": null, "id": "88a775f8", "metadata": {}, "outputs": [], "source": [ "# Uncomment to delete intermediate files (single tiles and virtual rasters)\n", "# Keep merged GeoTIFFs for Step 2\n", "\n", "cleanup = False # Set to True to enable cleanup\n", "\n", "if cleanup:\n", " folders_to_clean = [BASE_PATH_SINGLE_IMAGES, folder_for_virtual_raster]\n", " \n", " for folder in folders_to_clean:\n", " if folder.exists():\n", " shutil.rmtree(folder)\n", " folder.mkdir()\n", " print(f\"✓ Cleaned: {folder}\")\n", " \n", " print(\"\\n✓ Cleanup complete - merged GeoTIFFs preserved\")\n", "else:\n", " print(\"Cleanup disabled. 
Set cleanup=True to remove intermediate files.\")" ] } ], "metadata": { "kernelspec": { "display_name": "base", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.3" } }, "nbformat": 4, "nbformat_minor": 5 }