diff --git a/python_app/31_harvest_imminent_weekly.py b/python_app/31_harvest_imminent_weekly.py index 09cf3de..8722eda 100644 --- a/python_app/31_harvest_imminent_weekly.py +++ b/python_app/31_harvest_imminent_weekly.py @@ -7,18 +7,19 @@ is approaching harvest. Use this for operational decision-making and real-time a RUN FREQUENCY: Weekly (or daily if required) INPUT: - - ci_data_for_python.csv (recent CI data from 02b_convert_rds_to_csv.R) + - harvest.xlsx (baseline from scripts 22+23 - contains last harvest date per field) + Location: laravel_app/storage/app/{project}/Data/harvest.xlsx + - ci_data_for_python.csv (complete CI data from R script) Location: laravel_app/storage/app/{project}/Data/extracted_ci/ci_data_for_python/ci_data_for_python.csv - - harvest_production_export.xlsx (baseline from script 01 - optional, for reference) OUTPUT: - - harvest_imminent_weekly.csv (weekly probabilities: field, imminent_prob, detected_prob, week, year) + - reports/kpis/field_stats/{project}_harvest_imminent_week_{WW}_{YYYY}.csv (weekly probabilities: field, imminent_prob, detected_prob, week, year) Workflow: -1. Load harvest_production_export.xlsx (baseline dates - optional, for context) -2. Load ci_data_for_python.csv (recent CI data) -3. For each field, extract last 300 days of history -4. Run Model 307 inference on full sequence (last timestep probabilities) -5. Export harvest_imminent_weekly.csv with probabilities +1. Load harvest.xlsx to find last harvest date (season_end) per field +2. Load ci_data_for_python.csv (complete CI data) +3. For each field, extract all CI data AFTER last harvest (complete current season) +4. Run Model 307 inference on full season sequence (last timestep probabilities) +5. Export week_WW_YYYY.csv with probabilities Output Columns: - field: Field ID @@ -61,33 +62,34 @@ from harvest_date_pred_utils import ( def load_harvest_dates(harvest_file): - """Load latest harvest end dates from Excel file (from harvest_production_export.xlsx).""" - print("[1/5] Loading harvest dates...") + """Load last harvest end dates from harvest.xlsx (output from scripts 22+23).""" + print("[1/5] Loading harvest data for season boundaries...") if not Path(harvest_file).exists(): print(f" ERROR: {harvest_file} not found") - print(" Using 180-day lookback as default") + print(f" harvest.xlsx is required to determine current season boundaries") return None try: harvest_df = pd.read_excel(harvest_file) - print(f" Loaded {len(harvest_df)} field-season records") + print(f" Loaded {len(harvest_df)} season records") - # Use season_end_date column (output from harvest prediction script) - harvest_df['season_end_date'] = pd.to_datetime(harvest_df['season_end_date']) + # season_end contains the last harvest date for each season + harvest_df['season_end'] = pd.to_datetime(harvest_df['season_end']) + harvest_df['field'] = harvest_df['field'].astype(str).str.strip() - # Group by field and get the latest season_end_date + # Group by field and get the LATEST season_end_date (most recent harvest) + # This marks the start of the current season harvest_dates = {} for field_id, group in harvest_df.groupby('field'): - latest_end = group['season_end_date'].max() - harvest_dates[str(field_id).strip()] = latest_end + latest_harvest = group['season_end'].max() + harvest_dates[field_id] = latest_harvest print(f" Successfully mapped {len(harvest_dates)} fields") - print(f" Harvest end dates range: {min(harvest_dates.values()).date()} to {max(harvest_dates.values()).date()}") + print(f" Last harvest dates range: {min(harvest_dates.values()).date()} to {max(harvest_dates.values()).date()}") return harvest_dates except Exception as e: - print(f" ERROR loading harvest file: {e}") - print(f" Using 180-day lookback instead") + print(f" ERROR loading harvest.xlsx: {e}") return None @@ -212,26 +214,37 @@ def main(): # Get project name from command line or use default project_name = sys.argv[1] if len(sys.argv) > 1 else "angata" - # Construct paths - base_storage = Path("../laravel_app/storage/app") / project_name / "Data" + # Construct paths - work from either python_app/ or root smartcane/ directory + # Try root first (laravel_app/...), then fall back to ../ (running from python_app/) + if Path("laravel_app/storage/app").exists(): + base_storage = Path("laravel_app/storage/app") / project_name / "Data" + else: + base_storage = Path("../laravel_app/storage/app") / project_name / "Data" + ci_data_dir = base_storage / "extracted_ci" / "ci_data_for_python" CI_DATA_FILE = ci_data_dir / "ci_data_for_python.csv" - harvest_data_dir = base_storage / "HarvestData" - BASELINE_FILE = harvest_data_dir / "harvest_production_export.xlsx" - OUTPUT_CSV = harvest_data_dir / "harvest_imminent_weekly.csv" - harvest_data_dir.mkdir(parents=True, exist_ok=True) # Create if doesn't exist + HARVEST_FILE = base_storage / "harvest.xlsx" # Output from scripts 22+23 + + # Determine week and year from current date for timestamped export + today = datetime.now() + week_num = int(today.strftime('%V')) + year_num = int(today.strftime('%Y')) + + # Output directory: reports/kpis/field_stats/ + reports_dir = base_storage.parent / "reports" / "kpis" / "field_stats" + reports_dir.mkdir(parents=True, exist_ok=True) + OUTPUT_CSV = reports_dir / f"{project_name}_harvest_imminent_week_{week_num:02d}_{year_num}.csv" print("="*80) print(f"HARVEST IMMINENT PROBABILITY - WEEKLY MONITORING ({project_name})") print("="*80) - # [1] Load harvest dates (optional - for projects with predictions) - harvest_dates = None - if BASELINE_FILE.exists(): - harvest_dates = load_harvest_dates(BASELINE_FILE) - else: - print("[1/5] Loading harvest dates...") - print(f" INFO: {BASELINE_FILE} not found (optional for weekly monitoring)") + # [1] Load harvest dates (required to determine season boundaries) + harvest_dates = load_harvest_dates(HARVEST_FILE) + if harvest_dates is None or len(harvest_dates) == 0: + print(f"ERROR: Cannot run without harvest.xlsx - required to determine current season boundaries") + print(f" Please run scripts 22 (baseline prediction) and 23 (format conversion) first") + return # [2] Load CI data print(f"\n[2/5] Loading CI data...") @@ -271,6 +284,9 @@ def main(): count = 0 for field_id in ci_data['field'].unique(): + # Convert field_id to string for consistency + field_id_str = str(field_id).strip() + # Get metadata meta = field_meta[field_meta['field'] == field_id] if len(meta) == 0: @@ -279,18 +295,21 @@ def main(): sub_field = meta['sub_field'].iloc[0] latest_date = meta['latest_date'].iloc[0] - # Use recent CI history (last 300 days from latest available data) + # Get last harvest date for this field (start of current season) + last_harvest = harvest_dates.get(field_id_str) + if last_harvest is None: + continue + + # Extract all CI data AFTER last harvest (complete current season) field_data = ci_data[ci_data['field'] == field_id].copy() field_data = field_data.sort_values('Date') + field_data = field_data[field_data['Date'] > last_harvest] # After last harvest - # Keep last 300 days of history for inference - if len(field_data) > 300: - field_data = field_data.iloc[-300:] - + # Need at least 30 days of data since planting if len(field_data) < 30: continue - # Run inference on recent history to predict next 28 days + # Run inference on full current season to predict next 28 days imminent_prob, detected_prob = run_inference_on_season( field_data, model, config, scalers, device, ci_column ) @@ -338,10 +357,11 @@ def main(): print(f" WARNING: No results exported - check CI data availability") print(f"\nStorage structure:") - print(f" Input CI: laravel_app/storage/app/{project_name}/Data/extracted_ci/ci_data_for_python/") - print(f" Input baseline: laravel_app/storage/app/{project_name}/Data/HarvestData/harvest_production_export.xlsx") - print(f" Output: laravel_app/storage/app/{project_name}/Data/HarvestData/") - print(f"\nReady to load into 09b field analysis report") + print(f" Input harvest: laravel_app/storage/app/{project_name}/Data/harvest.xlsx") + print(f" Input CI: laravel_app/storage/app/{project_name}/Data/extracted_ci/ci_data_for_python/") + print(f" Output: laravel_app/storage/app/{project_name}/reports/kpis/field_stats/") + print(f" Filename: {project_name}_harvest_imminent_week_{week_num:02d}_{year_num}.csv") + print(f"\nReady to load into 80_calculate_kpis.R") if __name__ == "__main__": diff --git a/python_app/batch_rgb_validation_top_fields.py b/python_app/batch_rgb_validation_top_fields.py new file mode 100644 index 0000000..cccb9ae --- /dev/null +++ b/python_app/batch_rgb_validation_top_fields.py @@ -0,0 +1,288 @@ +#!/usr/bin/env python +""" +Batch RGB Validation for Top 50 Largest Fields + +Generates 5x3 RGB temporal grids for the latest complete harvest season of the 50 largest fields. +Uses actual season_end dates from harvest.xlsx for visual validation of field conditions at harvest. + +Configuration: +- GeoJSON: pivot.geojson (defines field boundaries and sizes) +- Harvest data: harvest.xlsx (season_end dates for completed harvests) +- CI data: ci_data_for_python.csv +- Output: RGB directory with field_name_YYYYMMDD_harvest_rgb.png + +Usage: + python batch_rgb_validation_top_fields.py + +Output: + - Saves 5x3 RGB grids to: laravel_app/storage/app/angata/RGB/ + - Filenames: field___harvest_rgb.png + - Each grid shows 15 images at 7-day intervals around the season_end date +""" + +import json +import numpy as np +import pandas as pd +from pathlib import Path +from datetime import datetime, timedelta +import sys + +# Add parent directory to path for imports +sys.path.insert(0, str(Path(__file__).parent)) + +from rgb_visualization import generate_rgb_grids + + +def load_geojson_and_calculate_areas(geojson_path): + """ + Load GeoJSON and calculate area for each field. + + Returns: + pd.DataFrame: Columns [field, field_name, area_m2] sorted by area descending + """ + geojson_path = Path(geojson_path) + + if not geojson_path.exists(): + print(f"✗ GeoJSON not found: {geojson_path}") + return None + + print(f"Loading GeoJSON: {geojson_path}") + + with open(geojson_path) as f: + geojson_data = json.load(f) + + fields = [] + + for feature in geojson_data.get('features', []): + props = feature.get('properties', {}) + field_id = str(props.get('field', '')) + field_name = props.get('name', f"field_{field_id}") + + geometry = feature.get('geometry', {}) + geom_type = geometry.get('type', '') + coordinates = geometry.get('coordinates', []) + + # Simple area calculation using Shoelace formula + area_m2 = 0 + if geom_type == 'Polygon' and coordinates: + coords = coordinates[0] # Exterior ring + area_m2 = calculate_polygon_area(coords) + elif geom_type == 'MultiPolygon' and coordinates: + for poly_coords in coordinates: + area_m2 += calculate_polygon_area(poly_coords[0]) + + if area_m2 > 0: + fields.append({ + 'field': field_id, + 'field_name': field_name, + 'area_m2': area_m2, + 'area_hectares': area_m2 / 10000 + }) + + df = pd.DataFrame(fields) + df = df.sort_values('area_m2', ascending=False) + + print(f" ✓ Loaded {len(df)} fields") + print(f" Top 10 largest fields (hectares):") + for i, row in df.head(10).iterrows(): + print(f" {row['field_name']:30s} ({row['field']:>6s}): {row['area_hectares']:>8.2f} ha") + + return df + + +def calculate_polygon_area(coords): + """ + Calculate area of polygon using Shoelace formula (in m²). + Assumes coordinates are in lat/lon (roughly converts to meters). + """ + if len(coords) < 3: + return 0 + + # Rough conversion: at equator, 1 degree ≈ 111 km + # For lat/lon coordinates, use average latitude + lats = [c[1] for c in coords] + avg_lat = np.mean(lats) + lat_m_per_deg = 111000 + lon_m_per_deg = 111000 * np.cos(np.radians(avg_lat)) + + # Convert to meters + coords_m = [] + for lon, lat in coords: + x = (lon - coords[0][0]) * lon_m_per_deg + y = (lat - coords[0][1]) * lat_m_per_deg + coords_m.append((x, y)) + + # Shoelace formula + area = 0 + for i in range(len(coords_m)): + j = (i + 1) % len(coords_m) + area += coords_m[i][0] * coords_m[j][1] + area -= coords_m[j][0] * coords_m[i][1] + + return abs(area) / 2 + + +def load_harvest_dates_from_xlsx(harvest_xlsx_path, top_50_fields_df): + """ + Load harvest data from Excel file and get latest completed season for each field. + + Returns season_end date for each field (latest complete season where season_end is not null). + + Args: + harvest_xlsx_path (Path): Path to harvest.xlsx + top_50_fields_df (pd.DataFrame): DataFrame with 'field' column for filtering + + Returns: + dict: {field_id: {'field_name': str, 'harvest_date': pd.Timestamp}} + """ + harvest_xlsx_path = Path(harvest_xlsx_path) + + if not harvest_xlsx_path.exists(): + print(f"✗ Harvest Excel file not found: {harvest_xlsx_path}") + return {} + + print(f"Loading harvest data: {harvest_xlsx_path}") + + try: + harvest_df = pd.read_excel(harvest_xlsx_path) + + # Ensure date columns are datetime + if 'season_end' in harvest_df.columns: + harvest_df['season_end'] = pd.to_datetime(harvest_df['season_end'], errors='coerce') + + # Filter to top 50 fields and get only rows with season_end filled in + top_50_field_ids = set(top_50_fields_df['field'].astype(str).str.strip()) + harvest_df['field'] = harvest_df['field'].astype(str).str.strip() + harvest_df = harvest_df[harvest_df['field'].isin(top_50_field_ids)] + harvest_df = harvest_df[harvest_df['season_end'].notna()] + + # Group by field and get the LATEST (most recent) season_end + latest_harvests = {} + + for field_id in top_50_field_ids: + field_records = harvest_df[harvest_df['field'] == field_id] + + if len(field_records) > 0: + # Get row with latest season_end + latest_idx = field_records['season_end'].idxmax() + latest_row = field_records.loc[latest_idx] + + # Get field name from top_50_fields_df + field_info = top_50_fields_df[top_50_fields_df['field'] == field_id] + if len(field_info) > 0: + field_name = field_info.iloc[0]['field_name'] + else: + field_name = f"field_{field_id}" + + latest_harvests[field_id] = { + 'field_name': field_name, + 'harvest_date': latest_row['season_end'] + } + + print(f" ✓ Loaded latest complete seasons for {len(latest_harvests)} fields") + + return latest_harvests + + except Exception as e: + print(f"✗ Error loading harvest data: {e}") + return {} + + +def main(): + print("="*90) + print("BATCH RGB VALIDATION - TOP 50 LARGEST FIELDS") + print("Visual inspection of latest harvest dates from harvest.xlsx using RGB imagery") + print("="*90) + + # Configuration + geojson_path = Path("laravel_app/storage/app/angata/Data/pivot.geojson") + harvest_xlsx = Path("laravel_app/storage/app/angata/Data/harvest.xlsx") + output_dir = Path("laravel_app/storage/app/angata/RGB") + tiff_dir = Path("laravel_app/storage/app/angata/merged_final_tif/5x5") + + # Verify paths + if not geojson_path.exists(): + print(f"✗ GeoJSON not found: {geojson_path}") + return + if not harvest_xlsx.exists(): + print(f"✗ Harvest Excel not found: {harvest_xlsx}") + return + if not tiff_dir.exists(): + print(f"✗ TIFF directory not found: {tiff_dir}") + return + + output_dir.mkdir(parents=True, exist_ok=True) + + # Step 1: Load GeoJSON and get top 50 largest fields + print("\n[1/4] Loading GeoJSON and identifying top 50 largest fields...") + fields_df = load_geojson_and_calculate_areas(geojson_path) + if fields_df is None: + return + + top_50_fields = fields_df.head(50) + print(f" ✓ Selected {len(top_50_fields)} largest fields for processing") + + # Step 2: Load harvest dates from Excel + print("\n[2/4] Loading harvest dates from Excel (latest complete seasons)...") + harvest_dates = load_harvest_dates_from_xlsx(harvest_xlsx, top_50_fields) + + if len(harvest_dates) == 0: + print("✗ No harvest dates found in Excel file") + return + + print(f" ✓ Found {len(harvest_dates)} fields with completed seasons") + for field_id, info in list(harvest_dates.items())[:5]: + print(f" - {info['field_name']:30s}: {info['harvest_date'].strftime('%Y-%m-%d')}") + if len(harvest_dates) > 5: + print(f" ... and {len(harvest_dates) - 5} more") + + # Step 3: Generate RGB grids for each field + print("\n[3/4] Generating RGB validation grids...") + rgb_count = 0 + + for idx, (field_id, harvest_info) in enumerate(harvest_dates.items(), 1): + field_name = harvest_info['field_name'] + harvest_date = harvest_info['harvest_date'] + + try: + # Run RGB visualization (harvest dates only, no registered/predicted distinction) + results = generate_rgb_grids( + field_data=None, # Not needed - just for function compatibility + field_id=field_id, + registered_harvest_dates=[], # Empty - using harvest.xlsx instead + predicted_harvest_dates=[ + { + 'harvest_date': harvest_date, + 'model_name': 'harvest_xlsx' + } + ], + output_dir=str(output_dir), # All PNGs in same folder + tiff_dir=str(tiff_dir), + geojson_path=str(geojson_path) + ) + + if results['predicted']: + rgb_count += 1 + print(f" [{idx:2d}/{len(harvest_dates)}] {field_name}: ✓ {harvest_date.strftime('%Y-%m-%d')}") + else: + print(f" [{idx:2d}/{len(harvest_dates)}] {field_name}: ⚠ No RGB grid (no imagery available)") + + except Exception as e: + print(f" [{idx:2d}/{len(harvest_dates)}] {field_name}: ✗ Error - {e}") + + # Summary + print("\n" + "="*90) + print(f"SUMMARY:") + print(f" Fields with harvest dates: {len(harvest_dates)}") + print(f" RGB grids generated: {rgb_count}/{len(harvest_dates)}") + print(f" Output directory: {output_dir}") + print("="*90) + print("\nVisual inspection checklist:") + print(" ✓ Brown/bare soil at T~0d (harvest date) = Field properly harvested") + print(" ⚠ Green vegetation at T~0d = Possible data error or replanting") + print(" ✓ Green → Brown progression = Normal harvest sequence") + print("="*90) + + +if __name__ == "__main__": + main() diff --git a/python_app/debug_all_tiles_for_date.py b/python_app/debug_all_tiles_for_date.py new file mode 100644 index 0000000..ba07fe7 --- /dev/null +++ b/python_app/debug_all_tiles_for_date.py @@ -0,0 +1,107 @@ +#!/usr/bin/env python +""" +Debug script to find all tiles for a date and check which overlap with field boundary +""" + +import json +import rasterio +from rasterio.mask import mask +from pathlib import Path +import numpy as np +import shapely.geometry as shgeom +import pandas as pd + +# Load field 79 boundary +geojson_path = Path("laravel_app/storage/app/angata/Data/pivot.geojson") +field_id = "79" + +print(f"Loading field {field_id} from GeoJSON...") +with open(geojson_path) as f: + geojson_data = json.load(f) + +field_boundary = None +for feature in geojson_data.get('features', []): + props = feature.get('properties', {}) + if str(props.get('field', '')) == str(field_id): + geometry = feature.get('geometry') + if geometry: + geom_type = geometry.get('type', '') + coordinates = geometry.get('coordinates', []) + + if geom_type == 'MultiPolygon': + if coordinates and len(coordinates) > 0: + coords = coordinates[0][0] + field_boundary = shgeom.Polygon(coords) + elif geom_type == 'Polygon': + if coordinates and len(coordinates) > 0: + coords = coordinates[0] + field_boundary = shgeom.Polygon(coords) + break + +if field_boundary is None: + print(f"Field {field_id} not found") + exit(1) + +print(f"Field boundary bounds: {field_boundary.bounds}") +print(f"Field boundary area: {field_boundary.area}") + +# Find a specific date directory +tiff_dir = Path("laravel_app/storage/app/angata/merged_final_tif/5x5") +target_date = pd.Timestamp("2026-01-15") # Use a recent date that exists + +# Find tiles for that date +date_dirs = [] +for date_dir in tiff_dir.iterdir(): + if date_dir.is_dir(): + try: + dir_name = date_dir.name + date_str = dir_name.split('_')[0] + tile_date = pd.Timestamp(date_str) + if tile_date == target_date: + date_dirs.append(date_dir) + except: + pass + +if not date_dirs: + print(f"No tiles found for {target_date}") + exit(1) + +print(f"\nFound {len(date_dirs)} date directory(ies) for {target_date}") + +for date_dir in date_dirs: + print(f"\n=== Checking date directory: {date_dir.name} ===") + + tiles = list(date_dir.glob("*.tif")) + print(f"Found {len(tiles)} tiles in this directory") + + for tile_path in sorted(tiles): + try: + with rasterio.open(tile_path) as src: + tile_bounds = src.bounds + tile_geom = shgeom.box(*tile_bounds) + + intersects = field_boundary.intersects(tile_geom) + intersection = field_boundary.intersection(tile_geom) if intersects else None + intersection_area = intersection.area if intersection else 0 + + print(f"\n{tile_path.name}") + print(f" Tile bounds: {tile_bounds}") + print(f" Intersects field: {intersects}") + if intersects: + print(f" Intersection area: {intersection_area:.8f}") + + # Try to mask this tile + geom = shgeom.mapping(field_boundary) + try: + masked_data, _ = mask(src, [geom], crop=True, indexes=[1, 2, 3]) + print(f" ✓ Successfully masked! Shape: {masked_data.shape}") + + # Check the data in each band + for i, band_idx in enumerate([1, 2, 3]): + band_data = masked_data[i] + non_zero = (band_data != 0).sum() + print(f" Band {band_idx}: {non_zero} non-zero pixels out of {band_data.size}") + except ValueError as e: + print(f" ✗ Masking failed: {e}") + except Exception as e: + print(f" Error reading tile: {e}") diff --git a/python_app/debug_field_mask.py b/python_app/debug_field_mask.py new file mode 100644 index 0000000..ce96700 --- /dev/null +++ b/python_app/debug_field_mask.py @@ -0,0 +1,102 @@ +#!/usr/bin/env python +""" +Debug script to diagnose why field boundary masking produces no data +""" + +import json +import rasterio +from rasterio.mask import mask +from pathlib import Path +import numpy as np +import shapely.geometry as shgeom + +# Load a sample field boundary +geojson_path = Path("laravel_app/storage/app/angata/Data/pivot.geojson") +field_id = "79" # A field that had issues + +print(f"Loading field {field_id} from GeoJSON...") +with open(geojson_path) as f: + geojson_data = json.load(f) + +field_boundary = None +for feature in geojson_data.get('features', []): + props = feature.get('properties', {}) + if str(props.get('field', '')) == str(field_id): + geometry = feature.get('geometry') + if geometry: + geom_type = geometry.get('type', '') + coordinates = geometry.get('coordinates', []) + + if geom_type == 'MultiPolygon': + if coordinates and len(coordinates) > 0: + coords = coordinates[0][0] + field_boundary = shgeom.Polygon(coords) + elif geom_type == 'Polygon': + if coordinates and len(coordinates) > 0: + coords = coordinates[0] + field_boundary = shgeom.Polygon(coords) + break + +if field_boundary is None: + print(f"Field {field_id} not found") + exit(1) + +print(f"Field boundary bounds: {field_boundary.bounds}") +print(f"Field boundary area: {field_boundary.area}") + +# Load a sample TIFF tile +tiff_dir = Path("laravel_app/storage/app/angata/merged_final_tif/5x5") +tile_file = None +for date_dir in sorted(tiff_dir.iterdir()): + if date_dir.is_dir(): + for tif in date_dir.glob("*.tif"): + if tif.stat().st_size > 12e6: + tile_file = tif + break + if tile_file: + break + +if not tile_file: + print("No suitable TIFF found") + exit(1) + +print(f"\nTesting with TIFF: {tile_file.name}") + +with rasterio.open(tile_file) as src: + print(f"TIFF Bounds: {src.bounds}") + print(f"TIFF CRS: {src.crs}") + + # Check if field boundary is within tile bounds + tile_box = shgeom.box(*src.bounds) + intersects = field_boundary.intersects(tile_box) + print(f"Field boundary intersects tile: {intersects}") + + if intersects: + intersection = field_boundary.intersection(tile_box) + print(f"Intersection area: {intersection.area}") + print(f"Intersection bounds: {intersection.bounds}") + + # Try to mask and see what we get + print("\nAttempting to mask...") + geom = shgeom.mapping(field_boundary) + try: + masked_data, _ = mask(src, [geom], crop=True, indexes=[1, 2, 3]) + print(f"Masked data shape: {masked_data.shape}") + print(f"Masked data dtype: {masked_data.dtype}") + + # Check the data + for i, band_idx in enumerate([1, 2, 3]): + band_data = masked_data[i] + print(f"\nBand {band_idx}:") + print(f" min: {np.nanmin(band_data):.6f}") + print(f" max: {np.nanmax(band_data):.6f}") + print(f" mean: {np.nanmean(band_data):.6f}") + print(f" % valid (non-zero): {(band_data != 0).sum() / band_data.size * 100:.2f}%") + print(f" % NaN: {np.isnan(band_data).sum() / band_data.size * 100:.2f}%") + + # Show sample values + valid_pixels = band_data[band_data != 0] + if len(valid_pixels) > 0: + print(f" Sample valid values: {valid_pixels[:10]}") + except ValueError as e: + print(f"Error during masking: {e}") diff --git a/python_app/debug_tiff_inspect.py b/python_app/debug_tiff_inspect.py new file mode 100644 index 0000000..be51e5e --- /dev/null +++ b/python_app/debug_tiff_inspect.py @@ -0,0 +1,47 @@ +#!/usr/bin/env python +""" +Debug script to inspect TIFF file structure and data +""" + +import rasterio +from pathlib import Path +import numpy as np + +# Pick a tile file to inspect +tiff_dir = Path("laravel_app/storage/app/angata/merged_final_tif/5x5") + +# Find first available tile +tile_file = None +for date_dir in sorted(tiff_dir.iterdir()): + if date_dir.is_dir(): + for tif in date_dir.glob("*.tif"): + if tif.stat().st_size > 12e6: # Skip empty files + tile_file = tif + break + if tile_file: + break + +if not tile_file: + print("No suitable TIFF files found") + exit(1) + +print(f"Inspecting: {tile_file.name}") +print("=" * 80) + +with rasterio.open(tile_file) as src: + print(f"Band count: {src.count}") + print(f"Data type: {src.dtypes[0]}") + print(f"Shape: {src.height} x {src.width}") + print(f"CRS: {src.crs}") + print(f"Bounds: {src.bounds}") + print() + + # Read each band + for band_idx in range(1, min(6, src.count + 1)): + data = src.read(band_idx) + print(f"Band {band_idx}:") + print(f" dtype: {data.dtype}") + print(f" range: {data.min():.6f} - {data.max():.6f}") + print(f" mean: {data.mean():.6f}") + print(f" % valid (non-zero): {(data != 0).sum() / data.size * 100:.1f}%") + print() diff --git a/python_app/harvest_detection_experiments/experiment_framework/04_production_export/batch_plot_fields_rgb.py b/python_app/harvest_detection_experiments/experiment_framework/04_production_export/batch_plot_fields_rgb.py new file mode 100644 index 0000000..412c40b --- /dev/null +++ b/python_app/harvest_detection_experiments/experiment_framework/04_production_export/batch_plot_fields_rgb.py @@ -0,0 +1,299 @@ +""" +Batch Field Visualization Tool - RGB Imagery Around Harvest Date +Purpose: Generate visual validation using RGB satellite imagery samples around +predicted harvest date to verify predictions (bare soil = harvested, green = not harvested) + +Shows 12-15 RGB images in a grid, centered around the predicted harvest date + +Usage: + python batch_plot_fields_rgb.py field1,field2,field3 + python batch_plot_fields_rgb.py 10125,88,97 + + Or read from CSV: + python batch_plot_fields_rgb.py --file fields_to_check.csv +""" + +import pandas as pd +import numpy as np +import torch +import matplotlib.pyplot as plt +import matplotlib.dates as mdates +from pathlib import Path +from datetime import datetime, timedelta +import sys +import rasterio +from rasterio.mask import mask +import geopandas as gpd +from harvest_date_pred_utils import load_model_and_config, extract_features + + +def get_field_centroid(field_id, geojson_path="pivot.geojson"): + """Get centroid of field from GeoJSON for cropping RGB images.""" + try: + gdf = gpd.read_file(geojson_path) + field_geom = gdf[gdf['field'] == str(field_id)] + if len(field_geom) > 0: + centroid = field_geom.geometry.iloc[0].centroid + return (centroid.x, centroid.y) + except Exception as e: + print(f" Warning: Could not get field centroid - {e}") + return None + + +def load_rgb_image(tif_path, field_id=None, geojson_path="pivot.geojson"): + """ + Load RGB bands from 8-band GeoTIFF + Bands: 0=coastal, 1=blue, 2=green, 3=green_i, 4=yellow, 5=red, 6=rededge, 7=nir + RGB = bands 5,3,1 (Red, Green, Blue) + """ + try: + with rasterio.open(tif_path) as src: + # Read RGB bands (bands are 1-indexed in rasterio) + red = src.read(6) # Band 6 = red (0-indexed band 5) + green = src.read(3) # Band 3 = green (0-indexed band 2) + blue = src.read(2) # Band 2 = blue (0-indexed band 1) + + # Stack into RGB image + rgb = np.stack([red, green, blue], axis=2) + + # Normalize to 0-1 range (8-band data is typically 0-10000) + rgb = np.clip(rgb / 5000.0, 0, 1) + + return rgb + except Exception as e: + print(f" Error loading RGB from {tif_path}: {e}") + return None + + +def plot_field_rgb_validation(field_id, ci_data, model, config, scalers, device, + tif_folder="../../../laravel_app/storage/app/angata/merged_tif_8b", + output_dir="validation_plots_rgb"): + """ + Create validation plot for a single field: + - Top: Harvest probability over time with peak marked + - Bottom: 12-15 RGB images in grid around predicted harvest date + """ + # Create output directory + Path(output_dir).mkdir(parents=True, exist_ok=True) + + # Filter field data + field_data = ci_data[ci_data['field'] == field_id].copy() + if len(field_data) == 0: + print(f" ✗ Field {field_id}: No CI data found") + return False + + field_data = field_data.sort_values('Date') + print(f" ✓ Field {field_id}: {len(field_data)} days of data") + + try: + # Extract features and run inference + ci_column = config['data']['ci_column'] + feature_names = config['features'] + + feat_array = extract_features(field_data, feature_names, ci_column=ci_column) + if feat_array is None: + print(f" ✗ Field {field_id}: Feature extraction failed") + return False + + # Apply scalers + if isinstance(scalers, dict) and 'features' in scalers: + feat_array = scalers['features'].transform(feat_array) + + # Run inference + with torch.no_grad(): + x_tensor = torch.tensor(feat_array, dtype=torch.float32).unsqueeze(0).to(device) + out_imm, out_det = model(x_tensor) + imm_probs = out_imm.squeeze(0).cpu().numpy() + + # Find peak probability (predicted harvest date) + peak_idx = np.argmax(imm_probs) + peak_date = field_data['Date'].iloc[peak_idx] + peak_prob = imm_probs[peak_idx] + + print(f" Peak probability: {peak_prob:.3f} on {peak_date.strftime('%Y-%m-%d')}") + + # Get date range: ±6 days around peak (12-13 images total) + date_range = field_data['Date'].dt.date + peak_date_only = peak_date.date() if hasattr(peak_date, 'date') else peak_date + + days_before = 6 + days_after = 6 + start_date = peak_date_only - timedelta(days=days_before) + end_date = peak_date_only + timedelta(days=days_after) + + # Find available TIF files in date range + tif_folder_path = Path(tif_folder) + available_dates = [] + for tif_file in sorted(tif_folder_path.glob("*.tif")): + date_str = tif_file.stem # YYYY-MM-DD + try: + tif_date = datetime.strptime(date_str, "%Y-%m-%d").date() + if start_date <= tif_date <= end_date: + available_dates.append((tif_date, tif_file)) + except ValueError: + pass + + if len(available_dates) == 0: + print(f" Warning: No TIF files found in {start_date} to {end_date}") + return False + + print(f" Found {len(available_dates)} RGB images in date range") + + # Load RGB images + rgb_images = [] + rgb_dates = [] + for tif_date, tif_file in available_dates: + rgb = load_rgb_image(str(tif_file), field_id) + if rgb is not None: + rgb_images.append(rgb) + rgb_dates.append(tif_date) + + if len(rgb_images) == 0: + print(f" ✗ No RGB images loaded") + return False + + print(f" Loaded {len(rgb_images)} RGB images") + + # Create figure with probability plot + RGB grid + n_images = len(rgb_images) + n_cols = min(5, n_images) # Max 5 columns + n_rows = (n_images + n_cols - 1) // n_cols # Calculate rows needed + + fig = plt.figure(figsize=(18, 12)) + + # Probability plot (top, spanning all columns) + ax_prob = plt.subplot(n_rows + 1, n_cols, (1, n_cols)) + dates_arr = field_data['Date'].values + ax_prob.plot(dates_arr, imm_probs, '-', color='orange', linewidth=2.5, label='Imminent Probability', alpha=0.8) + ax_prob.axhline(y=0.5, color='red', linestyle='--', linewidth=1.5, alpha=0.5, label='Threshold (0.5)') + ax_prob.axvline(x=peak_date, color='darkred', linestyle=':', linewidth=2, alpha=0.7, label='Peak') + ax_prob.fill_between(dates_arr, 0.5, 1.0, alpha=0.08, color='red') + ax_prob.set_ylim(-0.05, 1.05) + ax_prob.set_ylabel('Probability', fontsize=11, fontweight='bold') + ax_prob.set_xlabel('Date', fontsize=11, fontweight='bold') + ax_prob.set_title(f'Field {field_id} - Model 307 Harvest Probability', fontsize=12, fontweight='bold') + ax_prob.grid(True, alpha=0.3) + ax_prob.legend(loc='upper right', fontsize=9) + ax_prob.xaxis.set_major_locator(mdates.MonthLocator(interval=1)) + ax_prob.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m-%d')) + plt.setp(ax_prob.xaxis.get_majorticklabels(), rotation=45, ha='right') + + # Annotate peak + ax_prob.annotate(f'{peak_prob:.2f}\n{peak_date_only}', + xy=(peak_date, peak_prob), + xytext=(20, 20), textcoords='offset points', + bbox=dict(boxstyle='round,pad=0.5', fc='yellow', alpha=0.8), + arrowprops=dict(arrowstyle='->', lw=1.5, color='darkred')) + + # RGB images in grid (below probability plot) + for i, (rgb, date) in enumerate(zip(rgb_images, rgb_dates)): + ax = plt.subplot(n_rows + 1, n_cols, n_cols + i + 1) + ax.imshow(rgb, extent=[0, 100, 0, 100]) + + # Highlight peak date + date_label = date.strftime('%m-%d') + is_peak = date == peak_date_only + color = 'darkred' if is_peak else 'black' + weight = 'bold' if is_peak else 'normal' + size = 11 if is_peak else 9 + + ax.set_title(date_label, fontsize=size, fontweight=weight, color=color) + ax.set_xticks([]) + ax.set_yticks([]) + + plt.suptitle(f'Field {field_id} RGB Imagery: {len(rgb_images)} Days Around Peak Harvest Probability\nPeak: {peak_prob:.2f} on {peak_date_only} | Green = Growing | Brown/Bare = Harvested', + fontsize=13, fontweight='bold', y=0.995) + plt.tight_layout() + + # Save + output_file = Path(output_dir) / f"field_{field_id}_rgb_validation.png" + plt.savefig(output_file, dpi=100, bbox_inches='tight') + print(f" ✓ Saved: {output_file}") + plt.close() + + return True + + except Exception as e: + print(f" ✗ Field {field_id}: Error - {e}") + import traceback + traceback.print_exc() + return False + + +def main(): + print("="*80) + print("BATCH RGB VISUALIZATION TOOL") + print("Visual check: RGB imagery around predicted harvest date") + print("="*80) + + # Parse arguments + fields_to_plot = [] + + if len(sys.argv) < 2: + print("\nUsage:") + print(" python batch_plot_fields_rgb.py field1,field2,field3") + print(" python batch_plot_fields_rgb.py --file fields.csv") + print("\nExample:") + print(" python batch_plot_fields_rgb.py 10125,88,97,440") + return + + if sys.argv[1] == "--file": + if len(sys.argv) < 3: + print("ERROR: --file requires a CSV filename") + return + csv_file = sys.argv[2] + print(f"\n[1/4] Loading fields from CSV: {csv_file}") + try: + df = pd.read_csv(csv_file) + fields_to_plot = df['field'].astype(str).str.strip().tolist() + print(f" ✓ Loaded {len(fields_to_plot)} fields") + except Exception as e: + print(f" ✗ Error reading CSV: {e}") + return + else: + # Parse comma-separated list + fields_to_plot = [f.strip() for f in sys.argv[1].split(',')] + print(f"\n[1/4] Processing {len(fields_to_plot)} field(s): {', '.join(fields_to_plot)}") + + # Load CI data + print("\n[2/4] Loading CI data...") + try: + ci_data = pd.read_csv("ci_data_for_python.csv") + ci_data['Date'] = pd.to_datetime(ci_data['Date']) + ci_data['field'] = ci_data['field'].astype(str).str.strip() + print(f" ✓ Loaded {len(ci_data)} observations for {ci_data['field'].nunique()} fields") + except Exception as e: + print(f" ✗ Error loading CI data: {e}") + return + + # Load model + print("\n[3/4] Loading model...") + try: + model, config, scalers = load_model_and_config(Path(".")) + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + model.eval() + print(f" ✓ Model loaded on {device}") + except Exception as e: + print(f" ✗ Error loading model: {e}") + return + + # Process each field + print("\n[4/4] Generating RGB validation plots...") + success_count = 0 + for field_id in fields_to_plot: + if plot_field_rgb_validation(field_id, ci_data, model, config, scalers, device): + success_count += 1 + + # Summary + print("\n" + "="*80) + print(f"SUMMARY: {success_count}/{len(fields_to_plot)} fields processed successfully") + print(f"Output directory: validation_plots_rgb/") + print("="*80) + print("\nInspect the PNG files to verify predictions:") + print(" ✓ Green imagery BEFORE peak date (field growing)") + print(" ✓ Brown/Bare imagery AT/AFTER peak date (harvested)") + print(" ✓ Peak date marked with red title") + + +if __name__ == "__main__": + main() diff --git a/python_app/harvest_detection_experiments/experiment_framework/experiments/rgb_visualization.py b/python_app/rgb_visualization.py similarity index 68% rename from python_app/harvest_detection_experiments/experiment_framework/experiments/rgb_visualization.py rename to python_app/rgb_visualization.py index dff8e34..c1fd2e0 100644 --- a/python_app/harvest_detection_experiments/experiment_framework/experiments/rgb_visualization.py +++ b/python_app/rgb_visualization.py @@ -22,6 +22,8 @@ import numpy as np import pandas as pd from pathlib import Path from datetime import datetime, timedelta +import matplotlib +matplotlib.use('Agg') # Use non-interactive backend to avoid display hangs import matplotlib.pyplot as plt import matplotlib.patches as patches from matplotlib.colors import Normalize @@ -86,55 +88,95 @@ def load_field_boundaries(geojson_path, field_id): return None, None -def find_closest_tiff(target_date, tiff_dir, days_window=60, field_boundary=None): +def find_overlapping_tiles(target_date, tiff_dir, field_boundary, days_window=60): """ - Find available TIFF file closest to target date. - Skips obviously empty files (< 12 MB) without reading data. + Find all tile files for target_date (or closest date) that overlap with field_boundary. - TIFF files are named: YYYY-MM-DD.tif + Tile files are organized in subdirectories by date: 5x5/YYYY-MM-DD_HH/*.tif Args: - target_date (pd.Timestamp): Target date to find TIFF near - tiff_dir (Path): Directory containing TIFF files + target_date (pd.Timestamp): Target date to find tiles near + tiff_dir (Path): Directory containing 5x5 date subdirectories + field_boundary (shapely.Polygon): Field boundary for overlap detection days_window (int): Max days to search before/after target - field_boundary (shapely.Polygon): Unused, kept for compatibility Returns: - Path: Path to closest TIFF file or None if not found - int: Days difference from target (negative=before, positive=after) + tuple: (list of tile paths, actual_date, days_diff) + list: tile paths that overlap field + pd.Timestamp: actual date of tiles found + int: days difference from target to actual date found """ target_date = pd.Timestamp(target_date) tiff_dir = Path(tiff_dir) - available_tiffs = list(tiff_dir.glob('*.tif')) - if not available_tiffs: - return None, None + if not tiff_dir.exists(): + return [], None, None - # Parse dates from filenames, skip obviously empty files by size - tiff_dates = [] - min_size_mb = 12.0 # Empty files are ~11.56 MB, valid files are ~12.6-13.0 MB + # Find all date subdirectories + available_dates = {} # {date: ([tile file paths], actual_dir_name)} + min_size_mb = 12.0 # Empty files are ~11.56 MB - for tiff_path in available_tiffs: + for date_dir in tiff_dir.iterdir(): + if not date_dir.is_dir(): + continue + try: - # Quick size check to skip obviously empty files - file_size_mb = tiff_path.stat().st_size / (1024 * 1024) - if file_size_mb < min_size_mb: + # Parse date from directory name (YYYY-MM-DD or YYYY-MM-DD_HH) + dir_name = date_dir.name + # Extract just the date part before underscore if it exists + date_str = dir_name.split('_')[0] + tile_date = pd.Timestamp(date_str) + days_diff = (tile_date - target_date).days + + if abs(days_diff) > days_window: continue - date_str = tiff_path.stem # Remove .tif extension - tiff_date = pd.Timestamp(date_str) - days_diff = (tiff_date - target_date).days - if abs(days_diff) <= days_window: - tiff_dates.append((tiff_path, tiff_date, days_diff)) + # Find all .tif files in this date directory + tile_files = [] + for tile_file in date_dir.glob('*.tif'): + # Skip obviously empty files + file_size_mb = tile_file.stat().st_size / (1024 * 1024) + if file_size_mb >= min_size_mb: + tile_files.append(tile_file) + + if tile_files: + available_dates[tile_date] = (tile_files, dir_name) except: pass - if not tiff_dates: - return None, None + if not available_dates: + return [], None, None - # Return closest by distance - closest = min(tiff_dates, key=lambda x: abs(x[2])) - return closest[0], closest[2] + # Find closest date + closest_date = min(available_dates.keys(), key=lambda d: abs((d - target_date).days)) + days_diff = (closest_date - target_date).days + tiles, _ = available_dates[closest_date] + + # Filter tiles to only those that overlap field boundary + if rasterio is None or field_boundary is None: + # If rasterio not available, use all tiles (conservative approach) + return tiles, closest_date, days_diff + + overlapping_tiles = [] + + for tile_path in tiles: + try: + with rasterio.open(tile_path) as src: + # Get tile bounds + tile_bounds = src.bounds # (left, bottom, right, top) + tile_geom = shgeom.box(*tile_bounds) + + # Check if tile overlaps field + if tile_geom.intersects(field_boundary): + overlapping_tiles.append(tile_path) + except: + pass + + if not overlapping_tiles: + # No overlapping tiles found, return all tiles for the closest date + return tiles, closest_date, days_diff + + return overlapping_tiles, closest_date, days_diff def load_and_clip_tiff_rgb(tiff_path, field_boundary, rgb_bands=(1, 2, 3)): @@ -148,12 +190,6 @@ def load_and_clip_tiff_rgb(tiff_path, field_boundary, rgb_bands=(1, 2, 3)): - Band 4: NIR - Band 5: CI - For older merged_tif_8b files (raw Planet Scope 8-band): - - Bands 1=Coastal Blue, 2=Blue, 3=Green, 4=Red, 5=Red Edge, 6=NIR, 7=SWIR1, 8=SWIR2 - - RGB would be bands 4,3,2 = Red, Green, Blue - - This function auto-detects the format based on band count and descriptions. - Args: tiff_path (Path): Path to TIFF file field_boundary (shapely.Polygon): Field boundary for clipping @@ -168,58 +204,106 @@ def load_and_clip_tiff_rgb(tiff_path, field_boundary, rgb_bands=(1, 2, 3)): try: with rasterio.open(tiff_path) as src: - # Check CRS compatibility + # Check band count if src.count < 3: - print(f" ⚠ TIFF has only {src.count} bands (need at least 3 for RGB)") return None - # Auto-detect format based on band count and descriptions - if src.count == 5 and hasattr(src, 'descriptions') and src.descriptions: - # merged_final_tif format: Red, Green, Blue, NIR, CI - bands_to_read = (1, 2, 3) - elif src.count == 9: - # merged_tif_8b format: use bands 4, 3, 2 for Red, Green, Blue - bands_to_read = (4, 3, 2) - else: - # Default: try to use first 3 bands or specified bands - bands_to_read = rgb_bands - print(f" ℹ Unknown TIFF format ({src.count} bands), using bands {bands_to_read}") + # For merged_final_tif: bands 1,2,3 are R,G,B + bands_to_read = (1, 2, 3) - # Mask and read bands (crop=True reads only the clipped window, not full resolution) + # Mask and read bands geom = shgeom.mapping(field_boundary) try: - # Read RGB bands at once, then mask (more efficient than masking 3 times) masked_data, _ = mask(src, [geom], crop=True, indexes=list(bands_to_read)) - # masked_data shape is (3, height, width) - one layer per band - rgb_data = [] - for i, band_idx in enumerate(bands_to_read): - band_data = masked_data[i] # Extract the i-th band from masked stack - - # Better debug output that handles NaN values - valid_data = band_data[~np.isnan(band_data)] - if len(valid_data) > 0: - print(f" DEBUG: Band {band_idx} valid data range: {valid_data.min():.4f} - {valid_data.max():.4f} ({len(valid_data)} valid pixels)") - else: - print(f" DEBUG: Band {band_idx} - all NaN!") - rgb_data.append(band_data) - # Stack RGB - rgb = np.stack(rgb_data, axis=-1) + rgb = np.stack([masked_data[i] for i in range(3)], axis=-1) - # Data is already normalized 0-1 from the merged_final_tif files - # Just ensure it's float32 and clipped + # Convert to float32 if not already rgb = rgb.astype(np.float32) + + # Normalize to 0-1 range + # Data appears to be 8-bit (0-255 range) stored as float32 + # Check actual max value to determine normalization + max_val = np.nanmax(rgb) + if max_val > 0: + # If max is around 255 or less, assume 8-bit + if max_val <= 255: + rgb = rgb / 255.0 + # If max is around 65535, assume 16-bit + elif max_val <= 65535: + rgb = rgb / 65535.0 + # Otherwise divide by max to normalize + else: + rgb = rgb / max_val + rgb = np.clip(rgb, 0, 1) - print(f" DEBUG: Final RGB range: {rgb.min():.4f} - {rgb.max():.4f}") + # Check if result is all NaN + if np.all(np.isnan(rgb)): + return None + + # Replace any remaining NaN with 0 (cloud/invalid pixels) + rgb = np.nan_to_num(rgb, nan=0.0) return rgb - except ValueError as e: - print(f" ⚠ Error clipping to field boundary: {e}") + except ValueError: return None except Exception as e: - print(f" ✗ Error loading TIFF {tiff_path.name}: {e}") + return None + + +def load_and_composite_tiles_rgb(tile_paths, field_boundary): + """ + Load RGB from multiple overlapping tiles and composite them into a single image. + + Args: + tile_paths (list[Path]): List of tile file paths + field_boundary (shapely.Polygon): Field boundary for clipping + + Returns: + np.ndarray: Composited RGB data (height, width, 3) with values 0-1 + or None if error occurs + """ + if rasterio is None or field_boundary is None or not tile_paths: + return None + + try: + # Load and composite all tiles + rgb_arrays = [] + + for tile_path in tile_paths: + rgb = load_and_clip_tiff_rgb(tile_path, field_boundary) + if rgb is not None: + rgb_arrays.append(rgb) + + if not rgb_arrays: + return None + + # If single tile, return it + if len(rgb_arrays) == 1: + composited = rgb_arrays[0] + else: + # If multiple tiles, use max composite + stacked = np.stack(rgb_arrays, axis=0) + composited = np.max(stacked, axis=0) + + composited = composited.astype(np.float32) + + # Stretch contrast: normalize to 0-1 range based on actual min/max in the data + # This makes dim images visible + valid_data = composited[composited > 0] + if len(valid_data) > 0: + data_min = np.percentile(valid_data, 2) # 2nd percentile to handle outliers + data_max = np.percentile(valid_data, 98) # 98th percentile + + if data_max > data_min: + composited = (composited - data_min) / (data_max - data_min) + composited = np.clip(composited, 0, 1) + + return composited.astype(np.float32) + + except Exception as e: return None @@ -276,27 +360,24 @@ def create_temporal_rgb_grid(harvest_date, field_data, field_id, tiff_dir, field actual_dates = [] # Store actual dates of TIFFs found for target in target_dates: - tiff_path, days_diff = find_closest_tiff(target, tiff_dir, days_window=60, field_boundary=field_boundary) + tile_paths, actual_date, days_diff = find_overlapping_tiles(target, tiff_dir, field_boundary, days_window=60) - if tiff_path is None: + if not tile_paths or actual_date is None: rgb_images.append(None) days_offsets.append(None) actual_dates.append(None) - print(f" ⚠ No TIFF found within 60 days of {target.strftime('%Y-%m-%d')} with sufficient data") + print(f" ⚠ No tiles found within 60 days of {target.strftime('%Y-%m-%d')} with sufficient data") continue - # Extract date from filename: YYYY-MM-DD.tif - tiff_date = pd.to_datetime(tiff_path.stem) - - rgb = load_and_clip_tiff_rgb(tiff_path, field_boundary) + rgb = load_and_composite_tiles_rgb(tile_paths, field_boundary) rgb_images.append(rgb) days_offsets.append(days_diff) - actual_dates.append(tiff_date) + actual_dates.append(actual_date) if rgb is not None: - print(f" ✓ Loaded {tiff_path.name} ({days_diff:+d}d from target)") + print(f" ✓ Loaded {len(tile_paths)} tile(s) for {actual_date.strftime('%Y-%m-%d')} ({days_diff:+d}d from target)") else: - print(f" ⚠ Loaded {tiff_path.name} but RGB data is None") + print(f" ⚠ Loaded {len(tile_paths)} tile(s) but RGB data is None") # Create 5x3 grid plot (15 images) fig, axes = plt.subplots(3, 5, figsize=(25, 15)) @@ -360,11 +441,16 @@ def create_temporal_rgb_grid(harvest_date, field_data, field_id, tiff_dir, field filename = f'field_{field_id}_{harvest_date_str}_{model_name}_harvest_rgb.png' output_path = Path(output_dir) / filename - plt.savefig(output_path, dpi=100, bbox_inches='tight') - plt.close() - - print(f" ✓ Saved: {filename}") - return output_path + try: + plt.savefig(output_path, dpi=100, format='png') + plt.close() + + print(f" ✓ Saved: {filename}") + return output_path + except Exception as e: + plt.close() + print(f" ✗ Error saving PNG: {e}") + return None def generate_rgb_grids(field_data, field_id, registered_harvest_dates, predicted_harvest_dates, diff --git a/r_app/10_create_master_grid_and_split_tiffs.R b/r_app/10_create_master_grid_and_split_tiffs.R index 4f78409..6279fb2 100644 --- a/r_app/10_create_master_grid_and_split_tiffs.R +++ b/r_app/10_create_master_grid_and_split_tiffs.R @@ -11,11 +11,28 @@ library(terra) library(sf) # ============================================================================ -# CONFIGURATION +# CONFIGURATION & COMMAND-LINE ARGUMENTS # ============================================================================ +# Parse command-line arguments for date filtering +args <- commandArgs(trailingOnly = TRUE) + +# Example: Rscript 10_create_master_grid_and_split_tiffs.R 2026-01-13 2026-01-17 +start_date <- NULL +end_date <- NULL + +if (length(args) >= 1) { + start_date <- as.Date(args[1]) + cat("Filtering: start date =", as.character(start_date), "\n") +} + +if (length(args) >= 2) { + end_date <- as.Date(args[2]) + cat("Filtering: end date =", as.character(end_date), "\n") +} + PROJECT <- "angata" -TIFF_FOLDER <- file.path("laravel_app", "storage", "app", PROJECT, "merged_tif_8b") +TIFF_FOLDER <- file.path("..", "laravel_app", "storage", "app", PROJECT, "merged_tif_8b") # GRID SIZE CONFIGURATION - Change this to use different grid sizes # Options: 5x5 (25 tiles), 10x10 (100 tiles), etc. @@ -25,10 +42,10 @@ GRID_NCOLS <- 5 # Construct grid-specific subfolder path GRID_SIZE_LABEL <- paste0(GRID_NCOLS, "x", GRID_NROWS) -OUTPUT_FOLDER <- file.path("laravel_app", "storage", "app", PROJECT, "daily_tiles_split", GRID_SIZE_LABEL) +OUTPUT_FOLDER <- file.path("..", "laravel_app", "storage", "app", PROJECT, "daily_tiles_split", GRID_SIZE_LABEL) # Load field boundaries for overlap checking -GEOJSON_PATH <- file.path("laravel_app", "storage", "app", PROJECT, "Data", "pivot.geojson") +GEOJSON_PATH <- file.path("..", "laravel_app", "storage", "app", PROJECT, "Data", "pivot.geojson") cat("Combined: Create Master Grid (", GRID_SIZE_LABEL, ") and Split TIFFs into Tiles\n", sep = "") cat("Grid subfolder: daily_tiles_split/", GRID_SIZE_LABEL, "/\n", sep = "") @@ -40,31 +57,50 @@ cat("Grid subfolder: daily_tiles_split/", GRID_SIZE_LABEL, "/\n", sep = "") cat("\n[PART 1] Creating Master Grid\n") # Load field boundaries for overlap checking -cat("\n[1] Loading field boundaries from GeoJSON...\n") +cat("\n[1] Checking for existing master grid...\n") -if (!file.exists(GEOJSON_PATH)) { - stop("GeoJSON file not found at: ", GEOJSON_PATH, "\n", - "Please ensure ", PROJECT, " has a pivot.geojson file.") -} +# Check if master grid already exists +MASTER_GRID_PATH <- file.path(OUTPUT_FOLDER, paste0("master_grid_", GRID_SIZE_LABEL, ".geojson")) -field_boundaries_sf <- st_read(GEOJSON_PATH, quiet = TRUE) -field_boundaries_vect <- terra::vect(GEOJSON_PATH) - -cat(" ✓ Loaded ", nrow(field_boundaries_sf), " field(s)\n", sep = "") - -# Try to find a name column (could be 'name', 'field', 'field_name', etc.) -field_names <- NA -if ("name" %in% names(field_boundaries_sf)) { - field_names <- field_boundaries_sf$name -} else if ("field" %in% names(field_boundaries_sf)) { - field_names <- field_boundaries_sf$field -} else if ("field_name" %in% names(field_boundaries_sf)) { - field_names <- field_boundaries_sf$field_name +if (file.exists(MASTER_GRID_PATH)) { + cat(" ✓ Found existing master grid at:\n ", MASTER_GRID_PATH, "\n", sep = "") + master_grid_sf <- st_read(MASTER_GRID_PATH, quiet = TRUE) + field_boundaries_sf <- NULL # No need to load pivot.geojson + field_boundaries_vect <- NULL + + cat(" ✓ Loaded grid with ", nrow(master_grid_sf), " tiles\n", sep = "") + } else { - field_names <- 1:nrow(field_boundaries_sf) # Fall back to indices + # No existing grid - need to create one from pivot.geojson + cat(" No existing grid found. Creating new one from pivot.geojson...\n") + + if (!file.exists(GEOJSON_PATH)) { + stop("GeoJSON file not found at: ", GEOJSON_PATH, "\n", + "Please ensure ", PROJECT, " has a pivot.geojson file, or run this script ", + "from the same directory as a previous successful run (grid already exists).") + } + + field_boundaries_sf <- st_read(GEOJSON_PATH, quiet = TRUE) + field_boundaries_vect <- terra::vect(GEOJSON_PATH) + + cat(" ✓ Loaded ", nrow(field_boundaries_sf), " field(s) from GeoJSON\n", sep = "") } -cat(" Fields: ", paste(field_names, collapse = ", "), "\n", sep = "") +# Try to find a name column (only if field_boundaries_sf exists) +if (!is.null(field_boundaries_sf)) { + field_names <- NA + if ("name" %in% names(field_boundaries_sf)) { + field_names <- field_boundaries_sf$name + } else if ("field" %in% names(field_boundaries_sf)) { + field_names <- field_boundaries_sf$field + } else if ("field_name" %in% names(field_boundaries_sf)) { + field_names <- field_boundaries_sf$field_name + } else { + field_names <- 1:nrow(field_boundaries_sf) # Fall back to indices + } + + cat(" Fields: ", paste(field_names, collapse = ", "), "\n", sep = "") +} # Helper function: Check if a tile overlaps with any field (simple bbox overlap) tile_overlaps_fields <- function(tile_extent, field_geoms) { @@ -105,6 +141,27 @@ cat("\n[2] Checking TIFF extents...\n") tiff_files <- list.files(TIFF_FOLDER, pattern = "\\.tif$", full.names = FALSE) tiff_files <- sort(tiff_files) +# Filter by date range if specified +if (!is.null(start_date) || !is.null(end_date)) { + cat("\nApplying date filter...\n") + + file_dates <- as.Date(sub("\\.tif$", "", tiff_files)) + + if (!is.null(start_date) && !is.null(end_date)) { + keep_idx <- file_dates >= start_date & file_dates <= end_date + cat(" Date range: ", as.character(start_date), " to ", as.character(end_date), "\n", sep = "") + } else if (!is.null(start_date)) { + keep_idx <- file_dates >= start_date + cat(" From: ", as.character(start_date), "\n", sep = "") + } else { + keep_idx <- file_dates <= end_date + cat(" Until: ", as.character(end_date), "\n", sep = "") + } + + tiff_files <- tiff_files[keep_idx] + cat(" ✓ Filtered to ", length(tiff_files), " file(s)\n", sep = "") +} + if (length(tiff_files) == 0) { stop("No TIFF files found in ", TIFF_FOLDER) } @@ -242,26 +299,33 @@ if (file.exists(master_grid_file)) { cat("\n[PART 2] Creating Filtered Grid (only overlapping tiles)\n") -cat("\n[7] Filtering master grid to only overlapping tiles...\n") - -# Check which tiles overlap with any field -overlapping_tile_indices <- c() -for (tile_idx in 1:nrow(master_grid_sf)) { - tile_geom <- master_grid_sf[tile_idx, ] +# If grid was loaded from file, it's already filtered. Skip filtering. +if (!file.exists(MASTER_GRID_PATH)) { + cat("\n[7] Filtering master grid to only overlapping tiles...\n") - # Check overlap with any field - if (tile_overlaps_fields(st_bbox(tile_geom$geometry), field_boundaries_sf$geometry)) { - overlapping_tile_indices <- c(overlapping_tile_indices, tile_idx) + # Check which tiles overlap with any field + overlapping_tile_indices <- c() + for (tile_idx in 1:nrow(master_grid_sf)) { + tile_geom <- master_grid_sf[tile_idx, ] + + # Check overlap with any field + if (tile_overlaps_fields(st_bbox(tile_geom$geometry), field_boundaries_sf$geometry)) { + overlapping_tile_indices <- c(overlapping_tile_indices, tile_idx) + } } + + cat(" Found ", length(overlapping_tile_indices), " overlapping tiles out of ", N_TILES, "\n", sep = "") + cat(" Reduction: ", N_TILES - length(overlapping_tile_indices), " empty tiles will NOT be created\n", sep = "") + + # Create filtered grid with only overlapping tiles + filtered_grid_sf <- master_grid_sf[overlapping_tile_indices, ] + filtered_grid_sf$tile_id <- sprintf("%02d", overlapping_tile_indices) +} else { + cat("\n[7] Using pre-filtered grid (already loaded from file)...\n") + # Grid was already loaded - it's already filtered + filtered_grid_sf <- master_grid_sf } -cat(" Found ", length(overlapping_tile_indices), " overlapping tiles out of ", N_TILES, "\n", sep = "") -cat(" Reduction: ", N_TILES - length(overlapping_tile_indices), " empty tiles will NOT be created\n", sep = "") - -# Create filtered grid with only overlapping tiles -filtered_grid_sf <- master_grid_sf[overlapping_tile_indices, ] -filtered_grid_sf$tile_id <- sprintf("%02d", overlapping_tile_indices) - # Convert to SpatVector for makeTiles filtered_grid_vect <- terra::vect(filtered_grid_sf) @@ -314,7 +378,7 @@ for (file_idx in seq_along(tiff_files)) { dir.create(date_output_folder, recursive = TRUE, showWarnings = FALSE) } - cat(" Creating ", length(overlapping_tile_indices), " tiles...\n", sep = "") + cat(" Creating ", nrow(filtered_grid_sf), " tiles...\n", sep = "") # Use makeTiles with FILTERED grid (only overlapping tiles) tiles_list <- terra::makeTiles( diff --git a/r_app/40_mosaic_creation.R b/r_app/40_mosaic_creation.R index f1d6b91..7efb281 100644 --- a/r_app/40_mosaic_creation.R +++ b/r_app/40_mosaic_creation.R @@ -54,14 +54,12 @@ main <- function() { if (is.na(end_date)) { message("Invalid end_date provided. Using current date.") end_date <- Sys.Date() - end_date <- "2026-01-01" # Default date for testing } } else if (exists("end_date_str", envir = .GlobalEnv)) { end_date <- as.Date(get("end_date_str", envir = .GlobalEnv)) } else { # Default to current date if no argument is provided end_date <- Sys.Date() - end_date <- "2026-01-01" # Default date for testing message("No end_date provided. Using current date: ", format(end_date)) }