#!/usr/bin/env python
"""
Batch RGB Validation for Top 100 Largest Fields - V3

Same as v1 but with dynamic image selection (checks for actual data, skips
empty/black images). Generates 5x3 RGB temporal grids for the latest complete
harvest season of the N largest fields (default 100). Uses actual season_end
dates from harvest.xlsx for visual validation of field conditions at harvest.

Configuration:
- GeoJSON: pivot.geojson (defines field boundaries and sizes)
- Harvest data: harvest.xlsx (season_end dates for completed harvests)
- Output: RGB directory with field_name_YYYYMMDD_harvest_rgb.png

Usage:
    python batch_rgb_validation_top_fields_v3.py --field 1
    python batch_rgb_validation_top_fields_v3.py
    python batch_rgb_validation_top_fields_v3.py --top 50

Output:
- Saves 5x3 RGB grids to: laravel_app/storage/app/angata/RGB/
- Filenames: field___harvest_rgb.png
- Each grid shows 15 images around the harvest date (dynamic date selection,
  skips empty images)
"""
import json
import numpy as np
import pandas as pd
from pathlib import Path
from datetime import datetime, timedelta
import sys
import argparse

# Add parent directory to path so the sibling rgb_visualization module resolves
sys.path.insert(0, str(Path(__file__).parent))
from rgb_visualization import generate_rgb_grids


def load_geojson_and_calculate_areas(geojson_path):
    """
    Load GeoJSON and calculate the area of each field.

    Args:
        geojson_path: Path to a GeoJSON FeatureCollection whose features carry
            'field' (id) and 'name' properties with Polygon/MultiPolygon geometry.

    Returns:
        pd.DataFrame: Columns [field, field_name, area_m2, area_hectares]
        sorted by area descending, or None if the file is missing or no
        feature has a positive area.
    """
    geojson_path = Path(geojson_path)
    if not geojson_path.exists():
        print(f"✗ GeoJSON not found: {geojson_path}")
        return None

    print(f"Loading GeoJSON: {geojson_path}")
    with open(geojson_path) as f:
        geojson_data = json.load(f)

    fields = []
    for feature in geojson_data.get('features', []):
        props = feature.get('properties', {})
        field_id = str(props.get('field', ''))
        field_name = props.get('name', f"field_{field_id}")
        geometry = feature.get('geometry', {})
        geom_type = geometry.get('type', '')
        coordinates = geometry.get('coordinates', [])

        # Simple planar area via the Shoelace formula (see calculate_polygon_area)
        area_m2 = 0
        if geom_type == 'Polygon' and coordinates:
            coords = coordinates[0]  # Exterior ring only; holes are ignored
            area_m2 = calculate_polygon_area(coords)
        elif geom_type == 'MultiPolygon' and coordinates:
            for poly_coords in coordinates:
                area_m2 += calculate_polygon_area(poly_coords[0])

        if area_m2 > 0:
            fields.append({
                'field': field_id,
                'field_name': field_name,
                'area_m2': area_m2,
                'area_hectares': area_m2 / 10000
            })

    # Guard: sort_values on an empty, column-less DataFrame would raise KeyError
    if not fields:
        print("✗ No fields with positive area found in GeoJSON")
        return None

    df = pd.DataFrame(fields)
    df = df.sort_values('area_m2', ascending=False)

    print(f" ✓ Loaded {len(df)} fields")
    print(f" Top 10 largest fields (hectares):")
    for _, row in df.head(10).iterrows():
        print(f" {row['field_name']:30s} ({row['field']:>6s}): {row['area_hectares']:>8.2f} ha")

    return df


def calculate_polygon_area(coords):
    """
    Calculate the area of a polygon using the Shoelace formula (in m²).

    Assumes coordinates are (lon, lat) pairs in degrees and applies a rough
    equirectangular conversion to meters around the ring's average latitude.

    Args:
        coords: Sequence of [lon, lat] vertices of a closed ring.

    Returns:
        float: Absolute planar area in square meters (0 for degenerate rings).
    """
    if len(coords) < 3:
        return 0

    # Rough conversion: at the equator, 1 degree ≈ 111 km.
    # Longitude degrees shrink with cos(latitude); use the ring's mean latitude.
    lats = [c[1] for c in coords]
    avg_lat = np.mean(lats)
    lat_m_per_deg = 111000
    lon_m_per_deg = 111000 * np.cos(np.radians(avg_lat))

    # Convert to meters relative to the first vertex to keep values small
    coords_m = []
    for lon, lat in coords:
        x = (lon - coords[0][0]) * lon_m_per_deg
        y = (lat - coords[0][1]) * lat_m_per_deg
        coords_m.append((x, y))

    # Shoelace formula: signed twice-area accumulated over edge cross products
    area = 0
    for i in range(len(coords_m)):
        j = (i + 1) % len(coords_m)
        area += coords_m[i][0] * coords_m[j][1]
        area -= coords_m[j][0] * coords_m[i][1]

    return abs(area) / 2


def load_harvest_dates_from_xlsx(harvest_xlsx_path, top_50_fields_df):
    """
    Load harvest data from Excel and get the latest completed season per field.

    Returns the season_end date for each field (latest complete season where
    season_end is not null), restricted to the fields in top_50_fields_df.

    Args:
        harvest_xlsx_path (Path): Path to harvest.xlsx
        top_50_fields_df (pd.DataFrame): DataFrame with 'field' (and
            optionally 'field_name') columns used to filter the harvest rows.

    Returns:
        dict: {field_id: {'field_name': str, 'harvest_date': pd.Timestamp}}
        Empty dict on any load/parse failure.
    """
    harvest_xlsx_path = Path(harvest_xlsx_path)
    if not harvest_xlsx_path.exists():
        print(f"✗ Harvest Excel file not found: {harvest_xlsx_path}")
        return {}

    print(f"Loading harvest data: {harvest_xlsx_path}")
    try:
        harvest_df = pd.read_excel(harvest_xlsx_path)

        # A sheet without season_end cannot yield any harvest dates
        if 'season_end' not in harvest_df.columns:
            print(f"✗ Error loading harvest data: missing 'season_end' column")
            return {}
        harvest_df['season_end'] = pd.to_datetime(harvest_df['season_end'], errors='coerce')

        # Filter to the selected fields and keep only completed seasons
        top_50_field_ids = set(top_50_fields_df['field'].astype(str).str.strip())
        harvest_df['field'] = harvest_df['field'].astype(str).str.strip()
        harvest_df = harvest_df[harvest_df['field'].isin(top_50_field_ids)]
        harvest_df = harvest_df[harvest_df['season_end'].notna()]

        # Group by field and pick the LATEST (most recent) season_end
        latest_harvests = {}
        for field_id in top_50_field_ids:
            field_records = harvest_df[harvest_df['field'] == field_id]
            if len(field_records) > 0:
                # Row with the latest season_end for this field
                latest_idx = field_records['season_end'].idxmax()
                latest_row = field_records.loc[latest_idx]

                # Resolve the display name from the caller's DataFrame
                field_info = top_50_fields_df[top_50_fields_df['field'] == field_id]
                if len(field_info) > 0:
                    field_name = field_info.iloc[0]['field_name']
                else:
                    field_name = f"field_{field_id}"

                latest_harvests[field_id] = {
                    'field_name': field_name,
                    'harvest_date': latest_row['season_end']
                }

        print(f" ✓ Loaded latest complete seasons for {len(latest_harvests)} fields")
        return latest_harvests

    except Exception as e:
        # Best-effort loader: report and return empty so the caller can bail out
        print(f"✗ Error loading harvest data: {e}")
        return {}


def main():
    """CLI entry point: single-field mode (--field) or batch mode over the top-N fields."""
    parser = argparse.ArgumentParser(
        description='RGB validation of harvest dates using satellite imagery (v3 - dynamic)')
    parser.add_argument('--field', type=str, default=None,
                        help='Specific field ID to validate (e.g., "1" or "10022")')
    parser.add_argument('--project', type=str, default='angata',
                        help='Project name (default: angata)')
    parser.add_argument('--top', type=int, default=100,
                        help='Number of largest fields to process in batch mode (default: 100)')
    args = parser.parse_args()

    print("="*90)
    if args.field:
        print(f"RGB VALIDATION V3 - SINGLE FIELD: {args.field}")
    else:
        print(f"RGB VALIDATION V3 - TOP {args.top} LARGEST FIELDS")
        print("Visual inspection of harvest dates from harvest.xlsx using RGB imagery (dynamic selection)")
    print("="*90)

    # Configuration
    project = args.project
    geojson_path = Path(f"laravel_app/storage/app/{project}/Data/pivot.geojson")
    harvest_xlsx = Path(f"laravel_app/storage/app/{project}/Data/harvest.xlsx")
    output_dir = Path(f"laravel_app/storage/app/{project}/RGB")
    tiff_dir = Path(f"laravel_app/storage/app/{project}/merged_final_tif/5x5")

    # Verify paths
    if not geojson_path.exists():
        print(f"✗ GeoJSON not found: {geojson_path}")
        return
    if not harvest_xlsx.exists():
        print(f"✗ Harvest Excel not found: {harvest_xlsx}")
        return
    if not tiff_dir.exists():
        print(f"✗ TIFF directory not found: {tiff_dir}")
        return

    output_dir.mkdir(parents=True, exist_ok=True)

    # ---- Single field mode ----
    if args.field:
        print(f"\n[1/3] Loading harvest data for field {args.field}...")
        harvest_df = pd.read_excel(harvest_xlsx)
        harvest_df['season_end'] = pd.to_datetime(harvest_df['season_end'], errors='coerce')
        harvest_df['field'] = harvest_df['field'].astype(str).str.strip()

        field_records = harvest_df[harvest_df['field'] == args.field]
        field_records = field_records[field_records['season_end'].notna()]

        if len(field_records) == 0:
            print(f"✗ No harvest data found for field {args.field}")
            return

        # Latest complete harvest for this field
        latest_idx = field_records['season_end'].idxmax()
        latest_row = field_records.loc[latest_idx]
        harvest_date = latest_row['season_end']
        print(f" ✓ Found harvest: {harvest_date.strftime('%Y-%m-%d')}")

        # Resolve the field's display name from the GeoJSON properties
        print(f"\n[2/3] Loading field name from GeoJSON...")
        with open(geojson_path) as f:
            geojson_data = json.load(f)
        field_name = f"field_{args.field}"
        for feature in geojson_data.get('features', []):
            props = feature.get('properties', {})
            if str(props.get('field', '')) == args.field:
                field_name = props.get('name', field_name)
                break
        print(f" ✓ Field name: {field_name}")

        # Generate RGB grid
        print(f"\n[3/3] Generating RGB validation grid (v3 dynamic)...")
        results = generate_rgb_grids(
            field_data=None,
            field_id=args.field,
            registered_harvest_dates=[],
            predicted_harvest_dates=[
                {
                    'harvest_date': harvest_date,
                    'model_name': 'harvest_xlsx'
                }
            ],
            output_dir=str(output_dir),
            tiff_dir=str(tiff_dir),
            geojson_path=str(geojson_path)
        )

        print("\n" + "="*90)
        if results['predicted']:
            print(f"✓ RGB grid generated successfully!")
            print(f" Field: {field_name} (ID: {args.field})")
            print(f" Harvest date: {harvest_date.strftime('%Y-%m-%d')}")
            print(f" Output: {output_dir}")
        else:
            print(f"⚠ No RGB grid generated (no imagery available)")
        print("="*90)
        return

    # ---- Batch mode for the top-N largest fields ----
    print(f"\n[1/4] Loading GeoJSON and identifying top {args.top} largest fields...")
    fields_df = load_geojson_and_calculate_areas(geojson_path)
    if fields_df is None:
        return
    top_fields = fields_df.head(args.top)
    print(f" ✓ Selected {len(top_fields)} largest fields for processing")

    # Step 2: Load harvest dates from Excel
    print("\n[2/4] Loading harvest dates from Excel (latest complete seasons)...")
    harvest_dates = load_harvest_dates_from_xlsx(harvest_xlsx, top_fields)
    if len(harvest_dates) == 0:
        print("✗ No harvest dates found in Excel file")
        return

    print(f" ✓ Found {len(harvest_dates)} fields with completed seasons")
    for field_id, info in list(harvest_dates.items())[:5]:
        print(f" - {info['field_name']:30s}: {info['harvest_date'].strftime('%Y-%m-%d')}")
    if len(harvest_dates) > 5:
        print(f" ... and {len(harvest_dates) - 5} more")

    # Step 3: Generate RGB grids for each field
    print("\n[3/4] Generating RGB validation grids (v3 dynamic)...")
    rgb_count = 0
    for idx, (field_id, harvest_info) in enumerate(harvest_dates.items(), 1):
        field_name = harvest_info['field_name']
        harvest_date = harvest_info['harvest_date']
        try:
            # Run RGB visualization (harvest dates only, no registered/predicted distinction)
            results = generate_rgb_grids(
                field_data=None,  # Not needed - just for function compatibility
                field_id=field_id,
                registered_harvest_dates=[],  # Empty - using harvest.xlsx instead
                predicted_harvest_dates=[
                    {
                        'harvest_date': harvest_date,
                        'model_name': 'harvest_xlsx'
                    }
                ],
                output_dir=str(output_dir),  # All PNGs in same folder
                tiff_dir=str(tiff_dir),
                geojson_path=str(geojson_path)
            )
            if results['predicted']:
                rgb_count += 1
                print(f" [{idx:2d}/{len(harvest_dates)}] {field_name}: ✓ {harvest_date.strftime('%Y-%m-%d')}")
            else:
                print(f" [{idx:2d}/{len(harvest_dates)}] {field_name}: ⚠ No RGB grid (no imagery available)")
        except Exception as e:
            # Keep processing the remaining fields even if one fails
            print(f" [{idx:2d}/{len(harvest_dates)}] {field_name}: ✗ Error - {e}")

    # Step 4: Summary
    print("\n" + "="*90)
    print(f"SUMMARY:")
    print(f" Fields with harvest dates: {len(harvest_dates)}")
    print(f" RGB grids generated: {rgb_count}/{len(harvest_dates)}")
    print(f" Output directory: {output_dir}")
    print("="*90)
    print("\nVisual inspection checklist:")
    print(" ✓ Brown/bare soil at T~0d (harvest date) = Field properly harvested")
    print(" ⚠ Green vegetation at T~0d = Possible data error or replanting")
    print(" ✓ Green → Brown progression = Normal harvest sequence")
    print("="*90)


if __name__ == "__main__":
    main()