#!/usr/bin/env python
"""
Batch RGB Validation for Top 50 Largest Fields

Generates 5x3 RGB temporal grids for the latest complete harvest season of
the 50 largest fields. Uses actual season_end dates from harvest.xlsx for
visual validation of field conditions at harvest.

Configuration:
    - GeoJSON: pivot.geojson (defines field boundaries and sizes)
    - Harvest data: harvest.xlsx (season_end dates for completed harvests)
    - CI data: ci_data_for_python.csv
    - Output: RGB directory with field_name_YYYYMMDD_harvest_rgb.png

Usage:
    python batch_rgb_validation_top_fields.py

Output:
    - Saves 5x3 RGB grids to: laravel_app/storage/app/angata/RGB/
    - Filenames: field___harvest_rgb.png
    - Each grid shows 15 images at 7-day intervals around the season_end date
"""
import json
import sys
from datetime import datetime, timedelta
from pathlib import Path

import numpy as np
import pandas as pd

# Make sibling modules (rgb_visualization) importable regardless of the
# working directory the script is launched from.
sys.path.insert(0, str(Path(__file__).parent))


def load_geojson_and_calculate_areas(geojson_path):
    """
    Load GeoJSON and calculate area for each field.

    Args:
        geojson_path (str | Path): Path to the GeoJSON file defining field
            boundaries. Each feature is expected to carry 'field' and 'name'
            properties and a Polygon/MultiPolygon geometry.

    Returns:
        pd.DataFrame | None: Columns [field, field_name, area_m2,
        area_hectares] sorted by area descending, or None when the file is
        missing or no feature yields a positive area.
    """
    geojson_path = Path(geojson_path)
    if not geojson_path.exists():
        print(f"✗ GeoJSON not found: {geojson_path}")
        return None

    print(f"Loading GeoJSON: {geojson_path}")
    with open(geojson_path) as f:
        geojson_data = json.load(f)

    fields = []
    for feature in geojson_data.get('features', []):
        props = feature.get('properties', {})
        field_id = str(props.get('field', ''))
        field_name = props.get('name', f"field_{field_id}")
        geometry = feature.get('geometry', {})
        geom_type = geometry.get('type', '')
        coordinates = geometry.get('coordinates', [])

        # Simple area calculation using Shoelace formula
        area_m2 = 0
        if geom_type == 'Polygon' and coordinates:
            coords = coordinates[0]  # Exterior ring
            area_m2 = calculate_polygon_area(coords)
        elif geom_type == 'MultiPolygon' and coordinates:
            # Sum the exterior ring of each member polygon.
            for poly_coords in coordinates:
                area_m2 += calculate_polygon_area(poly_coords[0])

        if area_m2 > 0:
            fields.append({
                'field': field_id,
                'field_name': field_name,
                'area_m2': area_m2,
                'area_hectares': area_m2 / 10000
            })

    # FIX: an empty DataFrame has no 'area_m2' column, so sort_values would
    # raise KeyError. Return None instead, mirroring the missing-file path
    # (main() already handles a None result).
    if not fields:
        print(f"✗ No fields with positive area found in {geojson_path}")
        return None

    df = pd.DataFrame(fields)
    df = df.sort_values('area_m2', ascending=False)
    print(f" ✓ Loaded {len(df)} fields")
    print(f" Top 10 largest fields (hectares):")
    for _, row in df.head(10).iterrows():
        print(f" {row['field_name']:30s} ({row['field']:>6s}): {row['area_hectares']:>8.2f} ha")
    return df


def calculate_polygon_area(coords):
    """
    Calculate area of polygon using Shoelace formula (in m²).

    Assumes coordinates are (lon, lat) pairs in degrees and roughly converts
    them to meters using a local equirectangular approximation (111 km per
    degree of latitude, scaled by cos(latitude) for longitude).

    Args:
        coords (list): Ring of (lon, lat) pairs; fewer than 3 points is
            degenerate and yields 0.

    Returns:
        float: Approximate area in square meters (always >= 0).
    """
    if len(coords) < 3:
        return 0

    # Rough conversion: at equator, 1 degree ≈ 111 km
    # For lat/lon coordinates, use average latitude
    lats = [c[1] for c in coords]
    avg_lat = np.mean(lats)
    lat_m_per_deg = 111000
    lon_m_per_deg = 111000 * np.cos(np.radians(avg_lat))

    # Convert to meters relative to the first vertex (keeps values small).
    coords_m = []
    for lon, lat in coords:
        x = (lon - coords[0][0]) * lon_m_per_deg
        y = (lat - coords[0][1]) * lat_m_per_deg
        coords_m.append((x, y))

    # Shoelace formula
    area = 0
    for i in range(len(coords_m)):
        j = (i + 1) % len(coords_m)
        area += coords_m[i][0] * coords_m[j][1]
        area -= coords_m[j][0] * coords_m[i][1]
    return abs(area) / 2


def load_harvest_dates_from_xlsx(harvest_xlsx_path, top_50_fields_df):
    """
    Load harvest data from Excel file and get latest completed season for each field.

    Returns season_end date for each field (latest complete season where
    season_end is not null).

    Args:
        harvest_xlsx_path (Path): Path to harvest.xlsx
        top_50_fields_df (pd.DataFrame): DataFrame with 'field' column for filtering

    Returns:
        dict: {field_id: {'field_name': str, 'harvest_date': pd.Timestamp}}
              Empty dict on any load failure.
    """
    harvest_xlsx_path = Path(harvest_xlsx_path)
    if not harvest_xlsx_path.exists():
        print(f"✗ Harvest Excel file not found: {harvest_xlsx_path}")
        return {}

    print(f"Loading harvest data: {harvest_xlsx_path}")
    try:
        harvest_df = pd.read_excel(harvest_xlsx_path)

        # Ensure date columns are datetime
        if 'season_end' in harvest_df.columns:
            harvest_df['season_end'] = pd.to_datetime(harvest_df['season_end'], errors='coerce')
        else:
            # FIX: previously a missing column fell through to a KeyError
            # that was only masked by the broad except below.
            print(f"✗ 'season_end' column missing from {harvest_xlsx_path}")
            return {}

        # Filter to top 50 fields and get only rows with season_end filled in
        top_50_field_ids = set(top_50_fields_df['field'].astype(str).str.strip())
        harvest_df['field'] = harvest_df['field'].astype(str).str.strip()
        harvest_df = harvest_df[harvest_df['field'].isin(top_50_field_ids)]
        harvest_df = harvest_df[harvest_df['season_end'].notna()]

        # Group by field and get the LATEST (most recent) season_end
        latest_harvests = {}
        for field_id in top_50_field_ids:
            field_records = harvest_df[harvest_df['field'] == field_id]
            if len(field_records) > 0:
                # Get row with latest season_end
                latest_idx = field_records['season_end'].idxmax()
                latest_row = field_records.loc[latest_idx]

                # Get field name from top_50_fields_df
                field_info = top_50_fields_df[top_50_fields_df['field'] == field_id]
                if len(field_info) > 0:
                    field_name = field_info.iloc[0]['field_name']
                else:
                    field_name = f"field_{field_id}"

                latest_harvests[field_id] = {
                    'field_name': field_name,
                    'harvest_date': latest_row['season_end']
                }

        print(f" ✓ Loaded latest complete seasons for {len(latest_harvests)} fields")
        return latest_harvests
    except Exception as e:
        # Best-effort loader: report and let the caller handle the empty dict.
        print(f"✗ Error loading harvest data: {e}")
        return {}


def main():
    """Run the full pipeline: select fields, load harvest dates, render grids."""
    # Imported lazily so this module can be imported (e.g. for testing the
    # area/loader helpers) without pulling in the rgb_visualization
    # dependency chain. Behavior when run as a script is unchanged.
    from rgb_visualization import generate_rgb_grids

    print("="*90)
    print("BATCH RGB VALIDATION - TOP 50 LARGEST FIELDS")
    print("Visual inspection of latest harvest dates from harvest.xlsx using RGB imagery")
    print("="*90)

    # Configuration
    geojson_path = Path("laravel_app/storage/app/angata/Data/pivot.geojson")
    harvest_xlsx = Path("laravel_app/storage/app/angata/Data/harvest.xlsx")
    output_dir = Path("laravel_app/storage/app/angata/RGB")
    tiff_dir = Path("laravel_app/storage/app/angata/merged_final_tif/5x5")

    # Verify paths
    if not geojson_path.exists():
        print(f"✗ GeoJSON not found: {geojson_path}")
        return
    if not harvest_xlsx.exists():
        print(f"✗ Harvest Excel not found: {harvest_xlsx}")
        return
    if not tiff_dir.exists():
        print(f"✗ TIFF directory not found: {tiff_dir}")
        return
    output_dir.mkdir(parents=True, exist_ok=True)

    # Step 1: Load GeoJSON and get top 50 largest fields
    print("\n[1/4] Loading GeoJSON and identifying top 50 largest fields...")
    fields_df = load_geojson_and_calculate_areas(geojson_path)
    if fields_df is None:
        return
    top_50_fields = fields_df.head(50)
    print(f" ✓ Selected {len(top_50_fields)} largest fields for processing")

    # Step 2: Load harvest dates from Excel
    print("\n[2/4] Loading harvest dates from Excel (latest complete seasons)...")
    harvest_dates = load_harvest_dates_from_xlsx(harvest_xlsx, top_50_fields)
    if len(harvest_dates) == 0:
        print("✗ No harvest dates found in Excel file")
        return
    print(f" ✓ Found {len(harvest_dates)} fields with completed seasons")
    for field_id, info in list(harvest_dates.items())[:5]:
        print(f" - {info['field_name']:30s}: {info['harvest_date'].strftime('%Y-%m-%d')}")
    if len(harvest_dates) > 5:
        print(f" ... and {len(harvest_dates) - 5} more")

    # Step 3: Generate RGB grids for each field
    print("\n[3/4] Generating RGB validation grids...")
    rgb_count = 0
    for idx, (field_id, harvest_info) in enumerate(harvest_dates.items(), 1):
        field_name = harvest_info['field_name']
        harvest_date = harvest_info['harvest_date']
        try:
            # Run RGB visualization (harvest dates only, no registered/predicted distinction)
            results = generate_rgb_grids(
                field_data=None,  # Not needed - just for function compatibility
                field_id=field_id,
                registered_harvest_dates=[],  # Empty - using harvest.xlsx instead
                predicted_harvest_dates=[
                    {
                        'harvest_date': harvest_date,
                        'model_name': 'harvest_xlsx'
                    }
                ],
                output_dir=str(output_dir),  # All PNGs in same folder
                tiff_dir=str(tiff_dir),
                geojson_path=str(geojson_path)
            )
            if results['predicted']:
                rgb_count += 1
                print(f" [{idx:2d}/{len(harvest_dates)}] {field_name}: ✓ {harvest_date.strftime('%Y-%m-%d')}")
            else:
                print(f" [{idx:2d}/{len(harvest_dates)}] {field_name}: ⚠ No RGB grid (no imagery available)")
        except Exception as e:
            # Keep going: one failing field must not abort the whole batch.
            print(f" [{idx:2d}/{len(harvest_dates)}] {field_name}: ✗ Error - {e}")

    # Summary
    print("\n" + "="*90)
    print(f"SUMMARY:")
    print(f" Fields with harvest dates: {len(harvest_dates)}")
    print(f" RGB grids generated: {rgb_count}/{len(harvest_dates)}")
    print(f" Output directory: {output_dir}")
    print("="*90)
    print("\nVisual inspection checklist:")
    print(" ✓ Brown/bare soil at T~0d (harvest date) = Field properly harvested")
    print(" ⚠ Green vegetation at T~0d = Possible data error or replanting")
    print(" ✓ Green → Brown progression = Normal harvest sequence")
    print("="*90)


if __name__ == "__main__":
    main()