# Source: SmartCane/python_app/batch_rgb_validation_top_fields_v3.py
#!/usr/bin/env python
"""
Batch RGB Validation for Top 100 Largest Fields - V3
Same as v1 but with dynamic image selection (checks for actual data, skips empty/black images).
Generates 5x3 RGB temporal grids for the latest complete harvest season of the 100 largest fields.
Uses actual season_end dates from harvest.xlsx for visual validation of field conditions at harvest.
Configuration:
- GeoJSON: pivot.geojson (defines field boundaries and sizes)
- Harvest data: harvest.xlsx (season_end dates for completed harvests)
- Output: RGB directory with field_name_YYYYMMDD_harvest_rgb.png
Usage:
python batch_rgb_validation_top_fields_v3.py --field 1
python batch_rgb_validation_top_fields_v3.py
Output:
- Saves 5x3 RGB grids to: laravel_app/storage/app/angata/RGB/
- Filenames: field_<NAME>_<YYYYMMDD>_harvest_rgb.png
- Each grid shows 15 images around the harvest date (dynamic date selection, skips empty images)
"""
import json
import numpy as np
import pandas as pd
from pathlib import Path
from datetime import datetime, timedelta
import sys
import argparse
# Add parent directory to path for imports
sys.path.insert(0, str(Path(__file__).parent))
from rgb_visualization import generate_rgb_grids
def load_geojson_and_calculate_areas(geojson_path):
    """
    Load GeoJSON field boundaries and compute each field's approximate area.

    Areas come from calculate_polygon_area (Shoelace formula on lat/lon
    coordinates roughly projected to metres); fields with zero/invalid area
    are dropped.

    Args:
        geojson_path: Path to the pivot.geojson file.

    Returns:
        pd.DataFrame: Columns [field, field_name, area_m2, area_hectares]
        sorted by area_m2 descending, or None if the file is missing or
        no field has a positive area.
    """
    geojson_path = Path(geojson_path)
    if not geojson_path.exists():
        print(f"✗ GeoJSON not found: {geojson_path}")
        return None
    print(f"Loading GeoJSON: {geojson_path}")
    with open(geojson_path) as f:
        geojson_data = json.load(f)
    fields = []
    for feature in geojson_data.get('features', []):
        props = feature.get('properties', {})
        field_id = str(props.get('field', ''))
        field_name = props.get('name', f"field_{field_id}")
        geometry = feature.get('geometry', {})
        geom_type = geometry.get('type', '')
        coordinates = geometry.get('coordinates', [])
        # Simple area calculation using Shoelace formula
        area_m2 = 0
        if geom_type == 'Polygon' and coordinates:
            coords = coordinates[0]  # Exterior ring (holes are ignored)
            area_m2 = calculate_polygon_area(coords)
        elif geom_type == 'MultiPolygon' and coordinates:
            for poly_coords in coordinates:
                area_m2 += calculate_polygon_area(poly_coords[0])
        if area_m2 > 0:
            fields.append({
                'field': field_id,
                'field_name': field_name,
                'area_m2': area_m2,
                'area_hectares': area_m2 / 10000
            })
    # Guard: pd.DataFrame([]) has no 'area_m2' column, so sort_values would
    # raise KeyError on an empty feature set. Honor the None-on-failure
    # contract instead.
    if not fields:
        print("✗ No fields with positive area found in GeoJSON")
        return None
    df = pd.DataFrame(fields)
    df = df.sort_values('area_m2', ascending=False)
    print(f" ✓ Loaded {len(df)} fields")
    print(f" Top 10 largest fields (hectares):")
    for i, row in df.head(10).iterrows():
        print(f" {row['field_name']:30s} ({row['field']:>6s}): {row['area_hectares']:>8.2f} ha")
    return df
def calculate_polygon_area(coords):
    """
    Approximate the area of a lat/lon polygon ring in square metres.

    The ring is projected onto a local tangent plane (degrees scaled to
    metres at the ring's mean latitude), then the Shoelace formula is
    applied.

    Args:
        coords: Sequence of (lon, lat) pairs forming the ring.

    Returns:
        Absolute area in m²; 0 for degenerate rings (fewer than 3 points).
    """
    if len(coords) < 3:
        return 0
    # Rough conversion: one degree of latitude ≈ 111 km everywhere; one
    # degree of longitude shrinks with cos(latitude) of the ring.
    mean_lat = np.mean([pt[1] for pt in coords])
    m_per_deg_lat = 111000
    m_per_deg_lon = 111000 * np.cos(np.radians(mean_lat))
    # Project to metres relative to the first vertex.
    origin_lon, origin_lat = coords[0][0], coords[0][1]
    xy = [((lon - origin_lon) * m_per_deg_lon,
           (lat - origin_lat) * m_per_deg_lat)
          for lon, lat in coords]
    # Shoelace formula over consecutive vertex pairs, wrapping around.
    twice_area = 0.0
    for (x1, y1), (x2, y2) in zip(xy, xy[1:] + xy[:1]):
        twice_area += x1 * y2 - x2 * y1
    return abs(twice_area) / 2
def load_harvest_dates_from_xlsx(harvest_xlsx_path, top_50_fields_df):
    """
    Load harvest data from Excel and pick the latest completed season per field.

    Only rows with a non-null season_end count as completed; for each field
    the most recent season_end wins.

    Args:
        harvest_xlsx_path (Path): Path to harvest.xlsx.
        top_50_fields_df (pd.DataFrame): Fields to consider; must contain a
            'field' column ('field_name' is used for display when present).

    Returns:
        dict: {field_id: {'field_name': str, 'harvest_date': pd.Timestamp}}.
        Empty dict on any failure (missing file, missing columns, read error).
    """
    harvest_xlsx_path = Path(harvest_xlsx_path)
    if not harvest_xlsx_path.exists():
        print(f"✗ Harvest Excel file not found: {harvest_xlsx_path}")
        return {}
    print(f"Loading harvest data: {harvest_xlsx_path}")
    try:
        harvest_df = pd.read_excel(harvest_xlsx_path)
        # Fail fast with a clear message: previously a missing 'season_end'
        # column bypassed the dtype guard and surfaced as an opaque
        # KeyError('season_end') swallowed by the except below.
        missing = {'field', 'season_end'} - set(harvest_df.columns)
        if missing:
            print(f"✗ Harvest Excel is missing required columns: {sorted(missing)}")
            return {}
        # Ensure date column is datetime; unparseable values become NaT.
        harvest_df['season_end'] = pd.to_datetime(harvest_df['season_end'], errors='coerce')
        # Filter to the requested fields and keep only completed seasons.
        top_50_field_ids = set(top_50_fields_df['field'].astype(str).str.strip())
        harvest_df['field'] = harvest_df['field'].astype(str).str.strip()
        harvest_df = harvest_df[harvest_df['field'].isin(top_50_field_ids)]
        harvest_df = harvest_df[harvest_df['season_end'].notna()]
        # Group by field and get the LATEST (most recent) season_end
        latest_harvests = {}
        for field_id in top_50_field_ids:
            field_records = harvest_df[harvest_df['field'] == field_id]
            if len(field_records) > 0:
                # Row with the latest season_end for this field
                latest_idx = field_records['season_end'].idxmax()
                latest_row = field_records.loc[latest_idx]
                # Display name comes from the caller's DataFrame when available
                field_info = top_50_fields_df[top_50_fields_df['field'] == field_id]
                if len(field_info) > 0:
                    field_name = field_info.iloc[0]['field_name']
                else:
                    field_name = f"field_{field_id}"
                latest_harvests[field_id] = {
                    'field_name': field_name,
                    'harvest_date': latest_row['season_end']
                }
        print(f" ✓ Loaded latest complete seasons for {len(latest_harvests)} fields")
        return latest_harvests
    except Exception as e:
        print(f"✗ Error loading harvest data: {e}")
        return {}
def _run_single_field(field_id, geojson_path, harvest_xlsx, output_dir, tiff_dir):
    """Validate one field: find its latest completed harvest and render its RGB grid."""
    print(f"\n[1/3] Loading harvest data for field {field_id}...")
    harvest_df = pd.read_excel(harvest_xlsx)
    harvest_df['season_end'] = pd.to_datetime(harvest_df['season_end'], errors='coerce')
    harvest_df['field'] = harvest_df['field'].astype(str).str.strip()
    field_records = harvest_df[harvest_df['field'] == field_id]
    field_records = field_records[field_records['season_end'].notna()]
    if len(field_records) == 0:
        print(f"✗ No harvest data found for field {field_id}")
        return
    # Most recent completed season wins.
    latest_idx = field_records['season_end'].idxmax()
    harvest_date = field_records.loc[latest_idx]['season_end']
    print(f" ✓ Found harvest: {harvest_date.strftime('%Y-%m-%d')}")
    # Resolve the display name from the GeoJSON properties.
    print(f"\n[2/3] Loading field name from GeoJSON...")
    with open(geojson_path) as f:
        geojson_data = json.load(f)
    field_name = f"field_{field_id}"
    for feature in geojson_data.get('features', []):
        props = feature.get('properties', {})
        if str(props.get('field', '')) == field_id:
            field_name = props.get('name', field_name)
            break
    print(f" ✓ Field name: {field_name}")
    print(f"\n[3/3] Generating RGB validation grid (v3 dynamic)...")
    results = generate_rgb_grids(
        field_data=None,
        field_id=field_id,
        registered_harvest_dates=[],
        predicted_harvest_dates=[
            {
                'harvest_date': harvest_date,
                'model_name': 'harvest_xlsx'
            }
        ],
        output_dir=str(output_dir),
        tiff_dir=str(tiff_dir),
        geojson_path=str(geojson_path)
    )
    print("\n" + "="*90)
    if results['predicted']:
        print(f"✓ RGB grid generated successfully!")
        print(f" Field: {field_name} (ID: {field_id})")
        print(f" Harvest date: {harvest_date.strftime('%Y-%m-%d')}")
        print(f" Output: {output_dir}")
    else:
        print(f"⚠ No RGB grid generated (no imagery available)")
    print("="*90)

def _run_batch(geojson_path, harvest_xlsx, output_dir, tiff_dir):
    """Validate the 100 largest fields: one RGB grid per latest completed harvest."""
    print(f"\n[1/4] Loading GeoJSON and identifying top 100 largest fields...")
    fields_df = load_geojson_and_calculate_areas(geojson_path)
    if fields_df is None:
        return
    top_100_fields = fields_df.head(100)
    print(f" ✓ Selected {len(top_100_fields)} largest fields for processing")
    # Step 2: Load harvest dates from Excel
    print("\n[2/4] Loading harvest dates from Excel (latest complete seasons)...")
    harvest_dates = load_harvest_dates_from_xlsx(harvest_xlsx, top_100_fields)
    if len(harvest_dates) == 0:
        print("✗ No harvest dates found in Excel file")
        return
    print(f" ✓ Found {len(harvest_dates)} fields with completed seasons")
    for field_id, info in list(harvest_dates.items())[:5]:
        print(f" - {info['field_name']:30s}: {info['harvest_date'].strftime('%Y-%m-%d')}")
    if len(harvest_dates) > 5:
        print(f" ... and {len(harvest_dates) - 5} more")
    # Step 3: Generate RGB grids for each field
    print("\n[3/4] Generating RGB validation grids (v3 dynamic)...")
    rgb_count = 0
    for idx, (field_id, harvest_info) in enumerate(harvest_dates.items(), 1):
        field_name = harvest_info['field_name']
        harvest_date = harvest_info['harvest_date']
        try:
            # Run RGB visualization (harvest dates only, no registered/predicted distinction)
            results = generate_rgb_grids(
                field_data=None,  # Not needed - just for function compatibility
                field_id=field_id,
                registered_harvest_dates=[],  # Empty - using harvest.xlsx instead
                predicted_harvest_dates=[
                    {
                        'harvest_date': harvest_date,
                        'model_name': 'harvest_xlsx'
                    }
                ],
                output_dir=str(output_dir),  # All PNGs in same folder
                tiff_dir=str(tiff_dir),
                geojson_path=str(geojson_path)
            )
            if results['predicted']:
                rgb_count += 1
                print(f" [{idx:2d}/{len(harvest_dates)}] {field_name}: ✓ {harvest_date.strftime('%Y-%m-%d')}")
            else:
                print(f" [{idx:2d}/{len(harvest_dates)}] {field_name}: ⚠ No RGB grid (no imagery available)")
        except Exception as e:
            # Keep going: one field failing must not abort the batch.
            print(f" [{idx:2d}/{len(harvest_dates)}] {field_name}: ✗ Error - {e}")
    # Summary
    print("\n" + "="*90)
    print(f"SUMMARY:")
    print(f" Fields with harvest dates: {len(harvest_dates)}")
    print(f" RGB grids generated: {rgb_count}/{len(harvest_dates)}")
    print(f" Output directory: {output_dir}")
    print("="*90)
    print("\nVisual inspection checklist:")
    print(" ✓ Brown/bare soil at T~0d (harvest date) = Field properly harvested")
    print(" ⚠ Green vegetation at T~0d = Possible data error or replanting")
    print(" ✓ Green → Brown progression = Normal harvest sequence")
    print("="*90)

def main():
    """
    CLI entry point for RGB harvest-date validation (v3, dynamic selection).

    Modes:
    - --field <ID>: validate a single field's latest completed harvest.
    - default: validate the 100 largest fields from the GeoJSON.

    Reads pivot.geojson and harvest.xlsx under the project's storage folder
    and writes 5x3 RGB grids to the project's RGB directory.
    """
    parser = argparse.ArgumentParser(description='RGB validation of harvest dates using satellite imagery (v3 - dynamic)')
    parser.add_argument('--field', type=str, default=None, help='Specific field ID to validate (e.g., "1" or "10022")')
    parser.add_argument('--project', type=str, default='angata', help='Project name (default: angata)')
    args = parser.parse_args()
    print("="*90)
    if args.field:
        print(f"RGB VALIDATION V3 - SINGLE FIELD: {args.field}")
    else:
        # Fixed from "TOP 50": the batch path below selects the top 100 fields.
        print("RGB VALIDATION V3 - TOP 100 LARGEST FIELDS")
    print("Visual inspection of harvest dates from harvest.xlsx using RGB imagery (dynamic selection)")
    print("="*90)
    # Project-relative input/output paths.
    project = args.project
    geojson_path = Path(f"laravel_app/storage/app/{project}/Data/pivot.geojson")
    harvest_xlsx = Path(f"laravel_app/storage/app/{project}/Data/harvest.xlsx")
    output_dir = Path(f"laravel_app/storage/app/{project}/RGB")
    tiff_dir = Path(f"laravel_app/storage/app/{project}/merged_final_tif/5x5")
    # Verify required inputs exist before doing any work.
    if not geojson_path.exists():
        print(f"✗ GeoJSON not found: {geojson_path}")
        return
    if not harvest_xlsx.exists():
        print(f"✗ Harvest Excel not found: {harvest_xlsx}")
        return
    if not tiff_dir.exists():
        print(f"✗ TIFF directory not found: {tiff_dir}")
        return
    output_dir.mkdir(parents=True, exist_ok=True)
    if args.field:
        _run_single_field(args.field, geojson_path, harvest_xlsx, output_dir, tiff_dir)
    else:
        _run_batch(geojson_path, harvest_xlsx, output_dir, tiff_dir)
# Script entry point: run the CLI only when executed directly, not on import.
if __name__ == "__main__":
    main()