# SmartCane/python_app/batch_rgb_validation_top_fields.py
# (289 lines, 10 KiB, Python — file-viewer metadata retained from the original export)
#!/usr/bin/env python
"""
Batch RGB Validation for Top 50 Largest Fields
Generates 5x3 RGB temporal grids for the latest complete harvest season of the 50 largest fields.
Uses actual season_end dates from harvest.xlsx for visual validation of field conditions at harvest.
Configuration:
- GeoJSON: pivot.geojson (defines field boundaries and sizes)
- Harvest data: harvest.xlsx (season_end dates for completed harvests)
- CI data: ci_data_for_python.csv
- Output: RGB directory with field_name_YYYYMMDD_harvest_rgb.png
Usage:
python batch_rgb_validation_top_fields.py
Output:
- Saves 5x3 RGB grids to: laravel_app/storage/app/angata/RGB/
- Filenames: field_<NAME>_<YYYYMMDD>_harvest_rgb.png
- Each grid shows 15 images at 7-day intervals around the season_end date
"""
import json
import numpy as np
import pandas as pd
from pathlib import Path
from datetime import datetime, timedelta
import sys
# Add parent directory to path for imports
sys.path.insert(0, str(Path(__file__).parent))
from rgb_visualization import generate_rgb_grids
def load_geojson_and_calculate_areas(geojson_path):
    """
    Load a GeoJSON file and compute the footprint area of every field feature.

    Args:
        geojson_path (str | Path): Path to the pivot.geojson file defining
            field boundaries.

    Returns:
        pd.DataFrame | None: Columns [field, field_name, area_m2, area_hectares]
            sorted by area descending, or None when the file is missing or no
            feature has a positive area.
    """
    geojson_path = Path(geojson_path)
    if not geojson_path.exists():
        print(f"✗ GeoJSON not found: {geojson_path}")
        return None
    print(f"Loading GeoJSON: {geojson_path}")
    with open(geojson_path) as f:
        geojson_data = json.load(f)
    fields = []
    for feature in geojson_data.get('features', []):
        props = feature.get('properties', {})
        field_id = str(props.get('field', ''))
        field_name = props.get('name', f"field_{field_id}")
        geometry = feature.get('geometry', {})
        geom_type = geometry.get('type', '')
        coordinates = geometry.get('coordinates', [])
        # Planar (Shoelace) approximation is adequate: fields are tiny
        # relative to the Earth's radius.
        area_m2 = 0
        if geom_type == 'Polygon' and coordinates:
            # Exterior ring only; interior holes are ignored.
            area_m2 = calculate_polygon_area(coordinates[0])
        elif geom_type == 'MultiPolygon' and coordinates:
            for poly_coords in coordinates:
                area_m2 += calculate_polygon_area(poly_coords[0])
        if area_m2 > 0:
            fields.append({
                'field': field_id,
                'field_name': field_name,
                'area_m2': area_m2,
                'area_hectares': area_m2 / 10000
            })
    if not fields:
        # Guard: sort_values('area_m2') would raise KeyError on an empty
        # DataFrame that never acquired the column.
        print("✗ No fields with a positive area found in GeoJSON")
        return None
    df = pd.DataFrame(fields)
    df = df.sort_values('area_m2', ascending=False)
    print(f" ✓ Loaded {len(df)} fields")
    print(f" Top 10 largest fields (hectares):")
    for i, row in df.head(10).iterrows():
        print(f" {row['field_name']:30s} ({row['field']:>6s}): {row['area_hectares']:>8.2f} ha")
    return df
def calculate_polygon_area(coords):
    """
    Approximate the area of a polygon ring in m² via the Shoelace formula.

    Vertices are GeoJSON positions — (lon, lat) or (lon, lat, altitude) in
    degrees; any altitude component is ignored. Degrees are converted to
    metres with an equirectangular approximation around the ring's mean
    latitude, which is accurate enough for field-sized polygons.

    Args:
        coords (list): Ring vertices; a repeated closing vertex is harmless.

    Returns:
        float: Absolute planar area in square metres (0 for degenerate rings).
    """
    if len(coords) < 3:
        return 0
    # 1 degree of latitude ≈ 111 km everywhere; a degree of longitude
    # shrinks with cos(latitude).
    lats = [c[1] for c in coords]
    avg_lat = np.mean(lats)
    lat_m_per_deg = 111000
    lon_m_per_deg = 111000 * np.cos(np.radians(avg_lat))
    # Project to a local metric frame anchored at the first vertex.
    # Index components instead of tuple-unpacking: GeoJSON positions may
    # carry a third (altitude) element, which would break `for lon, lat in`.
    origin_lon = coords[0][0]
    origin_lat = coords[0][1]
    coords_m = [
        ((c[0] - origin_lon) * lon_m_per_deg, (c[1] - origin_lat) * lat_m_per_deg)
        for c in coords
    ]
    # Shoelace formula; a duplicated closing vertex contributes zero area.
    area = 0.0
    n = len(coords_m)
    for i in range(n):
        j = (i + 1) % n
        area += coords_m[i][0] * coords_m[j][1]
        area -= coords_m[j][0] * coords_m[i][1]
    return abs(area) / 2
def load_harvest_dates_from_xlsx(harvest_xlsx_path, top_50_fields_df):
    """
    Read harvest.xlsx and pick the most recent completed season per field.

    Only rows whose field id appears in `top_50_fields_df` and whose
    `season_end` is filled in are considered; for each field the row with
    the latest `season_end` wins.

    Args:
        harvest_xlsx_path (Path): Path to harvest.xlsx.
        top_50_fields_df (pd.DataFrame): Must have 'field' and 'field_name'
            columns; used both to filter and to resolve display names.

    Returns:
        dict: {field_id: {'field_name': str, 'harvest_date': pd.Timestamp}},
            empty on any failure (missing file or read/parse error).
    """
    xlsx_path = Path(harvest_xlsx_path)
    if not xlsx_path.exists():
        print(f"✗ Harvest Excel file not found: {xlsx_path}")
        return {}
    print(f"Loading harvest data: {xlsx_path}")
    try:
        records = pd.read_excel(xlsx_path)
        # Coerce season_end to datetime; unparseable cells become NaT.
        if 'season_end' in records.columns:
            records['season_end'] = pd.to_datetime(records['season_end'], errors='coerce')
        wanted_ids = set(top_50_fields_df['field'].astype(str).str.strip())
        records['field'] = records['field'].astype(str).str.strip()
        # Keep only top-50 fields with a completed (non-null) season_end.
        keep = records['field'].isin(wanted_ids) & records['season_end'].notna()
        records = records[keep]
        # Display-name lookup; first occurrence wins, mirroring iloc[0].
        name_lookup = {}
        for _, info_row in top_50_fields_df.iterrows():
            name_lookup.setdefault(info_row['field'], info_row['field_name'])
        latest_harvests = {}
        for fid in wanted_ids:
            subset = records[records['field'] == fid]
            if subset.empty:
                continue
            # Row carrying the most recent season_end for this field.
            newest = subset.loc[subset['season_end'].idxmax()]
            latest_harvests[fid] = {
                'field_name': name_lookup.get(fid, f"field_{fid}"),
                'harvest_date': newest['season_end']
            }
        print(f" ✓ Loaded latest complete seasons for {len(latest_harvests)} fields")
        return latest_harvests
    except Exception as e:
        # Best-effort loader: report and fall back to "no data".
        print(f"✗ Error loading harvest data: {e}")
        return {}
def main():
    """Entry point: build RGB validation grids for the 50 largest fields.

    Ranks fields by GeoJSON footprint area, looks up each field's latest
    completed season_end in harvest.xlsx, and writes one 5x3 RGB temporal
    grid per field into the Laravel storage RGB folder.
    """
    rule = "=" * 90
    print(rule)
    print("BATCH RGB VALIDATION - TOP 50 LARGEST FIELDS")
    print("Visual inspection of latest harvest dates from harvest.xlsx using RGB imagery")
    print(rule)
    # Fixed project layout under the Laravel storage tree.
    geojson_file = Path("laravel_app/storage/app/angata/Data/pivot.geojson")
    harvest_file = Path("laravel_app/storage/app/angata/Data/harvest.xlsx")
    rgb_output_dir = Path("laravel_app/storage/app/angata/RGB")
    tiff_source_dir = Path("laravel_app/storage/app/angata/merged_final_tif/5x5")
    # Bail out early if any required input is missing.
    if not geojson_file.exists():
        print(f"✗ GeoJSON not found: {geojson_file}")
        return
    if not harvest_file.exists():
        print(f"✗ Harvest Excel not found: {harvest_file}")
        return
    if not tiff_source_dir.exists():
        print(f"✗ TIFF directory not found: {tiff_source_dir}")
        return
    rgb_output_dir.mkdir(parents=True, exist_ok=True)

    # Step 1: rank fields by area and keep the 50 largest.
    print("\n[1/4] Loading GeoJSON and identifying top 50 largest fields...")
    fields_df = load_geojson_and_calculate_areas(geojson_file)
    if fields_df is None:
        return
    top_fields = fields_df.head(50)
    print(f" ✓ Selected {len(top_fields)} largest fields for processing")

    # Step 2: latest completed season_end per field from the Excel sheet.
    print("\n[2/4] Loading harvest dates from Excel (latest complete seasons)...")
    harvest_dates = load_harvest_dates_from_xlsx(harvest_file, top_fields)
    if not harvest_dates:
        print("✗ No harvest dates found in Excel file")
        return
    print(f" ✓ Found {len(harvest_dates)} fields with completed seasons")
    preview = list(harvest_dates.items())
    for field_id, info in preview[:5]:
        print(f" - {info['field_name']:30s}: {info['harvest_date'].strftime('%Y-%m-%d')}")
    if len(harvest_dates) > 5:
        print(f" ... and {len(harvest_dates) - 5} more")

    # Step 3: one RGB grid per field, centred on its season_end date.
    print("\n[3/4] Generating RGB validation grids...")
    rgb_count = 0
    total = len(harvest_dates)
    for idx, (field_id, harvest_info) in enumerate(harvest_dates.items(), 1):
        field_name = harvest_info['field_name']
        harvest_date = harvest_info['harvest_date']
        try:
            # Harvest dates only — no registered/predicted distinction here;
            # the xlsx date rides in the "predicted" slot for compatibility.
            results = generate_rgb_grids(
                field_data=None,  # not needed - just for function compatibility
                field_id=field_id,
                registered_harvest_dates=[],  # empty - using harvest.xlsx instead
                predicted_harvest_dates=[
                    {
                        'harvest_date': harvest_date,
                        'model_name': 'harvest_xlsx'
                    }
                ],
                output_dir=str(rgb_output_dir),  # all PNGs in the same folder
                tiff_dir=str(tiff_source_dir),
                geojson_path=str(geojson_file)
            )
            if results['predicted']:
                rgb_count += 1
                print(f" [{idx:2d}/{total}] {field_name}: ✓ {harvest_date.strftime('%Y-%m-%d')}")
            else:
                print(f" [{idx:2d}/{total}] {field_name}: ⚠ No RGB grid (no imagery available)")
        except Exception as e:
            # One bad field must not abort the whole batch.
            print(f" [{idx:2d}/{total}] {field_name}: ✗ Error - {e}")

    # Final summary plus a short manual-inspection guide.
    print("\n" + rule)
    print(f"SUMMARY:")
    print(f" Fields with harvest dates: {len(harvest_dates)}")
    print(f" RGB grids generated: {rgb_count}/{len(harvest_dates)}")
    print(f" Output directory: {rgb_output_dir}")
    print(rule)
    print("\nVisual inspection checklist:")
    print(" ✓ Brown/bare soil at T~0d (harvest date) = Field properly harvested")
    print(" ⚠ Green vegetation at T~0d = Possible data error or replanting")
    print(" ✓ Green → Brown progression = Normal harvest sequence")
    print(rule)


if __name__ == "__main__":
    main()