# Source: SmartCane/python_app/batch_rgb_validation_top_fields_v3.py
#!/usr/bin/env python
"""
Batch RGB Validation for Top 100 Largest Fields - V3
Same as v1 but with dynamic image selection (checks for actual data, skips empty/black images).
Generates 5x3 RGB temporal grids for the latest complete harvest season of the 100 largest fields.
Uses actual season_end dates from harvest.xlsx for visual validation of field conditions at harvest.
Configuration:
- GeoJSON: pivot.geojson (defines field boundaries and sizes)
- Harvest data: harvest.xlsx (season_end dates for completed harvests)
- Output: RGB directory with field_name_YYYYMMDD_harvest_rgb.png
Usage:
python batch_rgb_validation_top_fields_v3.py --field 1
python batch_rgb_validation_top_fields_v3.py
Output:
- Saves 5x3 RGB grids to: laravel_app/storage/app/angata/RGB/
- Filenames: field_<NAME>_<YYYYMMDD>_harvest_rgb.png
- Each grid shows 15 images around the harvest date (dynamic date selection, skips empty images)
"""
import json
import numpy as np
import pandas as pd
from pathlib import Path
from datetime import datetime, timedelta
import sys
import argparse
# Add parent directory to path for imports
sys.path.insert(0, str(Path(__file__).parent))
from rgb_visualization import generate_rgb_grids
def load_geojson_and_calculate_areas(geojson_path):
    """
    Load GeoJSON field boundaries and compute each field's approximate area.

    Areas come from calculate_polygon_area (Shoelace formula on lat/lon
    coordinates roughly projected to metres); fields with zero/invalid area
    are dropped.

    Args:
        geojson_path: Path to the pivot.geojson file.

    Returns:
        pd.DataFrame: Columns [field, field_name, area_m2, area_hectares]
        sorted by area_m2 descending, or None if the file is missing or
        no field has a positive area.
    """
    geojson_path = Path(geojson_path)
    if not geojson_path.exists():
        print(f"✗ GeoJSON not found: {geojson_path}")
        return None
    print(f"Loading GeoJSON: {geojson_path}")
    with open(geojson_path) as f:
        geojson_data = json.load(f)
    fields = []
    for feature in geojson_data.get('features', []):
        props = feature.get('properties', {})
        field_id = str(props.get('field', ''))
        field_name = props.get('name', f"field_{field_id}")
        geometry = feature.get('geometry', {})
        geom_type = geometry.get('type', '')
        coordinates = geometry.get('coordinates', [])
        # Simple area calculation using Shoelace formula
        area_m2 = 0
        if geom_type == 'Polygon' and coordinates:
            coords = coordinates[0]  # Exterior ring (holes are ignored)
            area_m2 = calculate_polygon_area(coords)
        elif geom_type == 'MultiPolygon' and coordinates:
            for poly_coords in coordinates:
                area_m2 += calculate_polygon_area(poly_coords[0])
        if area_m2 > 0:
            fields.append({
                'field': field_id,
                'field_name': field_name,
                'area_m2': area_m2,
                'area_hectares': area_m2 / 10000
            })
    # Guard: pd.DataFrame([]) has no 'area_m2' column, so sort_values would
    # raise KeyError on an empty feature set. Honor the None-on-failure
    # contract instead.
    if not fields:
        print("✗ No fields with positive area found in GeoJSON")
        return None
    df = pd.DataFrame(fields)
    df = df.sort_values('area_m2', ascending=False)
    print(f" ✓ Loaded {len(df)} fields")
    print(f" Top 10 largest fields (hectares):")
    for i, row in df.head(10).iterrows():
        print(f" {row['field_name']:30s} ({row['field']:>6s}): {row['area_hectares']:>8.2f} ha")
    return df
def calculate_polygon_area(coords):
    """
    Approximate the area of a lat/lon polygon ring in square metres.

    The ring is projected onto a local tangent plane (degrees scaled to
    metres at the ring's mean latitude), then the Shoelace formula is
    applied.

    Args:
        coords: Sequence of (lon, lat) pairs forming the ring.

    Returns:
        Absolute area in m²; 0 for degenerate rings (fewer than 3 points).
    """
    if len(coords) < 3:
        return 0
    # Rough conversion: one degree of latitude ≈ 111 km everywhere; one
    # degree of longitude shrinks with cos(latitude) of the ring.
    mean_lat = np.mean([pt[1] for pt in coords])
    m_per_deg_lat = 111000
    m_per_deg_lon = 111000 * np.cos(np.radians(mean_lat))
    # Project to metres relative to the first vertex.
    origin_lon, origin_lat = coords[0][0], coords[0][1]
    xy = [((lon - origin_lon) * m_per_deg_lon,
           (lat - origin_lat) * m_per_deg_lat)
          for lon, lat in coords]
    # Shoelace formula over consecutive vertex pairs, wrapping around.
    twice_area = 0.0
    for (x1, y1), (x2, y2) in zip(xy, xy[1:] + xy[:1]):
        twice_area += x1 * y2 - x2 * y1
    return abs(twice_area) / 2
def load_harvest_dates_from_xlsx(harvest_xlsx_path, top_50_fields_df):
    """
    Load harvest data from Excel and pick the latest completed season per field.

    Only rows with a non-null season_end count as completed; for each field
    the most recent season_end wins.

    Args:
        harvest_xlsx_path (Path): Path to harvest.xlsx.
        top_50_fields_df (pd.DataFrame): Fields to consider; must contain a
            'field' column ('field_name' is used for display when present).

    Returns:
        dict: {field_id: {'field_name': str, 'harvest_date': pd.Timestamp}}.
        Empty dict on any failure (missing file, missing columns, read error).
    """
    harvest_xlsx_path = Path(harvest_xlsx_path)
    if not harvest_xlsx_path.exists():
        print(f"✗ Harvest Excel file not found: {harvest_xlsx_path}")
        return {}
    print(f"Loading harvest data: {harvest_xlsx_path}")
    try:
        harvest_df = pd.read_excel(harvest_xlsx_path)
        # Fail fast with a clear message: previously a missing 'season_end'
        # column bypassed the dtype guard and surfaced as an opaque
        # KeyError('season_end') swallowed by the except below.
        missing = {'field', 'season_end'} - set(harvest_df.columns)
        if missing:
            print(f"✗ Harvest Excel is missing required columns: {sorted(missing)}")
            return {}
        # Ensure date column is datetime; unparseable values become NaT.
        harvest_df['season_end'] = pd.to_datetime(harvest_df['season_end'], errors='coerce')
        # Filter to the requested fields and keep only completed seasons.
        top_50_field_ids = set(top_50_fields_df['field'].astype(str).str.strip())
        harvest_df['field'] = harvest_df['field'].astype(str).str.strip()
        harvest_df = harvest_df[harvest_df['field'].isin(top_50_field_ids)]
        harvest_df = harvest_df[harvest_df['season_end'].notna()]
        # Group by field and get the LATEST (most recent) season_end
        latest_harvests = {}
        for field_id in top_50_field_ids:
            field_records = harvest_df[harvest_df['field'] == field_id]
            if len(field_records) > 0:
                # Row with the latest season_end for this field
                latest_idx = field_records['season_end'].idxmax()
                latest_row = field_records.loc[latest_idx]
                # Display name comes from the caller's DataFrame when available
                field_info = top_50_fields_df[top_50_fields_df['field'] == field_id]
                if len(field_info) > 0:
                    field_name = field_info.iloc[0]['field_name']
                else:
                    field_name = f"field_{field_id}"
                latest_harvests[field_id] = {
                    'field_name': field_name,
                    'harvest_date': latest_row['season_end']
                }
        print(f" ✓ Loaded latest complete seasons for {len(latest_harvests)} fields")
        return latest_harvests
    except Exception as e:
        print(f"✗ Error loading harvest data: {e}")
        return {}
def _run_single_field(field_id, geojson_path, harvest_xlsx, output_dir, tiff_dir):
    """Validate one field: find its latest completed harvest and render its RGB grid."""
    print(f"\n[1/3] Loading harvest data for field {field_id}...")
    harvest_df = pd.read_excel(harvest_xlsx)
    harvest_df['season_end'] = pd.to_datetime(harvest_df['season_end'], errors='coerce')
    harvest_df['field'] = harvest_df['field'].astype(str).str.strip()
    field_records = harvest_df[harvest_df['field'] == field_id]
    field_records = field_records[field_records['season_end'].notna()]
    if len(field_records) == 0:
        print(f"✗ No harvest data found for field {field_id}")
        return
    # Most recent completed season wins.
    latest_idx = field_records['season_end'].idxmax()
    harvest_date = field_records.loc[latest_idx]['season_end']
    print(f" ✓ Found harvest: {harvest_date.strftime('%Y-%m-%d')}")
    # Resolve the display name from the GeoJSON properties.
    print(f"\n[2/3] Loading field name from GeoJSON...")
    with open(geojson_path) as f:
        geojson_data = json.load(f)
    field_name = f"field_{field_id}"
    for feature in geojson_data.get('features', []):
        props = feature.get('properties', {})
        if str(props.get('field', '')) == field_id:
            field_name = props.get('name', field_name)
            break
    print(f" ✓ Field name: {field_name}")
    print(f"\n[3/3] Generating RGB validation grid (v3 dynamic)...")
    results = generate_rgb_grids(
        field_data=None,
        field_id=field_id,
        registered_harvest_dates=[],
        predicted_harvest_dates=[
            {
                'harvest_date': harvest_date,
                'model_name': 'harvest_xlsx'
            }
        ],
        output_dir=str(output_dir),
        tiff_dir=str(tiff_dir),
        geojson_path=str(geojson_path)
    )
    print("\n" + "="*90)
    if results['predicted']:
        print(f"✓ RGB grid generated successfully!")
        print(f" Field: {field_name} (ID: {field_id})")
        print(f" Harvest date: {harvest_date.strftime('%Y-%m-%d')}")
        print(f" Output: {output_dir}")
    else:
        print(f"⚠ No RGB grid generated (no imagery available)")
    print("="*90)

def _run_batch(geojson_path, harvest_xlsx, output_dir, tiff_dir):
    """Validate the 100 largest fields: one RGB grid per latest completed harvest."""
    print(f"\n[1/4] Loading GeoJSON and identifying top 100 largest fields...")
    fields_df = load_geojson_and_calculate_areas(geojson_path)
    if fields_df is None:
        return
    top_100_fields = fields_df.head(100)
    print(f" ✓ Selected {len(top_100_fields)} largest fields for processing")
    # Step 2: Load harvest dates from Excel
    print("\n[2/4] Loading harvest dates from Excel (latest complete seasons)...")
    harvest_dates = load_harvest_dates_from_xlsx(harvest_xlsx, top_100_fields)
    if len(harvest_dates) == 0:
        print("✗ No harvest dates found in Excel file")
        return
    print(f" ✓ Found {len(harvest_dates)} fields with completed seasons")
    for field_id, info in list(harvest_dates.items())[:5]:
        print(f" - {info['field_name']:30s}: {info['harvest_date'].strftime('%Y-%m-%d')}")
    if len(harvest_dates) > 5:
        print(f" ... and {len(harvest_dates) - 5} more")
    # Step 3: Generate RGB grids for each field
    print("\n[3/4] Generating RGB validation grids (v3 dynamic)...")
    rgb_count = 0
    for idx, (field_id, harvest_info) in enumerate(harvest_dates.items(), 1):
        field_name = harvest_info['field_name']
        harvest_date = harvest_info['harvest_date']
        try:
            # Run RGB visualization (harvest dates only, no registered/predicted distinction)
            results = generate_rgb_grids(
                field_data=None,  # Not needed - just for function compatibility
                field_id=field_id,
                registered_harvest_dates=[],  # Empty - using harvest.xlsx instead
                predicted_harvest_dates=[
                    {
                        'harvest_date': harvest_date,
                        'model_name': 'harvest_xlsx'
                    }
                ],
                output_dir=str(output_dir),  # All PNGs in same folder
                tiff_dir=str(tiff_dir),
                geojson_path=str(geojson_path)
            )
            if results['predicted']:
                rgb_count += 1
                print(f" [{idx:2d}/{len(harvest_dates)}] {field_name}: ✓ {harvest_date.strftime('%Y-%m-%d')}")
            else:
                print(f" [{idx:2d}/{len(harvest_dates)}] {field_name}: ⚠ No RGB grid (no imagery available)")
        except Exception as e:
            # Keep going: one field failing must not abort the batch.
            print(f" [{idx:2d}/{len(harvest_dates)}] {field_name}: ✗ Error - {e}")
    # Summary
    print("\n" + "="*90)
    print(f"SUMMARY:")
    print(f" Fields with harvest dates: {len(harvest_dates)}")
    print(f" RGB grids generated: {rgb_count}/{len(harvest_dates)}")
    print(f" Output directory: {output_dir}")
    print("="*90)
    print("\nVisual inspection checklist:")
    print(" ✓ Brown/bare soil at T~0d (harvest date) = Field properly harvested")
    print(" ⚠ Green vegetation at T~0d = Possible data error or replanting")
    print(" ✓ Green → Brown progression = Normal harvest sequence")
    print("="*90)

def main():
    """
    CLI entry point for RGB harvest-date validation (v3, dynamic selection).

    Modes:
    - --field <ID>: validate a single field's latest completed harvest.
    - default: validate the 100 largest fields from the GeoJSON.

    Reads pivot.geojson and harvest.xlsx under the project's storage folder
    and writes 5x3 RGB grids to the project's RGB directory.
    """
    parser = argparse.ArgumentParser(description='RGB validation of harvest dates using satellite imagery (v3 - dynamic)')
    parser.add_argument('--field', type=str, default=None, help='Specific field ID to validate (e.g., "1" or "10022")')
    parser.add_argument('--project', type=str, default='angata', help='Project name (default: angata)')
    args = parser.parse_args()
    print("="*90)
    if args.field:
        print(f"RGB VALIDATION V3 - SINGLE FIELD: {args.field}")
    else:
        # Fixed from "TOP 50": the batch path below selects the top 100 fields.
        print("RGB VALIDATION V3 - TOP 100 LARGEST FIELDS")
    print("Visual inspection of harvest dates from harvest.xlsx using RGB imagery (dynamic selection)")
    print("="*90)
    # Project-relative input/output paths.
    project = args.project
    geojson_path = Path(f"laravel_app/storage/app/{project}/Data/pivot.geojson")
    harvest_xlsx = Path(f"laravel_app/storage/app/{project}/Data/harvest.xlsx")
    output_dir = Path(f"laravel_app/storage/app/{project}/RGB")
    tiff_dir = Path(f"laravel_app/storage/app/{project}/merged_final_tif/5x5")
    # Verify required inputs exist before doing any work.
    if not geojson_path.exists():
        print(f"✗ GeoJSON not found: {geojson_path}")
        return
    if not harvest_xlsx.exists():
        print(f"✗ Harvest Excel not found: {harvest_xlsx}")
        return
    if not tiff_dir.exists():
        print(f"✗ TIFF directory not found: {tiff_dir}")
        return
    output_dir.mkdir(parents=True, exist_ok=True)
    if args.field:
        _run_single_field(args.field, geojson_path, harvest_xlsx, output_dir, tiff_dir)
    else:
        _run_batch(geojson_path, harvest_xlsx, output_dir, tiff_dir)
# Script entry point: run the CLI only when executed directly, not on import.
if __name__ == "__main__":
    main()