# SmartCane/python_app/batch_rgb_validation_top_fields.py
# (289 lines, 10 KiB, Python — file-viewer metadata retained from the original export)
#!/usr/bin/env python
"""
Batch RGB Validation for Top 50 Largest Fields
Generates 5x3 RGB temporal grids for the latest complete harvest season of the 50 largest fields.
Uses actual season_end dates from harvest.xlsx for visual validation of field conditions at harvest.
Configuration:
- GeoJSON: pivot.geojson (defines field boundaries and sizes)
- Harvest data: harvest.xlsx (season_end dates for completed harvests)
- CI data: ci_data_for_python.csv
- Output: RGB directory with field_name_YYYYMMDD_harvest_rgb.png
Usage:
python batch_rgb_validation_top_fields.py
Output:
- Saves 5x3 RGB grids to: laravel_app/storage/app/angata/RGB/
- Filenames: field_<NAME>_<YYYYMMDD>_harvest_rgb.png
- Each grid shows 15 images at 7-day intervals around the season_end date
"""
import json
import numpy as np
import pandas as pd
from pathlib import Path
from datetime import datetime, timedelta
import sys
# Add parent directory to path for imports
sys.path.insert(0, str(Path(__file__).parent))
from rgb_visualization import generate_rgb_grids
def load_geojson_and_calculate_areas(geojson_path):
    """
    Load a GeoJSON file and compute the footprint area of every field feature.

    Args:
        geojson_path (str | Path): Path to the pivot.geojson file defining
            field boundaries.

    Returns:
        pd.DataFrame | None: Columns [field, field_name, area_m2, area_hectares]
            sorted by area descending, or None when the file is missing or no
            feature has a positive area.
    """
    geojson_path = Path(geojson_path)
    if not geojson_path.exists():
        print(f"✗ GeoJSON not found: {geojson_path}")
        return None
    print(f"Loading GeoJSON: {geojson_path}")
    with open(geojson_path) as f:
        geojson_data = json.load(f)
    fields = []
    for feature in geojson_data.get('features', []):
        props = feature.get('properties', {})
        field_id = str(props.get('field', ''))
        field_name = props.get('name', f"field_{field_id}")
        geometry = feature.get('geometry', {})
        geom_type = geometry.get('type', '')
        coordinates = geometry.get('coordinates', [])
        # Planar (Shoelace) approximation is adequate: fields are tiny
        # relative to the Earth's radius.
        area_m2 = 0
        if geom_type == 'Polygon' and coordinates:
            # Exterior ring only; interior holes are ignored.
            area_m2 = calculate_polygon_area(coordinates[0])
        elif geom_type == 'MultiPolygon' and coordinates:
            for poly_coords in coordinates:
                area_m2 += calculate_polygon_area(poly_coords[0])
        if area_m2 > 0:
            fields.append({
                'field': field_id,
                'field_name': field_name,
                'area_m2': area_m2,
                'area_hectares': area_m2 / 10000
            })
    if not fields:
        # Guard: sort_values('area_m2') would raise KeyError on an empty
        # DataFrame that never acquired the column.
        print("✗ No fields with a positive area found in GeoJSON")
        return None
    df = pd.DataFrame(fields)
    df = df.sort_values('area_m2', ascending=False)
    print(f" ✓ Loaded {len(df)} fields")
    print(f" Top 10 largest fields (hectares):")
    for i, row in df.head(10).iterrows():
        print(f" {row['field_name']:30s} ({row['field']:>6s}): {row['area_hectares']:>8.2f} ha")
    return df
def calculate_polygon_area(coords):
    """
    Approximate the area of a polygon ring in m² via the Shoelace formula.

    Vertices are GeoJSON positions — (lon, lat) or (lon, lat, altitude) in
    degrees; any altitude component is ignored. Degrees are converted to
    metres with an equirectangular approximation around the ring's mean
    latitude, which is accurate enough for field-sized polygons.

    Args:
        coords (list): Ring vertices; a repeated closing vertex is harmless.

    Returns:
        float: Absolute planar area in square metres (0 for degenerate rings).
    """
    if len(coords) < 3:
        return 0
    # 1 degree of latitude ≈ 111 km everywhere; a degree of longitude
    # shrinks with cos(latitude).
    lats = [c[1] for c in coords]
    avg_lat = np.mean(lats)
    lat_m_per_deg = 111000
    lon_m_per_deg = 111000 * np.cos(np.radians(avg_lat))
    # Project to a local metric frame anchored at the first vertex.
    # Index components instead of tuple-unpacking: GeoJSON positions may
    # carry a third (altitude) element, which would break `for lon, lat in`.
    origin_lon = coords[0][0]
    origin_lat = coords[0][1]
    coords_m = [
        ((c[0] - origin_lon) * lon_m_per_deg, (c[1] - origin_lat) * lat_m_per_deg)
        for c in coords
    ]
    # Shoelace formula; a duplicated closing vertex contributes zero area.
    area = 0.0
    n = len(coords_m)
    for i in range(n):
        j = (i + 1) % n
        area += coords_m[i][0] * coords_m[j][1]
        area -= coords_m[j][0] * coords_m[i][1]
    return abs(area) / 2
def load_harvest_dates_from_xlsx(harvest_xlsx_path, top_50_fields_df):
    """
    Read harvest.xlsx and pick the most recent completed season per field.

    Only rows whose field id appears in `top_50_fields_df` and whose
    `season_end` is filled in are considered; for each field the row with
    the latest `season_end` wins.

    Args:
        harvest_xlsx_path (Path): Path to harvest.xlsx.
        top_50_fields_df (pd.DataFrame): Must have 'field' and 'field_name'
            columns; used both to filter and to resolve display names.

    Returns:
        dict: {field_id: {'field_name': str, 'harvest_date': pd.Timestamp}},
            empty on any failure (missing file or read/parse error).
    """
    xlsx_path = Path(harvest_xlsx_path)
    if not xlsx_path.exists():
        print(f"✗ Harvest Excel file not found: {xlsx_path}")
        return {}
    print(f"Loading harvest data: {xlsx_path}")
    try:
        records = pd.read_excel(xlsx_path)
        # Coerce season_end to datetime; unparseable cells become NaT.
        if 'season_end' in records.columns:
            records['season_end'] = pd.to_datetime(records['season_end'], errors='coerce')
        wanted_ids = set(top_50_fields_df['field'].astype(str).str.strip())
        records['field'] = records['field'].astype(str).str.strip()
        # Keep only top-50 fields with a completed (non-null) season_end.
        keep = records['field'].isin(wanted_ids) & records['season_end'].notna()
        records = records[keep]
        # Display-name lookup; first occurrence wins, mirroring iloc[0].
        name_lookup = {}
        for _, info_row in top_50_fields_df.iterrows():
            name_lookup.setdefault(info_row['field'], info_row['field_name'])
        latest_harvests = {}
        for fid in wanted_ids:
            subset = records[records['field'] == fid]
            if subset.empty:
                continue
            # Row carrying the most recent season_end for this field.
            newest = subset.loc[subset['season_end'].idxmax()]
            latest_harvests[fid] = {
                'field_name': name_lookup.get(fid, f"field_{fid}"),
                'harvest_date': newest['season_end']
            }
        print(f" ✓ Loaded latest complete seasons for {len(latest_harvests)} fields")
        return latest_harvests
    except Exception as e:
        # Best-effort loader: report and fall back to "no data".
        print(f"✗ Error loading harvest data: {e}")
        return {}
def main():
    """Entry point: build RGB validation grids for the 50 largest fields.

    Ranks fields by GeoJSON footprint area, looks up each field's latest
    completed season_end in harvest.xlsx, and writes one 5x3 RGB temporal
    grid per field into the Laravel storage RGB folder.
    """
    rule = "=" * 90
    print(rule)
    print("BATCH RGB VALIDATION - TOP 50 LARGEST FIELDS")
    print("Visual inspection of latest harvest dates from harvest.xlsx using RGB imagery")
    print(rule)
    # Fixed project layout under the Laravel storage tree.
    geojson_file = Path("laravel_app/storage/app/angata/Data/pivot.geojson")
    harvest_file = Path("laravel_app/storage/app/angata/Data/harvest.xlsx")
    rgb_output_dir = Path("laravel_app/storage/app/angata/RGB")
    tiff_source_dir = Path("laravel_app/storage/app/angata/merged_final_tif/5x5")
    # Bail out early if any required input is missing.
    if not geojson_file.exists():
        print(f"✗ GeoJSON not found: {geojson_file}")
        return
    if not harvest_file.exists():
        print(f"✗ Harvest Excel not found: {harvest_file}")
        return
    if not tiff_source_dir.exists():
        print(f"✗ TIFF directory not found: {tiff_source_dir}")
        return
    rgb_output_dir.mkdir(parents=True, exist_ok=True)

    # Step 1: rank fields by area and keep the 50 largest.
    print("\n[1/4] Loading GeoJSON and identifying top 50 largest fields...")
    fields_df = load_geojson_and_calculate_areas(geojson_file)
    if fields_df is None:
        return
    top_fields = fields_df.head(50)
    print(f" ✓ Selected {len(top_fields)} largest fields for processing")

    # Step 2: latest completed season_end per field from the Excel sheet.
    print("\n[2/4] Loading harvest dates from Excel (latest complete seasons)...")
    harvest_dates = load_harvest_dates_from_xlsx(harvest_file, top_fields)
    if not harvest_dates:
        print("✗ No harvest dates found in Excel file")
        return
    print(f" ✓ Found {len(harvest_dates)} fields with completed seasons")
    preview = list(harvest_dates.items())
    for field_id, info in preview[:5]:
        print(f" - {info['field_name']:30s}: {info['harvest_date'].strftime('%Y-%m-%d')}")
    if len(harvest_dates) > 5:
        print(f" ... and {len(harvest_dates) - 5} more")

    # Step 3: one RGB grid per field, centred on its season_end date.
    print("\n[3/4] Generating RGB validation grids...")
    rgb_count = 0
    total = len(harvest_dates)
    for idx, (field_id, harvest_info) in enumerate(harvest_dates.items(), 1):
        field_name = harvest_info['field_name']
        harvest_date = harvest_info['harvest_date']
        try:
            # Harvest dates only — no registered/predicted distinction here;
            # the xlsx date rides in the "predicted" slot for compatibility.
            results = generate_rgb_grids(
                field_data=None,  # not needed - just for function compatibility
                field_id=field_id,
                registered_harvest_dates=[],  # empty - using harvest.xlsx instead
                predicted_harvest_dates=[
                    {
                        'harvest_date': harvest_date,
                        'model_name': 'harvest_xlsx'
                    }
                ],
                output_dir=str(rgb_output_dir),  # all PNGs in the same folder
                tiff_dir=str(tiff_source_dir),
                geojson_path=str(geojson_file)
            )
            if results['predicted']:
                rgb_count += 1
                print(f" [{idx:2d}/{total}] {field_name}: ✓ {harvest_date.strftime('%Y-%m-%d')}")
            else:
                print(f" [{idx:2d}/{total}] {field_name}: ⚠ No RGB grid (no imagery available)")
        except Exception as e:
            # One bad field must not abort the whole batch.
            print(f" [{idx:2d}/{total}] {field_name}: ✗ Error - {e}")

    # Final summary plus a short manual-inspection guide.
    print("\n" + rule)
    print(f"SUMMARY:")
    print(f" Fields with harvest dates: {len(harvest_dates)}")
    print(f" RGB grids generated: {rgb_count}/{len(harvest_dates)}")
    print(f" Output directory: {rgb_output_dir}")
    print(rule)
    print("\nVisual inspection checklist:")
    print(" ✓ Brown/bare soil at T~0d (harvest date) = Field properly harvested")
    print(" ⚠ Green vegetation at T~0d = Possible data error or replanting")
    print(" ✓ Green → Brown progression = Normal harvest sequence")
    print(rule)


if __name__ == "__main__":
    main()