#!/usr/bin/env python
|
|
"""
|
|
Batch RGB Validation for Top 50 Largest Fields
|
|
|
|
Generates 5x3 RGB temporal grids for the latest complete harvest season of the 50 largest fields.
|
|
Uses actual season_end dates from harvest.xlsx for visual validation of field conditions at harvest.
|
|
|
|
Configuration:
|
|
- GeoJSON: pivot.geojson (defines field boundaries and sizes)
|
|
- Harvest data: harvest.xlsx (season_end dates for completed harvests)
|
|
- CI data: ci_data_for_python.csv
|
|
- Output: RGB directory with field_name_YYYYMMDD_harvest_rgb.png
|
|
|
|
Usage:
|
|
python batch_rgb_validation_top_fields.py
|
|
|
|
Output:
|
|
- Saves 5x3 RGB grids to: laravel_app/storage/app/angata/RGB/
|
|
- Filenames: field_<NAME>_<YYYYMMDD>_harvest_rgb.png
|
|
- Each grid shows 15 images at 7-day intervals around the season_end date
|
|
"""
|
|
|
|
import json
|
|
import numpy as np
|
|
import pandas as pd
|
|
from pathlib import Path
|
|
from datetime import datetime, timedelta
|
|
import sys
|
|
|
|
# Add parent directory to path for imports
|
|
sys.path.insert(0, str(Path(__file__).parent))
|
|
|
|
from rgb_visualization import generate_rgb_grids
|
|
|
|
|
|
def load_geojson_and_calculate_areas(geojson_path):
    """
    Load a GeoJSON file and compute the approximate area of every field feature.

    Area is computed with the planar Shoelace formula on lon/lat coordinates
    roughly converted to meters (see calculate_polygon_area). Polygon features
    use the exterior ring only; MultiPolygon features sum the exterior rings
    of all their parts.

    Args:
        geojson_path (str | Path): Path to the GeoJSON file of field boundaries.

    Returns:
        pd.DataFrame | None: Columns [field, field_name, area_m2, area_hectares]
            sorted by area_m2 descending (possibly empty), or None when the
            file does not exist.
    """
    geojson_path = Path(geojson_path)

    if not geojson_path.exists():
        print(f"✗ GeoJSON not found: {geojson_path}")
        return None

    print(f"Loading GeoJSON: {geojson_path}")

    with open(geojson_path) as f:
        geojson_data = json.load(f)

    fields = []

    for feature in geojson_data.get('features', []):
        props = feature.get('properties', {})
        field_id = str(props.get('field', ''))
        field_name = props.get('name', f"field_{field_id}")

        geometry = feature.get('geometry', {})
        geom_type = geometry.get('type', '')
        coordinates = geometry.get('coordinates', [])

        # Simple area calculation using Shoelace formula
        area_m2 = 0
        if geom_type == 'Polygon' and coordinates:
            coords = coordinates[0]  # Exterior ring
            area_m2 = calculate_polygon_area(coords)
        elif geom_type == 'MultiPolygon' and coordinates:
            for poly_coords in coordinates:
                area_m2 += calculate_polygon_area(poly_coords[0])

        # Features with degenerate or unsupported geometry (area 0) are skipped.
        if area_m2 > 0:
            fields.append({
                'field': field_id,
                'field_name': field_name,
                'area_m2': area_m2,
                'area_hectares': area_m2 / 10000
            })

    # BUGFIX: with zero valid features, pd.DataFrame([]) has no 'area_m2'
    # column and sort_values() raised KeyError. Return an explicitly-typed
    # empty frame so callers always get the documented columns.
    if not fields:
        print(f"✗ No fields with valid geometry in {geojson_path}")
        return pd.DataFrame(columns=['field', 'field_name', 'area_m2', 'area_hectares'])

    df = pd.DataFrame(fields)
    df = df.sort_values('area_m2', ascending=False)

    print(f" ✓ Loaded {len(df)} fields")
    print(f" Top 10 largest fields (hectares):")
    for _, row in df.head(10).iterrows():
        print(f" {row['field_name']:30s} ({row['field']:>6s}): {row['area_hectares']:>8.2f} ha")

    return df
|
|
|
|
|
|
def calculate_polygon_area(coords):
    """
    Calculate the approximate area of a polygon ring in square meters.

    Applies the planar Shoelace formula after a rough equirectangular
    conversion of degrees to meters (1 deg latitude ≈ 111 km; longitude
    scaled by cos of the mean latitude). Good enough for ranking field
    sizes, not for survey-grade measurement.

    Args:
        coords: Sequence of [lon, lat] or [lon, lat, elevation] positions
            (GeoJSON positions may carry an optional third element).

    Returns:
        float: Absolute area in m²; 0 for degenerate rings (< 3 points).
    """
    if len(coords) < 3:
        return 0

    # Rough conversion: at equator, 1 degree ≈ 111 km
    # For lat/lon coordinates, use average latitude
    lats = [c[1] for c in coords]
    avg_lat = np.mean(lats)
    lat_m_per_deg = 111000
    lon_m_per_deg = 111000 * np.cos(np.radians(avg_lat))

    # Convert to meters relative to the first vertex.
    # BUGFIX: index positions instead of tuple-unpacking so GeoJSON points
    # with an optional elevation ([lon, lat, z]) no longer raise ValueError.
    lon0, lat0 = coords[0][0], coords[0][1]
    coords_m = []
    for point in coords:
        x = (point[0] - lon0) * lon_m_per_deg
        y = (point[1] - lat0) * lat_m_per_deg
        coords_m.append((x, y))

    # Shoelace formula
    area = 0
    for i in range(len(coords_m)):
        j = (i + 1) % len(coords_m)
        area += coords_m[i][0] * coords_m[j][1]
        area -= coords_m[j][0] * coords_m[i][1]

    return abs(area) / 2
|
|
|
|
|
|
def load_harvest_dates_from_xlsx(harvest_xlsx_path, top_50_fields_df):
    """
    Load harvest data from Excel and pick the latest completed season per field.

    Only rows whose field id appears in top_50_fields_df and whose season_end
    parses to a valid date are considered; for each field the most recent
    season_end wins.

    Args:
        harvest_xlsx_path (str | Path): Path to harvest.xlsx.
        top_50_fields_df (pd.DataFrame): Must contain 'field' and 'field_name'
            columns; only those fields are looked up.

    Returns:
        dict: {field_id: {'field_name': str, 'harvest_date': pd.Timestamp}},
        ordered like top_50_fields_df. Empty dict on any failure.
    """
    harvest_xlsx_path = Path(harvest_xlsx_path)

    if not harvest_xlsx_path.exists():
        print(f"✗ Harvest Excel file not found: {harvest_xlsx_path}")
        return {}

    print(f"Loading harvest data: {harvest_xlsx_path}")

    try:
        harvest_df = pd.read_excel(harvest_xlsx_path)

        # Without a season_end column there is nothing to do; report it
        # clearly instead of letting a KeyError surface as a generic error.
        if 'season_end' not in harvest_df.columns:
            print(f"✗ Harvest data has no 'season_end' column: {harvest_xlsx_path}")
            return {}

        # Unparseable dates become NaT and are dropped below
        harvest_df['season_end'] = pd.to_datetime(harvest_df['season_end'], errors='coerce')

        # Normalize ids on both sides, then keep only completed seasons
        # of the selected fields
        top_ids = top_50_fields_df['field'].astype(str).str.strip()
        name_by_id = dict(zip(top_ids, top_50_fields_df['field_name']))
        harvest_df['field'] = harvest_df['field'].astype(str).str.strip()
        harvest_df = harvest_df[harvest_df['field'].isin(set(top_ids))]
        harvest_df = harvest_df[harvest_df['season_end'].notna()]

        # Latest season_end per field in one pass (replaces the previous
        # per-field filtering loop, which was O(fields * rows))
        latest_end_by_field = harvest_df.groupby('field')['season_end'].max()

        # Iterate in top_50_fields_df order so the result (and any printed
        # sample) is deterministic, unlike the former set iteration.
        latest_harvests = {}
        for field_id in top_ids:
            if field_id in latest_end_by_field.index:
                latest_harvests[field_id] = {
                    'field_name': name_by_id.get(field_id, f"field_{field_id}"),
                    'harvest_date': latest_end_by_field[field_id]
                }

        print(f" ✓ Loaded latest complete seasons for {len(latest_harvests)} fields")

        return latest_harvests

    except Exception as e:
        # Best-effort: any read/parse failure is reported, not raised.
        print(f"✗ Error loading harvest data: {e}")
        return {}
|
|
|
|
|
|
def main():
    """
    Orchestrate the batch RGB validation run.

    Pipeline: load field boundaries from GeoJSON and rank by area, read the
    latest completed season_end per field from harvest.xlsx, then generate one
    RGB temporal grid per field via generate_rgb_grids. Progress and results
    are reported on stdout; missing inputs abort early with a message.
    """
    print("="*90)
    print("BATCH RGB VALIDATION - TOP 50 LARGEST FIELDS")
    print("Visual inspection of latest harvest dates from harvest.xlsx using RGB imagery")
    print("="*90)

    # Configuration
    # NOTE(review): paths are relative to the current working directory —
    # presumably the script is launched from the repository root; confirm.
    geojson_path = Path("laravel_app/storage/app/angata/Data/pivot.geojson")
    harvest_xlsx = Path("laravel_app/storage/app/angata/Data/harvest.xlsx")
    output_dir = Path("laravel_app/storage/app/angata/RGB")
    tiff_dir = Path("laravel_app/storage/app/angata/merged_final_tif/5x5")

    # Verify paths
    if not geojson_path.exists():
        print(f"✗ GeoJSON not found: {geojson_path}")
        return
    if not harvest_xlsx.exists():
        print(f"✗ Harvest Excel not found: {harvest_xlsx}")
        return
    if not tiff_dir.exists():
        print(f"✗ TIFF directory not found: {tiff_dir}")
        return

    # Output directory is created on demand (idempotent)
    output_dir.mkdir(parents=True, exist_ok=True)

    # Step 1: Load GeoJSON and get top 50 largest fields
    print("\n[1/4] Loading GeoJSON and identifying top 50 largest fields...")
    fields_df = load_geojson_and_calculate_areas(geojson_path)
    if fields_df is None:
        return

    # fields_df is sorted by area descending, so head(50) is the 50 largest
    top_50_fields = fields_df.head(50)
    print(f" ✓ Selected {len(top_50_fields)} largest fields for processing")

    # Step 2: Load harvest dates from Excel
    print("\n[2/4] Loading harvest dates from Excel (latest complete seasons)...")
    harvest_dates = load_harvest_dates_from_xlsx(harvest_xlsx, top_50_fields)

    if len(harvest_dates) == 0:
        print("✗ No harvest dates found in Excel file")
        return

    # Show a small sample (first 5) of the fields that will be processed
    print(f" ✓ Found {len(harvest_dates)} fields with completed seasons")
    for field_id, info in list(harvest_dates.items())[:5]:
        print(f" - {info['field_name']:30s}: {info['harvest_date'].strftime('%Y-%m-%d')}")
    if len(harvest_dates) > 5:
        print(f" ... and {len(harvest_dates) - 5} more")

    # Step 3: Generate RGB grids for each field
    print("\n[3/4] Generating RGB validation grids...")
    rgb_count = 0

    for idx, (field_id, harvest_info) in enumerate(harvest_dates.items(), 1):
        field_name = harvest_info['field_name']
        harvest_date = harvest_info['harvest_date']

        try:
            # Run RGB visualization (harvest dates only, no registered/predicted distinction)
            results = generate_rgb_grids(
                field_data=None,  # Not needed - just for function compatibility
                field_id=field_id,
                registered_harvest_dates=[],  # Empty - using harvest.xlsx instead
                predicted_harvest_dates=[
                    {
                        'harvest_date': harvest_date,
                        'model_name': 'harvest_xlsx'
                    }
                ],
                output_dir=str(output_dir),  # All PNGs in same folder
                tiff_dir=str(tiff_dir),
                geojson_path=str(geojson_path)
            )

            # 'predicted' holds the grids generated for the dates passed above;
            # empty means no imagery covered the requested window.
            if results['predicted']:
                rgb_count += 1
                print(f" [{idx:2d}/{len(harvest_dates)}] {field_name}: ✓ {harvest_date.strftime('%Y-%m-%d')}")
            else:
                print(f" [{idx:2d}/{len(harvest_dates)}] {field_name}: ⚠ No RGB grid (no imagery available)")

        except Exception as e:
            # Per-field failures are reported but do not abort the batch
            print(f" [{idx:2d}/{len(harvest_dates)}] {field_name}: ✗ Error - {e}")

    # Summary
    print("\n" + "="*90)
    print(f"SUMMARY:")
    print(f" Fields with harvest dates: {len(harvest_dates)}")
    print(f" RGB grids generated: {rgb_count}/{len(harvest_dates)}")
    print(f" Output directory: {output_dir}")
    print("="*90)
    print("\nVisual inspection checklist:")
    print(" ✓ Brown/bare soil at T~0d (harvest date) = Field properly harvested")
    print(" ⚠ Green vegetation at T~0d = Possible data error or replanting")
    print(" ✓ Green → Brown progression = Normal harvest sequence")
    print("="*90)
|
|
|
|
|
|
# Script entry point: run the full batch when executed directly.
if __name__ == "__main__":
    main()
|