Working on visualisation; not working well yet.

This commit is contained in:
Timon 2026-01-18 11:25:31 +01:00
parent 9f312131d7
commit c11b10a73f
9 changed files with 1182 additions and 171 deletions

View file

@ -7,18 +7,19 @@ is approaching harvest. Use this for operational decision-making and real-time a
RUN FREQUENCY: Weekly (or daily if required)
INPUT:
- ci_data_for_python.csv (recent CI data from 02b_convert_rds_to_csv.R)
- harvest.xlsx (baseline from scripts 22+23 - contains last harvest date per field)
Location: laravel_app/storage/app/{project}/Data/harvest.xlsx
- ci_data_for_python.csv (complete CI data from R script)
Location: laravel_app/storage/app/{project}/Data/extracted_ci/ci_data_for_python/ci_data_for_python.csv
- harvest_production_export.xlsx (baseline from script 01 - optional, for reference)
OUTPUT:
- harvest_imminent_weekly.csv (weekly probabilities: field, imminent_prob, detected_prob, week, year)
- reports/kpis/field_stats/{project}_harvest_imminent_week_{WW}_{YYYY}.csv (weekly probabilities: field, imminent_prob, detected_prob, week, year)
Workflow:
1. Load harvest_production_export.xlsx (baseline dates - optional, for context)
2. Load ci_data_for_python.csv (recent CI data)
3. For each field, extract last 300 days of history
4. Run Model 307 inference on full sequence (last timestep probabilities)
5. Export harvest_imminent_weekly.csv with probabilities
1. Load harvest.xlsx to find last harvest date (season_end) per field
2. Load ci_data_for_python.csv (complete CI data)
3. For each field, extract all CI data AFTER last harvest (complete current season)
4. Run Model 307 inference on full season sequence (last timestep probabilities)
5. Export week_WW_YYYY.csv with probabilities
Output Columns:
- field: Field ID
@ -61,33 +62,34 @@ from harvest_date_pred_utils import (
def load_harvest_dates(harvest_file):
"""Load latest harvest end dates from Excel file (from harvest_production_export.xlsx)."""
print("[1/5] Loading harvest dates...")
"""Load last harvest end dates from harvest.xlsx (output from scripts 22+23)."""
print("[1/5] Loading harvest data for season boundaries...")
if not Path(harvest_file).exists():
print(f" ERROR: {harvest_file} not found")
print(" Using 180-day lookback as default")
print(f" harvest.xlsx is required to determine current season boundaries")
return None
try:
harvest_df = pd.read_excel(harvest_file)
print(f" Loaded {len(harvest_df)} field-season records")
print(f" Loaded {len(harvest_df)} season records")
# Use season_end_date column (output from harvest prediction script)
harvest_df['season_end_date'] = pd.to_datetime(harvest_df['season_end_date'])
# season_end contains the last harvest date for each season
harvest_df['season_end'] = pd.to_datetime(harvest_df['season_end'])
harvest_df['field'] = harvest_df['field'].astype(str).str.strip()
# Group by field and get the latest season_end_date
# Group by field and get the LATEST season_end_date (most recent harvest)
# This marks the start of the current season
harvest_dates = {}
for field_id, group in harvest_df.groupby('field'):
latest_end = group['season_end_date'].max()
harvest_dates[str(field_id).strip()] = latest_end
latest_harvest = group['season_end'].max()
harvest_dates[field_id] = latest_harvest
print(f" Successfully mapped {len(harvest_dates)} fields")
print(f" Harvest end dates range: {min(harvest_dates.values()).date()} to {max(harvest_dates.values()).date()}")
print(f" Last harvest dates range: {min(harvest_dates.values()).date()} to {max(harvest_dates.values()).date()}")
return harvest_dates
except Exception as e:
print(f" ERROR loading harvest file: {e}")
print(f" Using 180-day lookback instead")
print(f" ERROR loading harvest.xlsx: {e}")
return None
@ -212,26 +214,37 @@ def main():
# Get project name from command line or use default
project_name = sys.argv[1] if len(sys.argv) > 1 else "angata"
# Construct paths
base_storage = Path("../laravel_app/storage/app") / project_name / "Data"
# Construct paths - work from either python_app/ or root smartcane/ directory
# Try root first (laravel_app/...), then fall back to ../ (running from python_app/)
if Path("laravel_app/storage/app").exists():
base_storage = Path("laravel_app/storage/app") / project_name / "Data"
else:
base_storage = Path("../laravel_app/storage/app") / project_name / "Data"
ci_data_dir = base_storage / "extracted_ci" / "ci_data_for_python"
CI_DATA_FILE = ci_data_dir / "ci_data_for_python.csv"
harvest_data_dir = base_storage / "HarvestData"
BASELINE_FILE = harvest_data_dir / "harvest_production_export.xlsx"
OUTPUT_CSV = harvest_data_dir / "harvest_imminent_weekly.csv"
harvest_data_dir.mkdir(parents=True, exist_ok=True) # Create if doesn't exist
HARVEST_FILE = base_storage / "harvest.xlsx" # Output from scripts 22+23
# Determine week and year from current date for timestamped export
today = datetime.now()
week_num = int(today.strftime('%V'))
year_num = int(today.strftime('%Y'))
# Output directory: reports/kpis/field_stats/
reports_dir = base_storage.parent / "reports" / "kpis" / "field_stats"
reports_dir.mkdir(parents=True, exist_ok=True)
OUTPUT_CSV = reports_dir / f"{project_name}_harvest_imminent_week_{week_num:02d}_{year_num}.csv"
print("="*80)
print(f"HARVEST IMMINENT PROBABILITY - WEEKLY MONITORING ({project_name})")
print("="*80)
# [1] Load harvest dates (optional - for projects with predictions)
harvest_dates = None
if BASELINE_FILE.exists():
harvest_dates = load_harvest_dates(BASELINE_FILE)
else:
print("[1/5] Loading harvest dates...")
print(f" INFO: {BASELINE_FILE} not found (optional for weekly monitoring)")
# [1] Load harvest dates (required to determine season boundaries)
harvest_dates = load_harvest_dates(HARVEST_FILE)
if harvest_dates is None or len(harvest_dates) == 0:
print(f"ERROR: Cannot run without harvest.xlsx - required to determine current season boundaries")
print(f" Please run scripts 22 (baseline prediction) and 23 (format conversion) first")
return
# [2] Load CI data
print(f"\n[2/5] Loading CI data...")
@ -271,6 +284,9 @@ def main():
count = 0
for field_id in ci_data['field'].unique():
# Convert field_id to string for consistency
field_id_str = str(field_id).strip()
# Get metadata
meta = field_meta[field_meta['field'] == field_id]
if len(meta) == 0:
@ -279,18 +295,21 @@ def main():
sub_field = meta['sub_field'].iloc[0]
latest_date = meta['latest_date'].iloc[0]
# Use recent CI history (last 300 days from latest available data)
# Get last harvest date for this field (start of current season)
last_harvest = harvest_dates.get(field_id_str)
if last_harvest is None:
continue
# Extract all CI data AFTER last harvest (complete current season)
field_data = ci_data[ci_data['field'] == field_id].copy()
field_data = field_data.sort_values('Date')
field_data = field_data[field_data['Date'] > last_harvest] # After last harvest
# Keep last 300 days of history for inference
if len(field_data) > 300:
field_data = field_data.iloc[-300:]
# Need at least 30 days of data since planting
if len(field_data) < 30:
continue
# Run inference on recent history to predict next 28 days
# Run inference on full current season to predict next 28 days
imminent_prob, detected_prob = run_inference_on_season(
field_data, model, config, scalers, device, ci_column
)
@ -338,10 +357,11 @@ def main():
print(f" WARNING: No results exported - check CI data availability")
print(f"\nStorage structure:")
print(f" Input CI: laravel_app/storage/app/{project_name}/Data/extracted_ci/ci_data_for_python/")
print(f" Input baseline: laravel_app/storage/app/{project_name}/Data/HarvestData/harvest_production_export.xlsx")
print(f" Output: laravel_app/storage/app/{project_name}/Data/HarvestData/")
print(f"\nReady to load into 09b field analysis report")
print(f" Input harvest: laravel_app/storage/app/{project_name}/Data/harvest.xlsx")
print(f" Input CI: laravel_app/storage/app/{project_name}/Data/extracted_ci/ci_data_for_python/")
print(f" Output: laravel_app/storage/app/{project_name}/reports/kpis/field_stats/")
print(f" Filename: {project_name}_harvest_imminent_week_{week_num:02d}_{year_num}.csv")
print(f"\nReady to load into 80_calculate_kpis.R")
if __name__ == "__main__":

View file

@ -0,0 +1,288 @@
#!/usr/bin/env python
"""
Batch RGB Validation for Top 50 Largest Fields
Generates 5x3 RGB temporal grids for the latest complete harvest season of the 50 largest fields.
Uses actual season_end dates from harvest.xlsx for visual validation of field conditions at harvest.
Configuration:
- GeoJSON: pivot.geojson (defines field boundaries and sizes)
- Harvest data: harvest.xlsx (season_end dates for completed harvests)
- CI data: ci_data_for_python.csv
- Output: RGB directory with field_name_YYYYMMDD_harvest_rgb.png
Usage:
python batch_rgb_validation_top_fields.py
Output:
- Saves 5x3 RGB grids to: laravel_app/storage/app/angata/RGB/
- Filenames: field_<NAME>_<YYYYMMDD>_harvest_rgb.png
- Each grid shows 15 images at 7-day intervals around the season_end date
"""
import json
import numpy as np
import pandas as pd
from pathlib import Path
from datetime import datetime, timedelta
import sys
# Add parent directory to path for imports
sys.path.insert(0, str(Path(__file__).parent))
from rgb_visualization import generate_rgb_grids
def load_geojson_and_calculate_areas(geojson_path):
    """
    Load a GeoJSON file and calculate an approximate area for each field.

    Args:
        geojson_path: Path to a GeoJSON file whose features carry 'field'
            and 'name' properties and Polygon/MultiPolygon geometry.

    Returns:
        pd.DataFrame: Columns [field, field_name, area_m2, area_hectares]
        sorted by area descending, or None if the file is missing or no
        feature yields a positive area.
    """
    geojson_path = Path(geojson_path)
    if not geojson_path.exists():
        print(f"✗ GeoJSON not found: {geojson_path}")
        return None
    print(f"Loading GeoJSON: {geojson_path}")
    with open(geojson_path) as f:
        geojson_data = json.load(f)
    fields = []
    for feature in geojson_data.get('features', []):
        props = feature.get('properties', {})
        field_id = str(props.get('field', ''))
        field_name = props.get('name', f"field_{field_id}")
        geometry = feature.get('geometry', {})
        geom_type = geometry.get('type', '')
        coordinates = geometry.get('coordinates', [])
        # Approximate area via the Shoelace formula on the exterior ring(s);
        # interior rings (holes) are ignored.
        area_m2 = 0
        if geom_type == 'Polygon' and coordinates:
            coords = coordinates[0]  # Exterior ring
            area_m2 = calculate_polygon_area(coords)
        elif geom_type == 'MultiPolygon' and coordinates:
            for poly_coords in coordinates:
                area_m2 += calculate_polygon_area(poly_coords[0])
        if area_m2 > 0:
            fields.append({
                'field': field_id,
                'field_name': field_name,
                'area_m2': area_m2,
                'area_hectares': area_m2 / 10000
            })
    # Guard: an empty frame has no 'area_m2' column, so sort_values would
    # raise KeyError. Treat "no usable fields" the same as a missing file.
    if not fields:
        print(f"✗ No fields with a positive area found in {geojson_path}")
        return None
    df = pd.DataFrame(fields)
    df = df.sort_values('area_m2', ascending=False)
    print(f" ✓ Loaded {len(df)} fields")
    print(f" Top 10 largest fields (hectares):")
    for _, row in df.head(10).iterrows():
        print(f" {row['field_name']:30s} ({row['field']:>6s}): {row['area_hectares']:>8.2f} ha")
    return df
def calculate_polygon_area(coords):
    """
    Calculate the area of a polygon ring using the Shoelace formula (in m²).

    Coordinates are GeoJSON positions [lon, lat] (an optional third
    altitude element is tolerated). Degrees are converted to metres with an
    equirectangular approximation: 1 degree latitude ≈ 111 km, longitude
    scaled by cos(average latitude).

    Args:
        coords: Sequence of coordinate positions forming the ring.

    Returns:
        float: Approximate area in square metres (0 for degenerate rings).
    """
    if len(coords) < 3:
        return 0
    # Rough conversion: at the equator, 1 degree ≈ 111 km.
    # Longitude spacing shrinks with latitude, so scale by cos(avg latitude).
    lats = [c[1] for c in coords]
    avg_lat = np.mean(lats)
    lat_m_per_deg = 111000
    lon_m_per_deg = 111000 * np.cos(np.radians(avg_lat))
    # Convert to metres relative to the first vertex (keeps values small,
    # which limits floating-point error in the cross products below).
    coords_m = []
    for pos in coords:
        lon, lat = pos[0], pos[1]  # tolerate [lon, lat, alt] positions
        x = (lon - coords[0][0]) * lon_m_per_deg
        y = (lat - coords[0][1]) * lat_m_per_deg
        coords_m.append((x, y))
    # Shoelace formula over the (implicitly closed) ring.
    area = 0
    for i in range(len(coords_m)):
        j = (i + 1) % len(coords_m)
        area += coords_m[i][0] * coords_m[j][1]
        area -= coords_m[j][0] * coords_m[i][1]
    return abs(area) / 2
def load_harvest_dates_from_xlsx(harvest_xlsx_path, top_50_fields_df):
    """
    Load harvest records from Excel and pick the latest completed season
    per field.

    A season counts as completed when its 'season_end' value is present.
    Only fields listed in top_50_fields_df are considered.

    Args:
        harvest_xlsx_path (Path): Path to harvest.xlsx.
        top_50_fields_df (pd.DataFrame): Must have 'field' (and ideally
            'field_name') columns used for filtering and naming.

    Returns:
        dict: {field_id: {'field_name': str, 'harvest_date': pd.Timestamp}},
        empty on any failure.
    """
    harvest_xlsx_path = Path(harvest_xlsx_path)
    if not harvest_xlsx_path.exists():
        print(f"✗ Harvest Excel file not found: {harvest_xlsx_path}")
        return {}
    print(f"Loading harvest data: {harvest_xlsx_path}")
    try:
        records = pd.read_excel(harvest_xlsx_path)
        # Coerce season_end to datetime when the column exists; unparseable
        # values become NaT and are dropped below.
        if 'season_end' in records.columns:
            records['season_end'] = pd.to_datetime(records['season_end'], errors='coerce')
        # Restrict to the requested fields and to completed seasons only.
        wanted_ids = set(top_50_fields_df['field'].astype(str).str.strip())
        records['field'] = records['field'].astype(str).str.strip()
        records = records[records['field'].isin(wanted_ids)]
        records = records[records['season_end'].notna()]
        latest_harvests = {}
        for fid, season_rows in records.groupby('field'):
            # Most recent completed season for this field.
            newest = season_rows.loc[season_rows['season_end'].idxmax()]
            # Prefer the display name from the caller's frame.
            name_rows = top_50_fields_df[top_50_fields_df['field'] == fid]
            display_name = name_rows.iloc[0]['field_name'] if len(name_rows) > 0 else f"field_{fid}"
            latest_harvests[fid] = {
                'field_name': display_name,
                'harvest_date': newest['season_end']
            }
        print(f" ✓ Loaded latest complete seasons for {len(latest_harvests)} fields")
        return latest_harvests
    except Exception as e:
        print(f"✗ Error loading harvest data: {e}")
        return {}
def main():
    """Generate RGB validation grids for the 50 largest fields.

    Workflow:
      1. Rank fields by approximate polygon area from the GeoJSON.
      2. Look up each field's latest completed season_end in harvest.xlsx.
      3. Render one RGB temporal grid per field via generate_rgb_grids().

    Paths are hard-coded for the 'angata' project, relative to the repo root.
    """
    print("="*90)
    print("BATCH RGB VALIDATION - TOP 50 LARGEST FIELDS")
    print("Visual inspection of latest harvest dates from harvest.xlsx using RGB imagery")
    print("="*90)
    # Configuration — fixed project paths
    geojson_path = Path("laravel_app/storage/app/angata/Data/pivot.geojson")
    harvest_xlsx = Path("laravel_app/storage/app/angata/Data/harvest.xlsx")
    output_dir = Path("laravel_app/storage/app/angata/RGB")
    tiff_dir = Path("laravel_app/storage/app/angata/merged_final_tif/5x5")
    # Verify inputs exist before doing any work
    if not geojson_path.exists():
        print(f"✗ GeoJSON not found: {geojson_path}")
        return
    if not harvest_xlsx.exists():
        print(f"✗ Harvest Excel not found: {harvest_xlsx}")
        return
    if not tiff_dir.exists():
        print(f"✗ TIFF directory not found: {tiff_dir}")
        return
    output_dir.mkdir(parents=True, exist_ok=True)
    # Step 1: Load GeoJSON and get top 50 largest fields
    print("\n[1/4] Loading GeoJSON and identifying top 50 largest fields...")
    fields_df = load_geojson_and_calculate_areas(geojson_path)
    if fields_df is None:
        return
    top_50_fields = fields_df.head(50)
    print(f" ✓ Selected {len(top_50_fields)} largest fields for processing")
    # Step 2: Load harvest dates from Excel
    print("\n[2/4] Loading harvest dates from Excel (latest complete seasons)...")
    harvest_dates = load_harvest_dates_from_xlsx(harvest_xlsx, top_50_fields)
    if len(harvest_dates) == 0:
        print("✗ No harvest dates found in Excel file")
        return
    print(f" ✓ Found {len(harvest_dates)} fields with completed seasons")
    # Preview the first few fields being processed
    for field_id, info in list(harvest_dates.items())[:5]:
        print(f" - {info['field_name']:30s}: {info['harvest_date'].strftime('%Y-%m-%d')}")
    if len(harvest_dates) > 5:
        print(f" ... and {len(harvest_dates) - 5} more")
    # Step 3: Generate RGB grids for each field
    print("\n[3/4] Generating RGB validation grids...")
    rgb_count = 0
    for idx, (field_id, harvest_info) in enumerate(harvest_dates.items(), 1):
        field_name = harvest_info['field_name']
        harvest_date = harvest_info['harvest_date']
        try:
            # Run RGB visualization (harvest dates only, no registered/predicted distinction).
            # NOTE(review): the harvest.xlsx date is fed through the "predicted"
            # channel with a placeholder model name — confirm generate_rgb_grids
            # renders it as intended.
            results = generate_rgb_grids(
                field_data=None,  # Not needed - just for function compatibility
                field_id=field_id,
                registered_harvest_dates=[],  # Empty - using harvest.xlsx instead
                predicted_harvest_dates=[
                    {
                        'harvest_date': harvest_date,
                        'model_name': 'harvest_xlsx'
                    }
                ],
                output_dir=str(output_dir),  # All PNGs in same folder
                tiff_dir=str(tiff_dir),
                geojson_path=str(geojson_path)
            )
            if results['predicted']:
                rgb_count += 1
                print(f" [{idx:2d}/{len(harvest_dates)}] {field_name}: ✓ {harvest_date.strftime('%Y-%m-%d')}")
            else:
                print(f" [{idx:2d}/{len(harvest_dates)}] {field_name}: ⚠ No RGB grid (no imagery available)")
        except Exception as e:
            # Keep going on per-field failures; report them in the log
            print(f" [{idx:2d}/{len(harvest_dates)}] {field_name}: ✗ Error - {e}")
    # Summary
    print("\n" + "="*90)
    print(f"SUMMARY:")
    print(f" Fields with harvest dates: {len(harvest_dates)}")
    print(f" RGB grids generated: {rgb_count}/{len(harvest_dates)}")
    print(f" Output directory: {output_dir}")
    print("="*90)
    print("\nVisual inspection checklist:")
    print(" ✓ Brown/bare soil at T~0d (harvest date) = Field properly harvested")
    print(" ⚠ Green vegetation at T~0d = Possible data error or replanting")
    print(" ✓ Green → Brown progression = Normal harvest sequence")
    print("="*90)
if __name__ == "__main__":
    main()

View file

@ -0,0 +1,107 @@
#!/usr/bin/env python
"""
Debug script to find all tiles for a date and check which overlap with field boundary.

Steps:
  1. Load the boundary polygon of one field (id "79") from pivot.geojson.
  2. Find all tile directories for a fixed target date.
  3. For each tile, report whether it intersects the boundary and attempt
     a rasterio mask, printing per-band non-zero pixel counts.
"""
import json
import rasterio
from rasterio.mask import mask
from pathlib import Path
import numpy as np
import shapely.geometry as shgeom
import pandas as pd

# Load field 79 boundary
geojson_path = Path("laravel_app/storage/app/angata/Data/pivot.geojson")
field_id = "79"
print(f"Loading field {field_id} from GeoJSON...")
with open(geojson_path) as f:
    geojson_data = json.load(f)
field_boundary = None
for feature in geojson_data.get('features', []):
    props = feature.get('properties', {})
    if str(props.get('field', '')) == str(field_id):
        geometry = feature.get('geometry')
        if geometry:
            geom_type = geometry.get('type', '')
            coordinates = geometry.get('coordinates', [])
            if geom_type == 'MultiPolygon':
                # Only the first polygon's exterior ring is used
                if coordinates and len(coordinates) > 0:
                    coords = coordinates[0][0]
                    field_boundary = shgeom.Polygon(coords)
            elif geom_type == 'Polygon':
                if coordinates and len(coordinates) > 0:
                    coords = coordinates[0]  # Exterior ring
                    field_boundary = shgeom.Polygon(coords)
        break
if field_boundary is None:
    print(f"Field {field_id} not found")
    exit(1)
print(f"Field boundary bounds: {field_boundary.bounds}")
# NOTE: area here is in squared degrees (lat/lon), not metres
print(f"Field boundary area: {field_boundary.area}")
# Find a specific date directory
tiff_dir = Path("laravel_app/storage/app/angata/merged_final_tif/5x5")
target_date = pd.Timestamp("2026-01-15")  # Use a recent date that exists
# Find tiles for that date (directory names start with YYYY-MM-DD)
date_dirs = []
for date_dir in tiff_dir.iterdir():
    if date_dir.is_dir():
        try:
            dir_name = date_dir.name
            date_str = dir_name.split('_')[0]
            tile_date = pd.Timestamp(date_str)
            if tile_date == target_date:
                date_dirs.append(date_dir)
        except (ValueError, TypeError):
            # Fix: was a bare `except:` which also swallowed KeyboardInterrupt
            # and SystemExit. pd.Timestamp raises ValueError/TypeError on
            # directory names that are not dates — skip those.
            pass
if not date_dirs:
    print(f"No tiles found for {target_date}")
    exit(1)
print(f"\nFound {len(date_dirs)} date directory(ies) for {target_date}")
for date_dir in date_dirs:
    print(f"\n=== Checking date directory: {date_dir.name} ===")
    tiles = list(date_dir.glob("*.tif"))
    print(f"Found {len(tiles)} tiles in this directory")
    for tile_path in sorted(tiles):
        try:
            with rasterio.open(tile_path) as src:
                tile_bounds = src.bounds
                tile_geom = shgeom.box(*tile_bounds)
                intersects = field_boundary.intersects(tile_geom)
                intersection = field_boundary.intersection(tile_geom) if intersects else None
                intersection_area = intersection.area if intersection else 0
                print(f"\n{tile_path.name}")
                print(f" Tile bounds: {tile_bounds}")
                print(f" Intersects field: {intersects}")
                if intersects:
                    print(f" Intersection area: {intersection_area:.8f}")
                    # Try to mask this tile: crop=True trims to the boundary
                    # bbox, indexes=[1, 2, 3] reads only the first three bands
                    geom = shgeom.mapping(field_boundary)
                    try:
                        masked_data, _ = mask(src, [geom], crop=True, indexes=[1, 2, 3])
                        print(f" ✓ Successfully masked! Shape: {masked_data.shape}")
                        # Check the data in each band
                        for i, band_idx in enumerate([1, 2, 3]):
                            band_data = masked_data[i]
                            non_zero = (band_data != 0).sum()
                            print(f" Band {band_idx}: {non_zero} non-zero pixels out of {band_data.size}")
                    except ValueError as e:
                        # rasterio raises ValueError when shapes don't overlap the raster
                        print(f" ✗ Masking failed: {e}")
        except Exception as e:
            print(f" Error reading tile: {e}")

View file

@ -0,0 +1,102 @@
#!/usr/bin/env python
"""
Debug script to diagnose why field boundary masking produces no data
"""
import json
import rasterio
from rasterio.mask import mask
from pathlib import Path
import numpy as np
import shapely.geometry as shgeom
# Load a sample field boundary
geojson_path = Path("laravel_app/storage/app/angata/Data/pivot.geojson")
field_id = "79" # A field that had issues
print(f"Loading field {field_id} from GeoJSON...")
with open(geojson_path) as f:
geojson_data = json.load(f)
field_boundary = None
for feature in geojson_data.get('features', []):
props = feature.get('properties', {})
if str(props.get('field', '')) == str(field_id):
geometry = feature.get('geometry')
if geometry:
geom_type = geometry.get('type', '')
coordinates = geometry.get('coordinates', [])
if geom_type == 'MultiPolygon':
if coordinates and len(coordinates) > 0:
coords = coordinates[0][0]
field_boundary = shgeom.Polygon(coords)
elif geom_type == 'Polygon':
if coordinates and len(coordinates) > 0:
coords = coordinates[0]
field_boundary = shgeom.Polygon(coords)
break
if field_boundary is None:
print(f"Field {field_id} not found")
exit(1)
print(f"Field boundary bounds: {field_boundary.bounds}")
print(f"Field boundary area: {field_boundary.area}")
# Load a sample TIFF tile
tiff_dir = Path("laravel_app/storage/app/angata/merged_final_tif/5x5")
tile_file = None
for date_dir in sorted(tiff_dir.iterdir()):
if date_dir.is_dir():
for tif in date_dir.glob("*.tif"):
if tif.stat().st_size > 12e6:
tile_file = tif
break
if tile_file:
break
if not tile_file:
print("No suitable TIFF found")
exit(1)
print(f"\nTesting with TIFF: {tile_file.name}")
with rasterio.open(tile_file) as src:
print(f"TIFF Bounds: {src.bounds}")
print(f"TIFF CRS: {src.crs}")
# Check if field boundary is within tile bounds
tile_box = shgeom.box(*src.bounds)
intersects = field_boundary.intersects(tile_box)
print(f"Field boundary intersects tile: {intersects}")
if intersects:
intersection = field_boundary.intersection(tile_box)
print(f"Intersection area: {intersection.area}")
print(f"Intersection bounds: {intersection.bounds}")
# Try to mask and see what we get
print("\nAttempting to mask...")
geom = shgeom.mapping(field_boundary)
try:
masked_data, _ = mask(src, [geom], crop=True, indexes=[1, 2, 3])
print(f"Masked data shape: {masked_data.shape}")
print(f"Masked data dtype: {masked_data.dtype}")
# Check the data
for i, band_idx in enumerate([1, 2, 3]):
band_data = masked_data[i]
print(f"\nBand {band_idx}:")
print(f" min: {np.nanmin(band_data):.6f}")
print(f" max: {np.nanmax(band_data):.6f}")
print(f" mean: {np.nanmean(band_data):.6f}")
print(f" % valid (non-zero): {(band_data != 0).sum() / band_data.size * 100:.2f}%")
print(f" % NaN: {np.isnan(band_data).sum() / band_data.size * 100:.2f}%")
# Show sample values
valid_pixels = band_data[band_data != 0]
if len(valid_pixels) > 0:
print(f" Sample valid values: {valid_pixels[:10]}")
except ValueError as e:
print(f"Error during masking: {e}")

View file

@ -0,0 +1,47 @@
#!/usr/bin/env python
"""
Debug script to inspect TIFF file structure and data
"""
import rasterio
from pathlib import Path
import numpy as np
# Pick a tile file to inspect
tiff_dir = Path("laravel_app/storage/app/angata/merged_final_tif/5x5")
# Find first available tile
tile_file = None
for date_dir in sorted(tiff_dir.iterdir()):
if date_dir.is_dir():
for tif in date_dir.glob("*.tif"):
if tif.stat().st_size > 12e6: # Skip empty files
tile_file = tif
break
if tile_file:
break
if not tile_file:
print("No suitable TIFF files found")
exit(1)
print(f"Inspecting: {tile_file.name}")
print("=" * 80)
with rasterio.open(tile_file) as src:
print(f"Band count: {src.count}")
print(f"Data type: {src.dtypes[0]}")
print(f"Shape: {src.height} x {src.width}")
print(f"CRS: {src.crs}")
print(f"Bounds: {src.bounds}")
print()
# Read each band
for band_idx in range(1, min(6, src.count + 1)):
data = src.read(band_idx)
print(f"Band {band_idx}:")
print(f" dtype: {data.dtype}")
print(f" range: {data.min():.6f} - {data.max():.6f}")
print(f" mean: {data.mean():.6f}")
print(f" % valid (non-zero): {(data != 0).sum() / data.size * 100:.1f}%")
print()

View file

@ -0,0 +1,299 @@
"""
Batch Field Visualization Tool - RGB Imagery Around Harvest Date
Purpose: Generate visual validation using RGB satellite imagery samples around
predicted harvest date to verify predictions (bare soil = harvested, green = not harvested)
Shows 12-15 RGB images in a grid, centered around the predicted harvest date
Usage:
python batch_plot_fields_rgb.py field1,field2,field3
python batch_plot_fields_rgb.py 10125,88,97
Or read from CSV:
python batch_plot_fields_rgb.py --file fields_to_check.csv
"""
import pandas as pd
import numpy as np
import torch
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from pathlib import Path
from datetime import datetime, timedelta
import sys
import rasterio
from rasterio.mask import mask
import geopandas as gpd
from harvest_date_pred_utils import load_model_and_config, extract_features
def get_field_centroid(field_id, geojson_path="pivot.geojson"):
    """Return the (x, y) centroid of a field's geometry from the GeoJSON.

    Args:
        field_id: Field identifier; compared as a string against the
            GeoJSON 'field' column.
        geojson_path: Path to the GeoJSON file of field boundaries.

    Returns:
        tuple | None: (x, y) centroid coordinates, or None when the field
        is absent or the file cannot be read (a warning is printed).
    """
    try:
        frame = gpd.read_file(geojson_path)
        matches = frame[frame['field'] == str(field_id)]
        if matches.empty:
            return None
        point = matches.geometry.iloc[0].centroid
        return (point.x, point.y)
    except Exception as e:
        print(f" Warning: Could not get field centroid - {e}")
        return None
def load_rgb_image(tif_path, field_id=None, geojson_path="pivot.geojson"):
    """
    Load an RGB composite from an 8-band GeoTIFF.

    Band layout (0-indexed): 0=coastal, 1=blue, 2=green, 3=green_i,
    4=yellow, 5=red, 6=rededge, 7=nir.
    RGB = 0-indexed bands 5, 2, 1 (red, green, blue), i.e. rasterio
    (1-indexed) bands 6, 3, 2.

    Args:
        tif_path: Path to the GeoTIFF file.
        field_id: Currently unused; accepted for call-site compatibility.
        geojson_path: Currently unused; accepted for call-site compatibility.

    Returns:
        np.ndarray of shape (H, W, 3) scaled to [0, 1], or None on error.
    """
    try:
        with rasterio.open(tif_path) as src:
            # Read RGB bands (bands are 1-indexed in rasterio)
            red = src.read(6)  # Band 6 = red (0-indexed band 5)
            green = src.read(3)  # Band 3 = green (0-indexed band 2)
            blue = src.read(2)  # Band 2 = blue (0-indexed band 1)
            # Stack into an (H, W, 3) RGB image
            rgb = np.stack([red, green, blue], axis=2)
            # Normalize to 0-1 range; dividing by 5000 brightens relative to
            # the typical 0-10000 reflectance range (values above 5000 clip)
            rgb = np.clip(rgb / 5000.0, 0, 1)
            return rgb
    except Exception as e:
        print(f" Error loading RGB from {tif_path}: {e}")
        return None
def plot_field_rgb_validation(field_id, ci_data, model, config, scalers, device,
                              tif_folder="../../../laravel_app/storage/app/angata/merged_tif_8b",
                              output_dir="validation_plots_rgb"):
    """
    Create a validation figure for a single field.

    Layout:
      - Top row: Model 307 imminent-harvest probability over time, with the
        peak (argmax) marked as the predicted harvest date.
      - Below: a grid of RGB images, one per available TIFF within ±6 days
        of the peak date.

    Args:
        field_id: Field identifier (string, matched against ci_data['field']).
        ci_data: DataFrame with at least 'field' and 'Date' columns.
        model, config, scalers, device: Loaded Model 307 artifacts.
        tif_folder: Directory of date-named GeoTIFFs (YYYY-MM-DD.tif).
        output_dir: Directory the PNG is written to (created if missing).

    Returns:
        bool: True if a plot was saved, False on any failure.
    """
    # Create output directory
    Path(output_dir).mkdir(parents=True, exist_ok=True)
    # Filter field data
    field_data = ci_data[ci_data['field'] == field_id].copy()
    if len(field_data) == 0:
        print(f" ✗ Field {field_id}: No CI data found")
        return False
    field_data = field_data.sort_values('Date')
    print(f" ✓ Field {field_id}: {len(field_data)} days of data")
    try:
        # Extract features and run inference
        ci_column = config['data']['ci_column']
        feature_names = config['features']
        feat_array = extract_features(field_data, feature_names, ci_column=ci_column)
        if feat_array is None:
            print(f" ✗ Field {field_id}: Feature extraction failed")
            return False
        # Apply scalers (same normalization as used at training time)
        if isinstance(scalers, dict) and 'features' in scalers:
            feat_array = scalers['features'].transform(feat_array)
        # Run inference over the full sequence
        with torch.no_grad():
            x_tensor = torch.tensor(feat_array, dtype=torch.float32).unsqueeze(0).to(device)
            # NOTE(review): assumes the model returns (imminent, detected)
            # head outputs in that order — confirm against the model definition
            out_imm, out_det = model(x_tensor)
            imm_probs = out_imm.squeeze(0).cpu().numpy()
        # Find peak probability (predicted harvest date)
        peak_idx = np.argmax(imm_probs)
        peak_date = field_data['Date'].iloc[peak_idx]
        peak_prob = imm_probs[peak_idx]
        print(f" Peak probability: {peak_prob:.3f} on {peak_date.strftime('%Y-%m-%d')}")
        # Get date range: ±6 days around peak (12-13 images total)
        date_range = field_data['Date'].dt.date  # NOTE(review): unused
        peak_date_only = peak_date.date() if hasattr(peak_date, 'date') else peak_date
        days_before = 6
        days_after = 6
        start_date = peak_date_only - timedelta(days=days_before)
        end_date = peak_date_only + timedelta(days=days_after)
        # Find available TIF files in date range (filenames are YYYY-MM-DD.tif)
        tif_folder_path = Path(tif_folder)
        available_dates = []
        for tif_file in sorted(tif_folder_path.glob("*.tif")):
            date_str = tif_file.stem  # YYYY-MM-DD
            try:
                tif_date = datetime.strptime(date_str, "%Y-%m-%d").date()
                if start_date <= tif_date <= end_date:
                    available_dates.append((tif_date, tif_file))
            except ValueError:
                pass  # skip files whose names are not dates
        if len(available_dates) == 0:
            print(f" Warning: No TIF files found in {start_date} to {end_date}")
            return False
        print(f" Found {len(available_dates)} RGB images in date range")
        # Load RGB images (full tiles; not cropped to the field boundary)
        rgb_images = []
        rgb_dates = []
        for tif_date, tif_file in available_dates:
            rgb = load_rgb_image(str(tif_file), field_id)
            if rgb is not None:
                rgb_images.append(rgb)
                rgb_dates.append(tif_date)
        if len(rgb_images) == 0:
            print(f" ✗ No RGB images loaded")
            return False
        print(f" Loaded {len(rgb_images)} RGB images")
        # Create figure with probability plot + RGB grid
        n_images = len(rgb_images)
        n_cols = min(5, n_images)  # Max 5 columns
        n_rows = (n_images + n_cols - 1) // n_cols  # Calculate rows needed
        fig = plt.figure(figsize=(18, 12))  # NOTE(review): fig is unused
        # Probability plot (top, spanning all columns of the first grid row)
        ax_prob = plt.subplot(n_rows + 1, n_cols, (1, n_cols))
        dates_arr = field_data['Date'].values
        ax_prob.plot(dates_arr, imm_probs, '-', color='orange', linewidth=2.5, label='Imminent Probability', alpha=0.8)
        ax_prob.axhline(y=0.5, color='red', linestyle='--', linewidth=1.5, alpha=0.5, label='Threshold (0.5)')
        ax_prob.axvline(x=peak_date, color='darkred', linestyle=':', linewidth=2, alpha=0.7, label='Peak')
        ax_prob.fill_between(dates_arr, 0.5, 1.0, alpha=0.08, color='red')
        ax_prob.set_ylim(-0.05, 1.05)
        ax_prob.set_ylabel('Probability', fontsize=11, fontweight='bold')
        ax_prob.set_xlabel('Date', fontsize=11, fontweight='bold')
        ax_prob.set_title(f'Field {field_id} - Model 307 Harvest Probability', fontsize=12, fontweight='bold')
        ax_prob.grid(True, alpha=0.3)
        ax_prob.legend(loc='upper right', fontsize=9)
        ax_prob.xaxis.set_major_locator(mdates.MonthLocator(interval=1))
        ax_prob.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m-%d'))
        plt.setp(ax_prob.xaxis.get_majorticklabels(), rotation=45, ha='right')
        # Annotate the peak value and its date on the curve
        ax_prob.annotate(f'{peak_prob:.2f}\n{peak_date_only}',
                         xy=(peak_date, peak_prob),
                         xytext=(20, 20), textcoords='offset points',
                         bbox=dict(boxstyle='round,pad=0.5', fc='yellow', alpha=0.8),
                         arrowprops=dict(arrowstyle='->', lw=1.5, color='darkred'))
        # RGB images in grid (below probability plot)
        for i, (rgb, date) in enumerate(zip(rgb_images, rgb_dates)):
            ax = plt.subplot(n_rows + 1, n_cols, n_cols + i + 1)
            ax.imshow(rgb, extent=[0, 100, 0, 100])
            # Highlight the peak date's tile in bold dark red
            date_label = date.strftime('%m-%d')
            is_peak = date == peak_date_only
            color = 'darkred' if is_peak else 'black'
            weight = 'bold' if is_peak else 'normal'
            size = 11 if is_peak else 9
            ax.set_title(date_label, fontsize=size, fontweight=weight, color=color)
            ax.set_xticks([])
            ax.set_yticks([])
        plt.suptitle(f'Field {field_id} RGB Imagery: {len(rgb_images)} Days Around Peak Harvest Probability\nPeak: {peak_prob:.2f} on {peak_date_only} | Green = Growing | Brown/Bare = Harvested',
                     fontsize=13, fontweight='bold', y=0.995)
        plt.tight_layout()
        # Save and close the figure to release memory in batch runs
        output_file = Path(output_dir) / f"field_{field_id}_rgb_validation.png"
        plt.savefig(output_file, dpi=100, bbox_inches='tight')
        print(f" ✓ Saved: {output_file}")
        plt.close()
        return True
    except Exception as e:
        print(f" ✗ Field {field_id}: Error - {e}")
        import traceback
        traceback.print_exc()
        return False
def main():
    """CLI entry point: render RGB validation plots for the requested fields.

    Accepts either a comma-separated field list or ``--file fields.csv``
    (CSV must have a ``field`` column), loads the CI data and Model 307,
    then writes one validation PNG per field via plot_field_rgb_validation().
    """
    banner = "=" * 80
    print(banner)
    print("BATCH RGB VISUALIZATION TOOL")
    print("Visual check: RGB imagery around predicted harvest date")
    print(banner)

    # ---- Parse command-line arguments ------------------------------------
    args = sys.argv[1:]
    if not args:
        print("\nUsage:")
        print(" python batch_plot_fields_rgb.py field1,field2,field3")
        print(" python batch_plot_fields_rgb.py --file fields.csv")
        print("\nExample:")
        print(" python batch_plot_fields_rgb.py 10125,88,97,440")
        return

    field_ids = []
    if args[0] == "--file":
        if len(args) < 2:
            print("ERROR: --file requires a CSV filename")
            return
        csv_path = args[1]
        print(f"\n[1/4] Loading fields from CSV: {csv_path}")
        try:
            field_df = pd.read_csv(csv_path)
            field_ids = field_df['field'].astype(str).str.strip().tolist()
            print(f" ✓ Loaded {len(field_ids)} fields")
        except Exception as exc:
            print(f" ✗ Error reading CSV: {exc}")
            return
    else:
        # Comma-separated list given directly on the command line.
        field_ids = [token.strip() for token in args[0].split(',')]
        print(f"\n[1/4] Processing {len(field_ids)} field(s): {', '.join(field_ids)}")

    # ---- Load CI observations (field ids normalized to stripped strings) --
    print("\n[2/4] Loading CI data...")
    try:
        observations = pd.read_csv("ci_data_for_python.csv")
        observations['Date'] = pd.to_datetime(observations['Date'])
        observations['field'] = observations['field'].astype(str).str.strip()
        print(f" ✓ Loaded {len(observations)} observations for {observations['field'].nunique()} fields")
    except Exception as exc:
        print(f" ✗ Error loading CI data: {exc}")
        return

    # ---- Load Model 307 (eval mode; device is passed to the plot helper) --
    print("\n[3/4] Loading model...")
    try:
        model, config, scalers = load_model_and_config(Path("."))
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        model.eval()
        print(f" ✓ Model loaded on {device}")
    except Exception as exc:
        print(f" ✗ Error loading model: {exc}")
        return

    # ---- Generate one validation plot per field ---------------------------
    print("\n[4/4] Generating RGB validation plots...")
    num_ok = sum(
        1
        for fid in field_ids
        if plot_field_rgb_validation(fid, observations, model, config, scalers, device)
    )

    # ---- Summary ----------------------------------------------------------
    print("\n" + banner)
    print(f"SUMMARY: {num_ok}/{len(field_ids)} fields processed successfully")
    print(f"Output directory: validation_plots_rgb/")
    print(banner)
    print("\nInspect the PNG files to verify predictions:")
    print(" ✓ Green imagery BEFORE peak date (field growing)")
    print(" ✓ Brown/Bare imagery AT/AFTER peak date (harvested)")
    print(" ✓ Peak date marked with red title")


if __name__ == "__main__":
    main()

View file

@ -22,6 +22,8 @@ import numpy as np
import pandas as pd
from pathlib import Path
from datetime import datetime, timedelta
import matplotlib
matplotlib.use('Agg') # Use non-interactive backend to avoid display hangs
import matplotlib.pyplot as plt
import matplotlib.patches as patches
from matplotlib.colors import Normalize
@ -86,55 +88,95 @@ def load_field_boundaries(geojson_path, field_id):
return None, None
def find_closest_tiff(target_date, tiff_dir, days_window=60, field_boundary=None):
def find_overlapping_tiles(target_date, tiff_dir, field_boundary, days_window=60):
"""
Find available TIFF file closest to target date.
Skips obviously empty files (< 12 MB) without reading data.
Find all tile files for target_date (or closest date) that overlap with field_boundary.
TIFF files are named: YYYY-MM-DD.tif
Tile files are organized in subdirectories by date: 5x5/YYYY-MM-DD_HH/*.tif
Args:
target_date (pd.Timestamp): Target date to find TIFF near
tiff_dir (Path): Directory containing TIFF files
target_date (pd.Timestamp): Target date to find tiles near
tiff_dir (Path): Directory containing 5x5 date subdirectories
field_boundary (shapely.Polygon): Field boundary for overlap detection
days_window (int): Max days to search before/after target
field_boundary (shapely.Polygon): Unused, kept for compatibility
Returns:
Path: Path to closest TIFF file or None if not found
int: Days difference from target (negative=before, positive=after)
tuple: (list of tile paths, actual_date, days_diff)
list: tile paths that overlap field
pd.Timestamp: actual date of tiles found
int: days difference from target to actual date found
"""
target_date = pd.Timestamp(target_date)
tiff_dir = Path(tiff_dir)
available_tiffs = list(tiff_dir.glob('*.tif'))
if not available_tiffs:
return None, None
if not tiff_dir.exists():
return [], None, None
# Parse dates from filenames, skip obviously empty files by size
tiff_dates = []
min_size_mb = 12.0 # Empty files are ~11.56 MB, valid files are ~12.6-13.0 MB
# Find all date subdirectories
available_dates = {} # {date: ([tile file paths], actual_dir_name)}
min_size_mb = 12.0 # Empty files are ~11.56 MB
for tiff_path in available_tiffs:
for date_dir in tiff_dir.iterdir():
if not date_dir.is_dir():
continue
try:
# Quick size check to skip obviously empty files
file_size_mb = tiff_path.stat().st_size / (1024 * 1024)
if file_size_mb < min_size_mb:
# Parse date from directory name (YYYY-MM-DD or YYYY-MM-DD_HH)
dir_name = date_dir.name
# Extract just the date part before underscore if it exists
date_str = dir_name.split('_')[0]
tile_date = pd.Timestamp(date_str)
days_diff = (tile_date - target_date).days
if abs(days_diff) > days_window:
continue
date_str = tiff_path.stem # Remove .tif extension
tiff_date = pd.Timestamp(date_str)
days_diff = (tiff_date - target_date).days
if abs(days_diff) <= days_window:
tiff_dates.append((tiff_path, tiff_date, days_diff))
# Find all .tif files in this date directory
tile_files = []
for tile_file in date_dir.glob('*.tif'):
# Skip obviously empty files
file_size_mb = tile_file.stat().st_size / (1024 * 1024)
if file_size_mb >= min_size_mb:
tile_files.append(tile_file)
if tile_files:
available_dates[tile_date] = (tile_files, dir_name)
except:
pass
if not tiff_dates:
return None, None
if not available_dates:
return [], None, None
# Return closest by distance
closest = min(tiff_dates, key=lambda x: abs(x[2]))
return closest[0], closest[2]
# Find closest date
closest_date = min(available_dates.keys(), key=lambda d: abs((d - target_date).days))
days_diff = (closest_date - target_date).days
tiles, _ = available_dates[closest_date]
# Filter tiles to only those that overlap field boundary
if rasterio is None or field_boundary is None:
# If rasterio not available, use all tiles (conservative approach)
return tiles, closest_date, days_diff
overlapping_tiles = []
for tile_path in tiles:
try:
with rasterio.open(tile_path) as src:
# Get tile bounds
tile_bounds = src.bounds # (left, bottom, right, top)
tile_geom = shgeom.box(*tile_bounds)
# Check if tile overlaps field
if tile_geom.intersects(field_boundary):
overlapping_tiles.append(tile_path)
except:
pass
if not overlapping_tiles:
# No overlapping tiles found, return all tiles for the closest date
return tiles, closest_date, days_diff
return overlapping_tiles, closest_date, days_diff
def load_and_clip_tiff_rgb(tiff_path, field_boundary, rgb_bands=(1, 2, 3)):
@ -148,12 +190,6 @@ def load_and_clip_tiff_rgb(tiff_path, field_boundary, rgb_bands=(1, 2, 3)):
- Band 4: NIR
- Band 5: CI
For older merged_tif_8b files (raw Planet Scope 8-band):
- Bands 1=Coastal Blue, 2=Blue, 3=Green, 4=Red, 5=Red Edge, 6=NIR, 7=SWIR1, 8=SWIR2
- RGB would be bands 4,3,2 = Red, Green, Blue
This function auto-detects the format based on band count and descriptions.
Args:
tiff_path (Path): Path to TIFF file
field_boundary (shapely.Polygon): Field boundary for clipping
@ -168,58 +204,106 @@ def load_and_clip_tiff_rgb(tiff_path, field_boundary, rgb_bands=(1, 2, 3)):
try:
with rasterio.open(tiff_path) as src:
# Check CRS compatibility
# Check band count
if src.count < 3:
print(f" ⚠ TIFF has only {src.count} bands (need at least 3 for RGB)")
return None
# Auto-detect format based on band count and descriptions
if src.count == 5 and hasattr(src, 'descriptions') and src.descriptions:
# merged_final_tif format: Red, Green, Blue, NIR, CI
bands_to_read = (1, 2, 3)
elif src.count == 9:
# merged_tif_8b format: use bands 4, 3, 2 for Red, Green, Blue
bands_to_read = (4, 3, 2)
else:
# Default: try to use first 3 bands or specified bands
bands_to_read = rgb_bands
print(f" Unknown TIFF format ({src.count} bands), using bands {bands_to_read}")
# For merged_final_tif: bands 1,2,3 are R,G,B
bands_to_read = (1, 2, 3)
# Mask and read bands (crop=True reads only the clipped window, not full resolution)
# Mask and read bands
geom = shgeom.mapping(field_boundary)
try:
# Read RGB bands at once, then mask (more efficient than masking 3 times)
masked_data, _ = mask(src, [geom], crop=True, indexes=list(bands_to_read))
# masked_data shape is (3, height, width) - one layer per band
rgb_data = []
for i, band_idx in enumerate(bands_to_read):
band_data = masked_data[i] # Extract the i-th band from masked stack
# Better debug output that handles NaN values
valid_data = band_data[~np.isnan(band_data)]
if len(valid_data) > 0:
print(f" DEBUG: Band {band_idx} valid data range: {valid_data.min():.4f} - {valid_data.max():.4f} ({len(valid_data)} valid pixels)")
else:
print(f" DEBUG: Band {band_idx} - all NaN!")
rgb_data.append(band_data)
# Stack RGB
rgb = np.stack(rgb_data, axis=-1)
rgb = np.stack([masked_data[i] for i in range(3)], axis=-1)
# Data is already normalized 0-1 from the merged_final_tif files
# Just ensure it's float32 and clipped
# Convert to float32 if not already
rgb = rgb.astype(np.float32)
# Normalize to 0-1 range
# Data appears to be 8-bit (0-255 range) stored as float32
# Check actual max value to determine normalization
max_val = np.nanmax(rgb)
if max_val > 0:
# If max is around 255 or less, assume 8-bit
if max_val <= 255:
rgb = rgb / 255.0
# If max is around 65535, assume 16-bit
elif max_val <= 65535:
rgb = rgb / 65535.0
# Otherwise divide by max to normalize
else:
rgb = rgb / max_val
rgb = np.clip(rgb, 0, 1)
print(f" DEBUG: Final RGB range: {rgb.min():.4f} - {rgb.max():.4f}")
# Check if result is all NaN
if np.all(np.isnan(rgb)):
return None
# Replace any remaining NaN with 0 (cloud/invalid pixels)
rgb = np.nan_to_num(rgb, nan=0.0)
return rgb
except ValueError as e:
print(f" ⚠ Error clipping to field boundary: {e}")
except ValueError:
return None
except Exception as e:
print(f" ✗ Error loading TIFF {tiff_path.name}: {e}")
return None
def load_and_composite_tiles_rgb(tile_paths, field_boundary):
    """
    Load RGB from multiple overlapping tiles and composite them into one image.

    Each tile is clipped to the field boundary via load_and_clip_tiff_rgb();
    overlapping tiles are merged with a per-pixel max composite, then a
    2nd-98th percentile contrast stretch is applied so dim scenes stay visible.

    Args:
        tile_paths (list[Path]): Tile file paths for a single acquisition date.
        field_boundary (shapely.Polygon): Field boundary used for clipping.

    Returns:
        np.ndarray: Composited RGB data (height, width, 3), float32 in [0, 1],
        or None when rasterio is unavailable, inputs are missing, or no tile
        yields valid data.
    """
    if rasterio is None or field_boundary is None or not tile_paths:
        return None
    try:
        # Clip each tile to the field; tiles with no valid pixels return None.
        rgb_arrays = [
            rgb
            for rgb in (load_and_clip_tiff_rgb(p, field_boundary) for p in tile_paths)
            if rgb is not None
        ]
        if not rgb_arrays:
            return None

        if len(rgb_arrays) == 1:
            composited = rgb_arrays[0].astype(np.float32)
        else:
            # Max composite: bright (valid) pixels win over the zero-filled
            # nodata regions each clipped tile carries outside its footprint.
            # NOTE(review): assumes all clipped tiles share the same shape;
            # np.stack raises otherwise and we fall through to the except.
            composited = np.max(np.stack(rgb_arrays, axis=0), axis=0).astype(np.float32)

        # Contrast stretch over non-zero pixels only, using the 2nd/98th
        # percentiles so a few outliers do not compress the dynamic range.
        valid = composited[composited > 0]
        if valid.size > 0:
            lo = np.percentile(valid, 2)
            hi = np.percentile(valid, 98)
            if hi > lo:
                composited = np.clip((composited - lo) / (hi - lo), 0, 1)
        return composited.astype(np.float32)
    except Exception:
        # Best-effort: a failed composite is treated the same as "no imagery".
        return None
@ -276,27 +360,24 @@ def create_temporal_rgb_grid(harvest_date, field_data, field_id, tiff_dir, field
actual_dates = [] # Store actual dates of TIFFs found
for target in target_dates:
tiff_path, days_diff = find_closest_tiff(target, tiff_dir, days_window=60, field_boundary=field_boundary)
tile_paths, actual_date, days_diff = find_overlapping_tiles(target, tiff_dir, field_boundary, days_window=60)
if tiff_path is None:
if not tile_paths or actual_date is None:
rgb_images.append(None)
days_offsets.append(None)
actual_dates.append(None)
print(f" ⚠ No TIFF found within 60 days of {target.strftime('%Y-%m-%d')} with sufficient data")
print(f" ⚠ No tiles found within 60 days of {target.strftime('%Y-%m-%d')} with sufficient data")
continue
# Extract date from filename: YYYY-MM-DD.tif
tiff_date = pd.to_datetime(tiff_path.stem)
rgb = load_and_clip_tiff_rgb(tiff_path, field_boundary)
rgb = load_and_composite_tiles_rgb(tile_paths, field_boundary)
rgb_images.append(rgb)
days_offsets.append(days_diff)
actual_dates.append(tiff_date)
actual_dates.append(actual_date)
if rgb is not None:
print(f" ✓ Loaded {tiff_path.name} ({days_diff:+d}d from target)")
print(f" ✓ Loaded {len(tile_paths)} tile(s) for {actual_date.strftime('%Y-%m-%d')} ({days_diff:+d}d from target)")
else:
print(f" ⚠ Loaded {tiff_path.name} but RGB data is None")
print(f" ⚠ Loaded {len(tile_paths)} tile(s) but RGB data is None")
# Create 5x3 grid plot (15 images)
fig, axes = plt.subplots(3, 5, figsize=(25, 15))
@ -360,11 +441,16 @@ def create_temporal_rgb_grid(harvest_date, field_data, field_id, tiff_dir, field
filename = f'field_{field_id}_{harvest_date_str}_{model_name}_harvest_rgb.png'
output_path = Path(output_dir) / filename
plt.savefig(output_path, dpi=100, bbox_inches='tight')
plt.close()
print(f" ✓ Saved: {output_path}")
return output_path
try:
plt.savefig(output_path, dpi=100, format='png')
plt.close()
print(f" ✓ Saved: {output_path}")
return output_path
except Exception as e:
plt.close()
print(f" ✗ Error saving PNG: {e}")
return None
def generate_rgb_grids(field_data, field_id, registered_harvest_dates, predicted_harvest_dates,

View file

@ -11,11 +11,28 @@ library(terra)
library(sf)
# ============================================================================
# CONFIGURATION
# CONFIGURATION & COMMAND-LINE ARGUMENTS
# ============================================================================
# Parse command-line arguments for date filtering
args <- commandArgs(trailingOnly = TRUE)
# Example: Rscript 10_create_master_grid_and_split_tiffs.R 2026-01-13 2026-01-17
start_date <- NULL
end_date <- NULL
if (length(args) >= 1) {
start_date <- as.Date(args[1])
cat("Filtering: start date =", as.character(start_date), "\n")
}
if (length(args) >= 2) {
end_date <- as.Date(args[2])
cat("Filtering: end date =", as.character(end_date), "\n")
}
PROJECT <- "angata"
TIFF_FOLDER <- file.path("laravel_app", "storage", "app", PROJECT, "merged_tif_8b")
TIFF_FOLDER <- file.path("..", "laravel_app", "storage", "app", PROJECT, "merged_tif_8b")
# GRID SIZE CONFIGURATION - Change this to use different grid sizes
# Options: 5x5 (25 tiles), 10x10 (100 tiles), etc.
@ -25,10 +42,10 @@ GRID_NCOLS <- 5
# Construct grid-specific subfolder path
GRID_SIZE_LABEL <- paste0(GRID_NCOLS, "x", GRID_NROWS)
OUTPUT_FOLDER <- file.path("laravel_app", "storage", "app", PROJECT, "daily_tiles_split", GRID_SIZE_LABEL)
OUTPUT_FOLDER <- file.path("..", "laravel_app", "storage", "app", PROJECT, "daily_tiles_split", GRID_SIZE_LABEL)
# Load field boundaries for overlap checking
GEOJSON_PATH <- file.path("laravel_app", "storage", "app", PROJECT, "Data", "pivot.geojson")
GEOJSON_PATH <- file.path("..", "laravel_app", "storage", "app", PROJECT, "Data", "pivot.geojson")
cat("Combined: Create Master Grid (", GRID_SIZE_LABEL, ") and Split TIFFs into Tiles\n", sep = "")
cat("Grid subfolder: daily_tiles_split/", GRID_SIZE_LABEL, "/\n", sep = "")
@ -40,31 +57,50 @@ cat("Grid subfolder: daily_tiles_split/", GRID_SIZE_LABEL, "/\n", sep = "")
cat("\n[PART 1] Creating Master Grid\n")
# Load field boundaries for overlap checking
cat("\n[1] Loading field boundaries from GeoJSON...\n")
cat("\n[1] Checking for existing master grid...\n")
if (!file.exists(GEOJSON_PATH)) {
stop("GeoJSON file not found at: ", GEOJSON_PATH, "\n",
"Please ensure ", PROJECT, " has a pivot.geojson file.")
}
# Check if master grid already exists
MASTER_GRID_PATH <- file.path(OUTPUT_FOLDER, paste0("master_grid_", GRID_SIZE_LABEL, ".geojson"))
field_boundaries_sf <- st_read(GEOJSON_PATH, quiet = TRUE)
field_boundaries_vect <- terra::vect(GEOJSON_PATH)
cat(" ✓ Loaded ", nrow(field_boundaries_sf), " field(s)\n", sep = "")
# Try to find a name column (could be 'name', 'field', 'field_name', etc.)
field_names <- NA
if ("name" %in% names(field_boundaries_sf)) {
field_names <- field_boundaries_sf$name
} else if ("field" %in% names(field_boundaries_sf)) {
field_names <- field_boundaries_sf$field
} else if ("field_name" %in% names(field_boundaries_sf)) {
field_names <- field_boundaries_sf$field_name
if (file.exists(MASTER_GRID_PATH)) {
cat(" ✓ Found existing master grid at:\n ", MASTER_GRID_PATH, "\n", sep = "")
master_grid_sf <- st_read(MASTER_GRID_PATH, quiet = TRUE)
field_boundaries_sf <- NULL # No need to load pivot.geojson
field_boundaries_vect <- NULL
cat(" ✓ Loaded grid with ", nrow(master_grid_sf), " tiles\n", sep = "")
} else {
field_names <- 1:nrow(field_boundaries_sf) # Fall back to indices
# No existing grid - need to create one from pivot.geojson
cat(" No existing grid found. Creating new one from pivot.geojson...\n")
if (!file.exists(GEOJSON_PATH)) {
stop("GeoJSON file not found at: ", GEOJSON_PATH, "\n",
"Please ensure ", PROJECT, " has a pivot.geojson file, or run this script ",
"from the same directory as a previous successful run (grid already exists).")
}
field_boundaries_sf <- st_read(GEOJSON_PATH, quiet = TRUE)
field_boundaries_vect <- terra::vect(GEOJSON_PATH)
cat(" ✓ Loaded ", nrow(field_boundaries_sf), " field(s) from GeoJSON\n", sep = "")
}
cat(" Fields: ", paste(field_names, collapse = ", "), "\n", sep = "")
# Try to find a name column (only if field_boundaries_sf exists)
if (!is.null(field_boundaries_sf)) {
field_names <- NA
if ("name" %in% names(field_boundaries_sf)) {
field_names <- field_boundaries_sf$name
} else if ("field" %in% names(field_boundaries_sf)) {
field_names <- field_boundaries_sf$field
} else if ("field_name" %in% names(field_boundaries_sf)) {
field_names <- field_boundaries_sf$field_name
} else {
field_names <- 1:nrow(field_boundaries_sf) # Fall back to indices
}
cat(" Fields: ", paste(field_names, collapse = ", "), "\n", sep = "")
}
# Helper function: Check if a tile overlaps with any field (simple bbox overlap)
tile_overlaps_fields <- function(tile_extent, field_geoms) {
@ -105,6 +141,27 @@ cat("\n[2] Checking TIFF extents...\n")
tiff_files <- list.files(TIFF_FOLDER, pattern = "\\.tif$", full.names = FALSE)
tiff_files <- sort(tiff_files)
# Filter by date range if specified
if (!is.null(start_date) || !is.null(end_date)) {
cat("\nApplying date filter...\n")
file_dates <- as.Date(sub("\\.tif$", "", tiff_files))
if (!is.null(start_date) && !is.null(end_date)) {
keep_idx <- file_dates >= start_date & file_dates <= end_date
cat(" Date range: ", as.character(start_date), " to ", as.character(end_date), "\n", sep = "")
} else if (!is.null(start_date)) {
keep_idx <- file_dates >= start_date
cat(" From: ", as.character(start_date), "\n", sep = "")
} else {
keep_idx <- file_dates <= end_date
cat(" Until: ", as.character(end_date), "\n", sep = "")
}
tiff_files <- tiff_files[keep_idx]
cat(" ✓ Filtered to ", length(tiff_files), " file(s)\n", sep = "")
}
if (length(tiff_files) == 0) {
stop("No TIFF files found in ", TIFF_FOLDER)
}
@ -242,26 +299,33 @@ if (file.exists(master_grid_file)) {
cat("\n[PART 2] Creating Filtered Grid (only overlapping tiles)\n")
cat("\n[7] Filtering master grid to only overlapping tiles...\n")
# Check which tiles overlap with any field
overlapping_tile_indices <- c()
for (tile_idx in 1:nrow(master_grid_sf)) {
tile_geom <- master_grid_sf[tile_idx, ]
# If grid was loaded from file, it's already filtered. Skip filtering.
if (!file.exists(MASTER_GRID_PATH)) {
cat("\n[7] Filtering master grid to only overlapping tiles...\n")
# Check overlap with any field
if (tile_overlaps_fields(st_bbox(tile_geom$geometry), field_boundaries_sf$geometry)) {
overlapping_tile_indices <- c(overlapping_tile_indices, tile_idx)
# Check which tiles overlap with any field
overlapping_tile_indices <- c()
for (tile_idx in 1:nrow(master_grid_sf)) {
tile_geom <- master_grid_sf[tile_idx, ]
# Check overlap with any field
if (tile_overlaps_fields(st_bbox(tile_geom$geometry), field_boundaries_sf$geometry)) {
overlapping_tile_indices <- c(overlapping_tile_indices, tile_idx)
}
}
cat(" Found ", length(overlapping_tile_indices), " overlapping tiles out of ", N_TILES, "\n", sep = "")
cat(" Reduction: ", N_TILES - length(overlapping_tile_indices), " empty tiles will NOT be created\n", sep = "")
# Create filtered grid with only overlapping tiles
filtered_grid_sf <- master_grid_sf[overlapping_tile_indices, ]
filtered_grid_sf$tile_id <- sprintf("%02d", overlapping_tile_indices)
} else {
cat("\n[7] Using pre-filtered grid (already loaded from file)...\n")
# Grid was already loaded - it's already filtered
filtered_grid_sf <- master_grid_sf
}
cat(" Found ", length(overlapping_tile_indices), " overlapping tiles out of ", N_TILES, "\n", sep = "")
cat(" Reduction: ", N_TILES - length(overlapping_tile_indices), " empty tiles will NOT be created\n", sep = "")
# Create filtered grid with only overlapping tiles
filtered_grid_sf <- master_grid_sf[overlapping_tile_indices, ]
filtered_grid_sf$tile_id <- sprintf("%02d", overlapping_tile_indices)
# Convert to SpatVector for makeTiles
filtered_grid_vect <- terra::vect(filtered_grid_sf)
@ -314,7 +378,7 @@ for (file_idx in seq_along(tiff_files)) {
dir.create(date_output_folder, recursive = TRUE, showWarnings = FALSE)
}
cat(" Creating ", length(overlapping_tile_indices), " tiles...\n", sep = "")
cat(" Creating ", nrow(filtered_grid_sf), " tiles...\n", sep = "")
# Use makeTiles with FILTERED grid (only overlapping tiles)
tiles_list <- terra::makeTiles(

View file

@ -54,14 +54,12 @@ main <- function() {
if (is.na(end_date)) {
message("Invalid end_date provided. Using current date.")
end_date <- Sys.Date()
end_date <- "2026-01-01" # Default date for testing
}
} else if (exists("end_date_str", envir = .GlobalEnv)) {
end_date <- as.Date(get("end_date_str", envir = .GlobalEnv))
} else {
# Default to current date if no argument is provided
end_date <- Sys.Date()
end_date <- "2026-01-01" # Default date for testing
message("No end_date provided. Using current date: ", format(end_date))
}