Merge pull request #4 from TimonWeitkamp/timon/sc-64-apply-the-new-structure-to-the-code

Timon/sc 64 apply the new structure to the code. Mainly a new numbering structure, plus some code cleanup.
This commit is contained in:
Timon Weitkamp 2026-01-27 09:12:51 +01:00 committed by GitHub
commit 24401d436a
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
31 changed files with 6039 additions and 1615 deletions

View file

@ -26,6 +26,9 @@ This is your GROUND TRUTH - compare all future predictions against this baseline
Usage:
python 01_harvest_baseline_prediction.py [project_name]
conda activate pytorch_gpu
cd python_app
python 22_harvest_baseline_prediction.py angata
Examples:
python 01_harvest_baseline_prediction.py angata
@ -43,6 +46,7 @@ from pathlib import Path
from harvest_date_pred_utils import (
load_model_and_config,
extract_features,
run_phase1_growing_window,
run_two_step_refinement,
build_production_harvest_table
)
@ -107,9 +111,9 @@ def main():
# [3/4] Run model predictions with two-step detection
print("\n[3/4] Running two-step harvest detection...")
print(" (Using threshold=0.45, consecutive_days=2 - tuned for Model 307 output)")
print(" (Using threshold=0.3, consecutive_days=2 - tuned baseline with DOY reset)")
refined_results = run_two_step_refinement(ci_data, model, config, scalers, device=device,
phase1_threshold=0.45, phase1_consecutive=2)
phase1_threshold=0.3, phase1_consecutive=2)
# Build and export
print("\nBuilding production harvest table...")

View file

@ -0,0 +1,225 @@
#!/usr/bin/env python3
"""
Script 23: Convert Harvest Format from Production to Standard
==============================================================
Converts harvest_production_export.xlsx (output from script 22) to the standard
harvest.xlsx format used by R scripts 24+.
INPUT:
- harvest_production_export.xlsx (from script 22)
Columns: field, season (numeric), season_start_date, season_end_date, phase2_harvest_date
Contains detected harvests only
OUTPUT:
- harvest.xlsx (standard format for R pipeline)
Columns: field, sub_field, year, season, season_start, season_end, age, sub_area, tonnage_ha
LOGIC:
1. For each field, group all detections chronologically
2. Create one row per completed season (has season_end date)
3. season_start = first CI date (2024-09-25) for first season, then previous harvest + 1 day
4. season_end = phase2_harvest_date (refined harvest date from script 22)
5. year = extracted from season_start date
6. season = "Data{year} : {field}" format
7. sub_field = field (same as field)
8. age, sub_area, tonnage_ha = left empty (filled by R scripts later or other data sources)
Date Format: YYYY-MM-DD
"""
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import os
import sys
from pathlib import Path
def get_ci_date_range(project_dir):
    """
    Determine the date range covered by the project's CI data.

    Used to establish season_start for the first season (the earliest CI
    observation) and the end of the monitoring period (the latest CI
    observation).

    Parameters
    ----------
    project_dir : str
        Project name (e.g. "angata"); resolved against the Laravel
        storage layout relative to this script.

    Returns
    -------
    (datetime, datetime)
        (min_date, max_date) of the CI observations. Falls back to
        (2024-09-25, now) when the CSV is missing or unreadable.
    """
    base_storage = Path("../laravel_app/storage/app") / project_dir / "Data"
    ci_data_dir = base_storage / "extracted_ci" / "ci_data_for_python"
    ci_csv_path = ci_data_dir / "ci_data_for_python.csv"

    # Shared fallback: 2024-09-25 is the typical CI coverage start for
    # these projects.
    fallback = (datetime(2024, 9, 25), datetime.now())

    if not ci_csv_path.exists():
        print(f"[WARNING] CI data CSV not found at {ci_csv_path}, assuming CI starts 2024-09-25")
        return fallback

    try:
        # Read the header only (nrows=0) to locate the date column, which
        # is assumed to be the first column; then load just that column.
        header = pd.read_csv(ci_csv_path, nrows=0)
        date_col = header.columns[0]
        dates = pd.to_datetime(pd.read_csv(ci_csv_path, usecols=[date_col])[date_col])
        min_date = dates.min()
        max_date = dates.max()
        print(f"[INFO] CI data date range: {min_date.date()} to {max_date.date()}")
        return min_date, max_date
    except Exception as e:
        print(f"[WARNING] Error reading CI date range: {e}, using fallback dates")
        return fallback
def _make_season_row(field_id, season_start, season_end):
    """Build one standard-format harvest row.

    ``season_end`` is "" for a season that is still in progress. The year
    and season label are derived from ``season_start``; age / sub_area /
    tonnage_ha are intentionally left empty (filled by R scripts later or
    other data sources).
    """
    year = pd.to_datetime(season_start).year
    return {
        "field": field_id,
        "sub_field": field_id,  # Same as field
        "year": year,
        "season": f"Data{year} : {field_id}",
        "season_start": season_start,
        "season_end": season_end,
        "age": "",
        "sub_area": "",
        "tonnage_ha": ""
    }


def convert_harvest_format(project_dir="angata"):
    """
    Convert harvest_production_export.xlsx to standard harvest.xlsx format.

    Groups the per-detection production export (script 22 output) per
    field in chronological order and emits one row per season: the first
    season starts at the first CI date, each subsequent season starts the
    day after the previous harvest, and a trailing incomplete season
    (empty season_end) is appended when more than 30 days of CI data
    exist after the last detected harvest.

    Parameters
    ----------
    project_dir : str
        Project name (angata, esa, chemba, etc.)

    Returns
    -------
    bool
        True on success, False when input is missing or conversion fails.
    """
    print(f"\n{'='*80}")
    print(f"Script 23: Convert Harvest Format")
    print(f"Project: {project_dir}")
    print(f"{'='*80}\n")

    # Get paths (same as script 22)
    base_storage = Path("../laravel_app/storage/app") / project_dir / "Data"
    harvest_data_dir = base_storage / "HarvestData"
    source_file = harvest_data_dir / "harvest_production_export.xlsx"
    output_file = base_storage / "harvest.xlsx"

    # Check source file exists
    if not source_file.exists():
        print(f"[ERROR] Source file not found: {source_file}")
        print(f"[ERROR] Please run script 22 first to generate harvest_production_export.xlsx")
        return False

    print(f"[INFO] Reading source file: {source_file}")
    try:
        df_source = pd.read_excel(source_file)
        print(f"[INFO] Loaded {len(df_source)} harvest detections")
        print(f"[INFO] Columns: {list(df_source.columns)}")

        # Validate required columns
        required_cols = ["field", "season_start_date", "season_end_date", "phase2_harvest_date"]
        missing = [c for c in required_cols if c not in df_source.columns]
        if missing:
            print(f"[ERROR] Missing columns: {missing}")
            return False

        # CI date range establishes the first season start and the end of
        # the monitoring period (trailing-incomplete-season check below).
        ci_min_date, ci_max_date = get_ci_date_range(project_dir)
        first_season_start = ci_min_date.strftime("%Y-%m-%d")

        # Normalize types, then sort so each field's detections are
        # processed chronologically.
        df_source["phase2_harvest_date"] = pd.to_datetime(df_source["phase2_harvest_date"])
        df_source["field"] = df_source["field"].astype(str)
        df_source = df_source.sort_values(["field", "phase2_harvest_date"]).reset_index(drop=True)

        output_rows = []
        for field_id, group_df in df_source.groupby("field"):
            # All harvest dates for this field, sorted chronologically
            harvest_dates = group_df["phase2_harvest_date"].dt.strftime("%Y-%m-%d").tolist()
            print(f"[INFO] Field {field_id}: {len(harvest_dates)} harvest detection(s)")

            # First season always starts from CI beginning
            current_season_start = first_season_start
            for harvest_date in harvest_dates:
                output_rows.append(_make_season_row(field_id, current_season_start, harvest_date))
                # Next season starts day after this harvest
                current_season_start = (pd.to_datetime(harvest_date) + timedelta(days=1)).strftime("%Y-%m-%d")

            # Append a final incomplete season only when there is a
            # meaningful amount (> 30 days) of data after the last harvest.
            last_harvest = pd.to_datetime(harvest_dates[-1])
            days_after_last = (ci_max_date - last_harvest).days
            if days_after_last > 30:
                output_rows.append(_make_season_row(field_id, current_season_start, ""))
                print(f"[INFO] Added incomplete season starting {current_season_start}")

        # Create output DataFrame in the standard column order.
        df_output = pd.DataFrame(output_rows)
        column_order = ["field", "sub_field", "year", "season", "season_start", "season_end",
                        "age", "sub_area", "tonnage_ha"]
        df_output = df_output[column_order]

        # Write to Excel
        print(f"\n[INFO] Writing output file: {output_file}")
        df_output.to_excel(output_file, index=False, sheet_name="Harvest")

        # Print summary
        print(f"\n[SUCCESS] Conversion complete!")
        print(f"[INFO] Output rows: {len(df_output)}")
        print(f"[INFO] Unique fields: {df_output['field'].nunique()}")
        print(f"\n[INFO] Sample output:")
        print(df_output.head(10).to_string(index=False))
        return True
    except Exception as e:
        print(f"[ERROR] Conversion failed: {e}")
        import traceback
        traceback.print_exc()
        return False
if __name__ == "__main__":
    # CLI entry point: optional project name argument, defaulting to "angata".
    project_name = sys.argv[1] if len(sys.argv) > 1 else "angata"
    # Exit code 0 on successful conversion, 1 otherwise.
    sys.exit(0 if convert_harvest_format(project_name) else 1)

View file

@ -1,4 +1,4 @@
"""
r"""
Script: 02_harvest_imminent_weekly.py
Purpose: WEEKLY MONITORING - Run WEEKLY/DAILY to get real-time harvest status for all fields
@ -7,18 +7,19 @@ is approaching harvest. Use this for operational decision-making and real-time a
RUN FREQUENCY: Weekly (or daily if required)
INPUT:
- ci_data_for_python.csv (recent CI data from 02b_convert_rds_to_csv.R)
- harvest.xlsx (baseline from scripts 22+23 - contains last harvest date per field)
Location: laravel_app/storage/app/{project}/Data/harvest.xlsx
- ci_data_for_python.csv (complete CI data from R script)
Location: laravel_app/storage/app/{project}/Data/extracted_ci/ci_data_for_python/ci_data_for_python.csv
- harvest_production_export.xlsx (baseline from script 01 - optional, for reference)
OUTPUT:
- harvest_imminent_weekly.csv (weekly probabilities: field, imminent_prob, detected_prob, week, year)
- reports/kpis/field_stats/{project}_harvest_imminent_week_{WW}_{YYYY}.csv (weekly probabilities: field, imminent_prob, detected_prob, week, year)
Workflow:
1. Load harvest_production_export.xlsx (baseline dates - optional, for context)
2. Load ci_data_for_python.csv (recent CI data)
3. For each field, extract last 300 days of history
4. Run Model 307 inference on full sequence (last timestep probabilities)
5. Export harvest_imminent_weekly.csv with probabilities
1. Load harvest.xlsx to find last harvest date (season_end) per field
2. Load ci_data_for_python.csv (complete CI data)
3. For each field, extract all CI data AFTER last harvest (complete current season)
4. Run Model 307 inference on full season sequence (last timestep probabilities)
5. Export week_WW_YYYY.csv with probabilities
Output Columns:
- field: Field ID
@ -37,12 +38,12 @@ Use Cases:
- Feed into 09b script for weekly dashboard reports
Usage:
python 02_harvest_imminent_weekly.py [project_name]
python python_app/31_harvest_imminent_weekly.py angata
Examples:
python 02_harvest_imminent_weekly.py angata
python 02_harvest_imminent_weekly.py esa
python 02_harvest_imminent_weekly.py chemba
python python_app/31_harvest_imminent_weekly.py angata
python python_app/31_harvest_imminent_weekly.py esa
python python_app/31_harvest_imminent_weekly.py chemba
If no project specified, defaults to 'angata'
"""
@ -61,33 +62,34 @@ from harvest_date_pred_utils import (
def load_harvest_dates(harvest_file):
"""Load latest harvest end dates from Excel file (from harvest_production_export.xlsx)."""
print("[1/5] Loading harvest dates...")
"""Load last harvest end dates from harvest.xlsx (output from scripts 22+23)."""
print("[1/5] Loading harvest data for season boundaries...")
if not Path(harvest_file).exists():
print(f" ERROR: {harvest_file} not found")
print(" Using 180-day lookback as default")
print(f" harvest.xlsx is required to determine current season boundaries")
return None
try:
harvest_df = pd.read_excel(harvest_file)
print(f" Loaded {len(harvest_df)} field-season records")
print(f" Loaded {len(harvest_df)} season records")
# Use season_end_date column (output from harvest prediction script)
harvest_df['season_end_date'] = pd.to_datetime(harvest_df['season_end_date'])
# season_end contains the last harvest date for each season
harvest_df['season_end'] = pd.to_datetime(harvest_df['season_end'])
harvest_df['field'] = harvest_df['field'].astype(str).str.strip()
# Group by field and get the latest season_end_date
# Group by field and get the LATEST season_end_date (most recent harvest)
# This marks the start of the current season
harvest_dates = {}
for field_id, group in harvest_df.groupby('field'):
latest_end = group['season_end_date'].max()
harvest_dates[str(field_id).strip()] = latest_end
latest_harvest = group['season_end'].max()
harvest_dates[field_id] = latest_harvest
print(f" Successfully mapped {len(harvest_dates)} fields")
print(f" Harvest end dates range: {min(harvest_dates.values()).date()} to {max(harvest_dates.values()).date()}")
print(f" Last harvest dates range: {min(harvest_dates.values()).date()} to {max(harvest_dates.values()).date()}")
return harvest_dates
except Exception as e:
print(f" ERROR loading harvest file: {e}")
print(f" Using 180-day lookback instead")
print(f" ERROR loading harvest.xlsx: {e}")
return None
@ -212,26 +214,37 @@ def main():
# Get project name from command line or use default
project_name = sys.argv[1] if len(sys.argv) > 1 else "angata"
# Construct paths
base_storage = Path("../laravel_app/storage/app") / project_name / "Data"
# Construct paths - work from either python_app/ or root smartcane/ directory
# Try root first (laravel_app/...), then fall back to ../ (running from python_app/)
if Path("laravel_app/storage/app").exists():
base_storage = Path("laravel_app/storage/app") / project_name / "Data"
else:
base_storage = Path("../laravel_app/storage/app") / project_name / "Data"
ci_data_dir = base_storage / "extracted_ci" / "ci_data_for_python"
CI_DATA_FILE = ci_data_dir / "ci_data_for_python.csv"
harvest_data_dir = base_storage / "HarvestData"
BASELINE_FILE = harvest_data_dir / "harvest_production_export.xlsx"
OUTPUT_CSV = harvest_data_dir / "harvest_imminent_weekly.csv"
harvest_data_dir.mkdir(parents=True, exist_ok=True) # Create if doesn't exist
HARVEST_FILE = base_storage / "harvest.xlsx" # Output from scripts 22+23
# Determine week and year from current date for timestamped export
today = datetime.now()
week_num = int(today.strftime('%V'))
year_num = int(today.strftime('%Y'))
# Output directory: reports/kpis/field_stats/
reports_dir = base_storage.parent / "reports" / "kpis" / "field_stats"
reports_dir.mkdir(parents=True, exist_ok=True)
OUTPUT_CSV = reports_dir / f"{project_name}_harvest_imminent_week_{week_num:02d}_{year_num}.csv"
print("="*80)
print(f"HARVEST IMMINENT PROBABILITY - WEEKLY MONITORING ({project_name})")
print("="*80)
# [1] Load harvest dates (optional - for projects with predictions)
harvest_dates = None
if BASELINE_FILE.exists():
harvest_dates = load_harvest_dates(BASELINE_FILE)
else:
print("[1/5] Loading harvest dates...")
print(f" INFO: {BASELINE_FILE} not found (optional for weekly monitoring)")
# [1] Load harvest dates (required to determine season boundaries)
harvest_dates = load_harvest_dates(HARVEST_FILE)
if harvest_dates is None or len(harvest_dates) == 0:
print(f"ERROR: Cannot run without harvest.xlsx - required to determine current season boundaries")
print(f" Please run scripts 22 (baseline prediction) and 23 (format conversion) first")
return
# [2] Load CI data
print(f"\n[2/5] Loading CI data...")
@ -251,7 +264,7 @@ def main():
# [3] Load model (from python_app directory)
print("\n[3/5] Loading Model 307...")
model_dir = Path(".") # Current directory is python_app/, contains model.pt, config.json, scalers.pkl
model_dir = Path("python_app") # Model files located in python_app/ directory
model, config, scalers = load_model_and_config(model_dir)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f" Device: {device}")
@ -271,6 +284,9 @@ def main():
count = 0
for field_id in ci_data['field'].unique():
# Convert field_id to string for consistency
field_id_str = str(field_id).strip()
# Get metadata
meta = field_meta[field_meta['field'] == field_id]
if len(meta) == 0:
@ -279,18 +295,21 @@ def main():
sub_field = meta['sub_field'].iloc[0]
latest_date = meta['latest_date'].iloc[0]
# Use recent CI history (last 300 days from latest available data)
# Get last harvest date for this field (start of current season)
last_harvest = harvest_dates.get(field_id_str)
if last_harvest is None:
continue
# Extract all CI data AFTER last harvest (complete current season)
field_data = ci_data[ci_data['field'] == field_id].copy()
field_data = field_data.sort_values('Date')
field_data = field_data[field_data['Date'] > last_harvest] # After last harvest
# Keep last 300 days of history for inference
if len(field_data) > 300:
field_data = field_data.iloc[-300:]
# Need at least 30 days of data since planting
if len(field_data) < 30:
continue
# Run inference on recent history to predict next 28 days
# Run inference on full current season to predict next 28 days
imminent_prob, detected_prob = run_inference_on_season(
field_data, model, config, scalers, device, ci_column
)
@ -338,10 +357,11 @@ def main():
print(f" WARNING: No results exported - check CI data availability")
print(f"\nStorage structure:")
print(f" Input CI: laravel_app/storage/app/{project_name}/Data/extracted_ci/ci_data_for_python/")
print(f" Input baseline: laravel_app/storage/app/{project_name}/Data/HarvestData/harvest_production_export.xlsx")
print(f" Output: laravel_app/storage/app/{project_name}/Data/HarvestData/")
print(f"\nReady to load into 09b field analysis report")
print(f" Input harvest: laravel_app/storage/app/{project_name}/Data/harvest.xlsx")
print(f" Input CI: laravel_app/storage/app/{project_name}/Data/extracted_ci/ci_data_for_python/")
print(f" Output: laravel_app/storage/app/{project_name}/reports/kpis/field_stats/")
print(f" Filename: {project_name}_harvest_imminent_week_{week_num:02d}_{year_num}.csv")
print(f"\nReady to load into 80_calculate_kpis.R")
if __name__ == "__main__":

View file

@ -0,0 +1,366 @@
#!/usr/bin/env python
"""
Batch RGB Validation for Top 100 Largest Fields - V3
Same as v1 but with dynamic image selection (checks for actual data, skips empty/black images).
Generates 5x3 RGB temporal grids for the latest complete harvest season of the 100 largest fields.
Uses actual season_end dates from harvest.xlsx for visual validation of field conditions at harvest.
Configuration:
- GeoJSON: pivot.geojson (defines field boundaries and sizes)
- Harvest data: harvest.xlsx (season_end dates for completed harvests)
- Output: RGB directory with field_name_YYYYMMDD_harvest_rgb.png
Usage:
python batch_rgb_validation_top_fields_v3.py --field 1
python batch_rgb_validation_top_fields_v3.py
Output:
- Saves 5x3 RGB grids to: laravel_app/storage/app/angata/RGB/
- Filenames: field_<NAME>_<YYYYMMDD>_harvest_rgb.png
- Each grid shows 15 images around the harvest date (dynamic date selection, skips empty images)
"""
import json
import numpy as np
import pandas as pd
from pathlib import Path
from datetime import datetime, timedelta
import sys
import argparse
# Add parent directory to path for imports
sys.path.insert(0, str(Path(__file__).parent))
from rgb_visualization import generate_rgb_grids
def load_geojson_and_calculate_areas(geojson_path):
    """
    Load GeoJSON and calculate an approximate area for each field.

    Areas come from the Shoelace formula on lat/lon coordinates roughly
    converted to meters (see calculate_polygon_area); features with a
    non-positive computed area are dropped.

    Parameters
    ----------
    geojson_path : str or Path
        Path to the pivot GeoJSON (field boundaries).

    Returns
    -------
    pd.DataFrame or None
        Columns [field, field_name, area_m2, area_hectares] sorted by
        area descending, or None when the file is missing or contains no
        usable polygons.
    """
    geojson_path = Path(geojson_path)
    if not geojson_path.exists():
        print(f"✗ GeoJSON not found: {geojson_path}")
        return None

    print(f"Loading GeoJSON: {geojson_path}")
    with open(geojson_path) as f:
        geojson_data = json.load(f)

    fields = []
    for feature in geojson_data.get('features', []):
        props = feature.get('properties', {})
        field_id = str(props.get('field', ''))
        field_name = props.get('name', f"field_{field_id}")
        geometry = feature.get('geometry', {})
        geom_type = geometry.get('type', '')
        coordinates = geometry.get('coordinates', [])

        # Simple area calculation using Shoelace formula
        area_m2 = 0
        if geom_type == 'Polygon' and coordinates:
            coords = coordinates[0]  # Exterior ring
            area_m2 = calculate_polygon_area(coords)
        elif geom_type == 'MultiPolygon' and coordinates:
            for poly_coords in coordinates:
                area_m2 += calculate_polygon_area(poly_coords[0])

        if area_m2 > 0:
            fields.append({
                'field': field_id,
                'field_name': field_name,
                'area_m2': area_m2,
                'area_hectares': area_m2 / 10000
            })

    # Guard: with no usable features, sorting an empty (column-less)
    # DataFrame on 'area_m2' would raise KeyError — fail gracefully instead.
    if not fields:
        print(f"✗ No usable polygon features found in {geojson_path}")
        return None

    df = pd.DataFrame(fields)
    df = df.sort_values('area_m2', ascending=False)
    print(f" ✓ Loaded {len(df)} fields")
    print(f" Top 10 largest fields (hectares):")
    for i, row in df.head(10).iterrows():
        print(f" {row['field_name']:30s} ({row['field']:>6s}): {row['area_hectares']:>8.2f} ha")
    return df
def calculate_polygon_area(coords):
    """
    Calculate the area of a polygon using the Shoelace formula (in m²).

    Coordinates are assumed to be (lon, lat) pairs in degrees and are
    roughly converted to meters: 111 km per degree of latitude, and
    111 km * cos(latitude) per degree of longitude at the ring's average
    latitude. This is a planar approximation suitable for ranking fields
    by size, not a geodesic area.

    Parameters
    ----------
    coords : sequence of (lon, lat)
        Ring coordinates in degrees.

    Returns
    -------
    float
        Approximate polygon area in square meters; 0 for fewer than
        3 vertices.
    """
    if len(coords) < 3:
        return 0

    # Meters per degree at the ring's average latitude.
    lats = [c[1] for c in coords]
    avg_lat = np.mean(lats)
    lat_m_per_deg = 111000
    lon_m_per_deg = 111000 * np.cos(np.radians(avg_lat))

    # Convert to local planar meters relative to the first vertex.
    origin_lon = coords[0][0]
    origin_lat = coords[0][1]
    coords_m = [((lon - origin_lon) * lon_m_per_deg,
                 (lat - origin_lat) * lat_m_per_deg)
                for lon, lat in coords]

    # Shoelace formula
    area = 0
    for i in range(len(coords_m)):
        j = (i + 1) % len(coords_m)
        area += coords_m[i][0] * coords_m[j][1]
        area -= coords_m[j][0] * coords_m[i][1]
    return abs(area) / 2
def load_harvest_dates_from_xlsx(harvest_xlsx_path, top_50_fields_df):
    """
    Read harvest.xlsx and keep the most recent completed season per field.

    Only fields present in ``top_50_fields_df`` are considered, and only
    rows whose ``season_end`` is filled in (completed seasons).

    Args:
        harvest_xlsx_path (Path): Path to harvest.xlsx
        top_50_fields_df (pd.DataFrame): DataFrame with 'field' column for filtering

    Returns:
        dict: {field_id: {'field_name': str, 'harvest_date': pd.Timestamp}}
    """
    harvest_xlsx_path = Path(harvest_xlsx_path)
    if not harvest_xlsx_path.exists():
        print(f"✗ Harvest Excel file not found: {harvest_xlsx_path}")
        return {}

    print(f"Loading harvest data: {harvest_xlsx_path}")
    try:
        df = pd.read_excel(harvest_xlsx_path)

        # Normalize season_end to datetimes (unparseable values -> NaT).
        if 'season_end' in df.columns:
            df['season_end'] = pd.to_datetime(df['season_end'], errors='coerce')

        # Restrict to the requested fields and to completed seasons only.
        wanted_ids = set(top_50_fields_df['field'].astype(str).str.strip())
        df['field'] = df['field'].astype(str).str.strip()
        df = df[df['field'].isin(wanted_ids)]
        df = df[df['season_end'].notna()]

        latest_harvests = {}
        for fid in wanted_ids:
            records = df[df['field'] == fid]
            if len(records) == 0:
                continue
            # Row carrying the most recent season_end for this field.
            newest = records.loc[records['season_end'].idxmax()]
            # Prefer the display name from the caller's field table.
            name_match = top_50_fields_df[top_50_fields_df['field'] == fid]
            if len(name_match) > 0:
                display_name = name_match.iloc[0]['field_name']
            else:
                display_name = f"field_{fid}"
            latest_harvests[fid] = {
                'field_name': display_name,
                'harvest_date': newest['season_end']
            }

        print(f" ✓ Loaded latest complete seasons for {len(latest_harvests)} fields")
        return latest_harvests
    except Exception as e:
        print(f"✗ Error loading harvest data: {e}")
        return {}
def main():
    """CLI entry point: visual validation of harvest dates via RGB grids.

    Two modes:
      --field <id> : generate one grid for that field's latest completed
                     season (season_end from harvest.xlsx).
      (no --field) : batch mode over the largest fields by computed area.

    NOTE(review): the batch banner says "TOP 50" while the batch mode
    selects the top 100 fields — presumably the banner is stale; confirm
    the intended count.
    """
    parser = argparse.ArgumentParser(description='RGB validation of harvest dates using satellite imagery (v3 - dynamic)')
    parser.add_argument('--field', type=str, default=None, help='Specific field ID to validate (e.g., "1" or "10022")')
    parser.add_argument('--project', type=str, default='angata', help='Project name (default: angata)')
    args = parser.parse_args()

    print("="*90)
    if args.field:
        print(f"RGB VALIDATION V3 - SINGLE FIELD: {args.field}")
    else:
        print("RGB VALIDATION V3 - TOP 50 LARGEST FIELDS")
        print("Visual inspection of harvest dates from harvest.xlsx using RGB imagery (dynamic selection)")
    print("="*90)

    # Configuration
    project = args.project
    geojson_path = Path(f"laravel_app/storage/app/{project}/Data/pivot.geojson")
    harvest_xlsx = Path(f"laravel_app/storage/app/{project}/Data/harvest.xlsx")
    output_dir = Path(f"laravel_app/storage/app/{project}/RGB")
    tiff_dir = Path(f"laravel_app/storage/app/{project}/merged_final_tif/5x5")

    # Verify paths
    if not geojson_path.exists():
        print(f"✗ GeoJSON not found: {geojson_path}")
        return
    if not harvest_xlsx.exists():
        print(f"✗ Harvest Excel not found: {harvest_xlsx}")
        return
    if not tiff_dir.exists():
        print(f"✗ TIFF directory not found: {tiff_dir}")
        return
    output_dir.mkdir(parents=True, exist_ok=True)

    # Handle single field mode
    if args.field:
        print(f"\n[1/3] Loading harvest data for field {args.field}...")
        harvest_df = pd.read_excel(harvest_xlsx)
        harvest_df['season_end'] = pd.to_datetime(harvest_df['season_end'], errors='coerce')
        harvest_df['field'] = harvest_df['field'].astype(str).str.strip()
        field_records = harvest_df[harvest_df['field'] == args.field]
        # Keep only completed seasons (season_end present).
        field_records = field_records[field_records['season_end'].notna()]
        if len(field_records) == 0:
            print(f"✗ No harvest data found for field {args.field}")
            return
        # Get latest harvest for this field
        latest_idx = field_records['season_end'].idxmax()
        latest_row = field_records.loc[latest_idx]
        harvest_date = latest_row['season_end']
        print(f" ✓ Found harvest: {harvest_date.strftime('%Y-%m-%d')}")

        # Load field name from GeoJSON
        print(f"\n[2/3] Loading field name from GeoJSON...")
        with open(geojson_path) as f:
            geojson_data = json.load(f)
        # Fall back to a synthetic name if the field isn't in the GeoJSON.
        field_name = f"field_{args.field}"
        for feature in geojson_data.get('features', []):
            props = feature.get('properties', {})
            if str(props.get('field', '')) == args.field:
                field_name = props.get('name', field_name)
                break
        print(f" ✓ Field name: {field_name}")

        # Generate RGB grid
        print(f"\n[3/3] Generating RGB validation grid (v3 dynamic)...")
        results = generate_rgb_grids(
            field_data=None,
            field_id=args.field,
            registered_harvest_dates=[],
            predicted_harvest_dates=[
                {
                    'harvest_date': harvest_date,
                    'model_name': 'harvest_xlsx'
                }
            ],
            output_dir=str(output_dir),
            tiff_dir=str(tiff_dir),
            geojson_path=str(geojson_path)
        )
        print("\n" + "="*90)
        if results['predicted']:
            print(f"✓ RGB grid generated successfully!")
            print(f" Field: {field_name} (ID: {args.field})")
            print(f" Harvest date: {harvest_date.strftime('%Y-%m-%d')}")
            print(f" Output: {output_dir}")
        else:
            print(f"⚠ No RGB grid generated (no imagery available)")
        print("="*90)
        return

    # Batch mode for top 100 fields
    print(f"\n[1/4] Loading GeoJSON and identifying top 100 largest fields...")
    fields_df = load_geojson_and_calculate_areas(geojson_path)
    if fields_df is None:
        return
    top_100_fields = fields_df.head(100)
    print(f" ✓ Selected {len(top_100_fields)} largest fields for processing")

    # Step 2: Load harvest dates from Excel
    print("\n[2/4] Loading harvest dates from Excel (latest complete seasons)...")
    harvest_dates = load_harvest_dates_from_xlsx(harvest_xlsx, top_100_fields)
    if len(harvest_dates) == 0:
        print("✗ No harvest dates found in Excel file")
        return
    print(f" ✓ Found {len(harvest_dates)} fields with completed seasons")
    for field_id, info in list(harvest_dates.items())[:5]:
        print(f" - {info['field_name']:30s}: {info['harvest_date'].strftime('%Y-%m-%d')}")
    if len(harvest_dates) > 5:
        print(f" ... and {len(harvest_dates) - 5} more")

    # Step 3: Generate RGB grids for each field
    print("\n[3/4] Generating RGB validation grids (v3 dynamic)...")
    rgb_count = 0
    for idx, (field_id, harvest_info) in enumerate(harvest_dates.items(), 1):
        field_name = harvest_info['field_name']
        harvest_date = harvest_info['harvest_date']
        try:
            # Run RGB visualization (harvest dates only, no registered/predicted distinction)
            results = generate_rgb_grids(
                field_data=None,  # Not needed - just for function compatibility
                field_id=field_id,
                registered_harvest_dates=[],  # Empty - using harvest.xlsx instead
                predicted_harvest_dates=[
                    {
                        'harvest_date': harvest_date,
                        'model_name': 'harvest_xlsx'
                    }
                ],
                output_dir=str(output_dir),  # All PNGs in same folder
                tiff_dir=str(tiff_dir),
                geojson_path=str(geojson_path)
            )
            if results['predicted']:
                rgb_count += 1
                print(f" [{idx:2d}/{len(harvest_dates)}] {field_name}: ✓ {harvest_date.strftime('%Y-%m-%d')}")
            else:
                print(f" [{idx:2d}/{len(harvest_dates)}] {field_name}: ⚠ No RGB grid (no imagery available)")
        except Exception as e:
            # Best-effort batch: a failure on one field must not stop the rest.
            print(f" [{idx:2d}/{len(harvest_dates)}] {field_name}: ✗ Error - {e}")

    # Summary
    print("\n" + "="*90)
    print(f"SUMMARY:")
    print(f" Fields with harvest dates: {len(harvest_dates)}")
    print(f" RGB grids generated: {rgb_count}/{len(harvest_dates)}")
    print(f" Output directory: {output_dir}")
    print("="*90)
    print("\nVisual inspection checklist:")
    print(" ✓ Brown/bare soil at T~0d (harvest date) = Field properly harvested")
    print(" ⚠ Green vegetation at T~0d = Possible data error or replanting")
    print(" ✓ Green → Brown progression = Normal harvest sequence")
    print("="*90)
# Script entry point.
if __name__ == "__main__":
    main()

View file

@ -0,0 +1,193 @@
"""
Create an Excel evaluation template for RGB harvest date predictions.
Parses field names and dates directly from RGB image filenames.
"""
import os
import glob
import pandas as pd
from openpyxl import Workbook
from openpyxl.styles import Font, PatternFill, Alignment, Border, Side
from openpyxl.utils import get_column_letter
import re
from datetime import datetime
# Configuration
BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
RGB_DIR = os.path.join(BASE_DIR, "laravel_app", "storage", "app", "angata", "RGB")
OUTPUT_PATH = os.path.join(BASE_DIR, "laravel_app", "storage", "app", "angata", "RGB_Evaluation_Template.xlsx")
# Evaluators
EVALUATORS = ["Joey", "Daniel", "Nik", "Dimitra", "Timon"]
def parse_rgb_filenames():
    """
    Parse field names and harvest dates from RGB image filenames.

    Expected format: field_{field_id or name}_{YYYYMMDD}_harvest_xlsx_harvest_rgb.png

    Returns:
        list[dict]: one entry per matching image with keys
            field_id (str), harvest_date ("YYYY-MM-DD", or the raw
            8-digit token if unparseable), filename (str),
        sorted by field_id with digit runs compared as integers (natural
        sort), so "2" sorts before "1000" and ids like "91&92" work too.
    """
    fields_data = []

    # Find all RGB images
    rgb_files = glob.glob(os.path.join(RGB_DIR, "field_*.png"))

    for filepath in sorted(rgb_files):
        filename = os.path.basename(filepath)
        # Pattern: field_{field_id}_{YYYYMMDD}_harvest_xlsx_harvest_rgb.png
        match = re.match(r"field_(.+?)_(\d{8})_harvest_xlsx_harvest_rgb\.png", filename)
        if not match:
            continue
        field_id = match.group(1)  # e.g., "1000" or "91&92"
        date_str = match.group(2)  # e.g., "20250814"
        # Format date as YYYY-MM-DD; keep the raw token if it won't parse.
        try:
            harvest_date = datetime.strptime(date_str, "%Y%m%d").strftime("%Y-%m-%d")
        except ValueError:
            harvest_date = date_str
        fields_data.append({
            "field_id": field_id,
            "harvest_date": harvest_date,
            "filename": filename
        })

    # Natural sort: compare digit runs numerically, other runs as text.
    # (The previous key only stripped the literal "&92" and otherwise
    # sorted lexicographically, putting "1000" before "2".)
    def _natural_key(entry):
        parts = re.split(r"(\d+)", entry["field_id"])
        return [int(p) if p.isdigit() else p for p in parts]

    return sorted(fields_data, key=_natural_key)
def create_evaluation_template():
    """Create the Excel evaluation template.

    Builds a two-sheet workbook and writes it to OUTPUT_PATH:
      * "Evaluation": one row per RGB image (field id + predicted harvest
        date) plus one empty response column per name in EVALUATORS.
      * "Instructions": static reviewer guidance text.

    Prints progress to stdout and returns None; returns early (no file
    written) when no RGB images are found under RGB_DIR.
    """
    print("Loading field data from RGB images...")
    fields_data = parse_rgb_filenames()
    if not fields_data:
        print("ERROR: No RGB images found in", RGB_DIR)
        return
    print(f"Found {len(fields_data)} RGB images")
    # Create workbook
    wb = Workbook()
    # === SHEET 1: Evaluation Form ===
    ws_eval = wb.active
    ws_eval.title = "Evaluation"
    # Define styles shared by header and data cells
    header_fill = PatternFill(start_color="4472C4", end_color="4472C4", fill_type="solid")
    header_font = Font(bold=True, color="FFFFFF", size=11)
    border = Border(
        left=Side(style='thin'),
        right=Side(style='thin'),
        top=Side(style='thin'),
        bottom=Side(style='thin')
    )
    center_align = Alignment(horizontal="center", vertical="center", wrap_text=True)
    left_align = Alignment(horizontal="left", vertical="center", wrap_text=True)
    # Column headers: two fixed columns followed by one column per evaluator
    headers = ["Field ID", "Predicted Harvest Date"] + EVALUATORS
    for col_idx, header in enumerate(headers, start=1):
        cell = ws_eval.cell(row=1, column=col_idx, value=header)
        cell.fill = header_fill
        cell.font = header_font
        cell.alignment = center_align
        cell.border = border
    # Set column widths (A/B fixed columns wider than evaluator columns)
    ws_eval.column_dimensions['A'].width = 15
    ws_eval.column_dimensions['B'].width = 20
    for col_idx in range(3, 3 + len(EVALUATORS)):
        ws_eval.column_dimensions[get_column_letter(col_idx)].width = 12
    # Add data rows (row 1 is the header, so data starts at row 2)
    for row_idx, field in enumerate(fields_data, start=2):
        ws_eval.cell(row=row_idx, column=1, value=field["field_id"])
        ws_eval.cell(row=row_idx, column=2, value=field["harvest_date"])
        # Add empty cells for evaluator responses
        for col_idx in range(3, 3 + len(EVALUATORS)):
            cell = ws_eval.cell(row=row_idx, column=col_idx)
            cell.alignment = center_align
            cell.border = border
        # Light alternating row color for readability
        if row_idx % 2 == 0:
            for col_idx in range(1, 3 + len(EVALUATORS)):
                ws_eval.cell(row=row_idx, column=col_idx).fill = PatternFill(
                    start_color="D9E8F5", end_color="D9E8F5", fill_type="solid"
                )
        # Apply borders to all data cells; left-align the two fixed columns
        for col_idx in range(1, 3 + len(EVALUATORS)):
            ws_eval.cell(row=row_idx, column=col_idx).border = border
            if col_idx == 1 or col_idx == 2:
                ws_eval.cell(row=row_idx, column=col_idx).alignment = left_align
    # Freeze panes so header row and fixed columns stay visible when scrolling
    ws_eval.freeze_panes = "C2"
    # === SHEET 2: Instructions ===
    ws_instr = wb.create_sheet("Instructions")
    # Static guidance text: list of [column A text, column B text] rows
    instr_content = [
        ["RGB Evaluation Instructions", ""],
        ["", ""],
        ["Overview:", ""],
        ["The generated RGB images visualize the predicted harvest dates for each field.", ""],
        ["The images are 3x3 grids showing satellite imagery from different dates", ""],
        ["centered on the predicted harvest date (the center/red-box image).", ""],
        ["", ""],
        ["What to Evaluate:", ""],
        ["For each field, determine if the predicted harvest date is CORRECT:", ""],
        ["", ""],
        ["Instructions for Reviewing:", ""],
        ["1. Look at the CENTER image (red box) - this is the predicted harvest date", ""],
        ["2. Compare to surrounding dates (before and after)", ""],
        ["3. Look for change in field color/status:", ""],
        [" • BEFORE: Field appears GREEN (growing/healthy crop)", ""],
        [" • AT PREDICTED DATE: Field shows BROWN/YELLOW (soil visible, ripe for harvest)", ""],
        [" • AFTER: Field continues BROWN (post-harvest or dormant)", ""],
        ["", ""],
        ["How to Enter Your Assessment:", ""],
        ["Enter one of the following in your evaluator column for each field:", ""],
        [" • YES = Predicted date is CORRECT (brown/harvest-ready at center date)", ""],
        [" • NO = Predicted date is INCORRECT (not ready or already harvested)", ""],
        [" • ? or MAYBE = Uncertain (cloudy images, unclear field status)", ""],
        ["", ""],
        ["Workflow Options:", ""],
        ["Option A (Divide Work): Assign 2-3 fields per evaluator (rows divided by column)", ""],
        ["Option B (Full Review): Each evaluator reviews all fields (everyone fills all rows)", ""],
        ["Option C (Spot Check): Each evaluator samples 5-10 random fields", ""],
        ["", ""],
        ["Image Location:", ""],
        ["All RGB images are in: /laravel_app/storage/app/angata/RGB/", ""],
        ["Format: field_{FIELD_ID}_{YYYY-MM-DD}_harvest_xlsx_harvest_rgb.png", ""],
        ["", ""],
        ["Notes:", ""],
        ["• Cloud cover may obscure ground truth - use best judgment", ""],
        ["• Fields with multiple bands or irregular shapes: focus on dominant area", ""],
        ["• Use context from previous/next dates to validate your assessment", ""],
    ]
    # Add instructions text; bold the title row and the section headings
    for row_idx, row_data in enumerate(instr_content, start=1):
        for col_idx, value in enumerate(row_data, start=1):
            cell = ws_instr.cell(row=row_idx, column=col_idx, value=value)
            if row_idx == 1:
                cell.font = Font(bold=True, size=14)
            elif any(keyword in str(value) for keyword in ["Overview:", "Instructions", "Workflow", "Image Location", "Notes"]):
                cell.font = Font(bold=True, size=11)
            cell.alignment = Alignment(horizontal="left", vertical="top", wrap_text=True)
    ws_instr.column_dimensions['A'].width = 50
    ws_instr.column_dimensions['B'].width = 80
    # Save workbook
    wb.save(OUTPUT_PATH)
    print(f"✓ Evaluation template created: {OUTPUT_PATH}")
    print(f"✓ {len(fields_data)} fields added to evaluation form")
    print(f"✓ Evaluators: {', '.join(EVALUATORS)}")
# Script entry point: build the evaluation template when run directly.
if __name__ == "__main__":
    create_evaluation_template()

View file

@ -4,7 +4,7 @@ Purpose: Download Planet satellite data for missing dates only (skip existing fi
Can be called from batch scripts or other Python scripts.
Usage:
python download_planet_missing_dates.py --start 2022-01-01 --end 2025-12-15 --project angata
python download_planet_missing_dates.py --start 2026-01-17 --end 2026-12-20 --project angata
python download_planet_missing_dates.py --start 2023-06-01 --end 2023-06-30 --project angata --dry-run
Environment variables (alternative to CLI args):

View file

@ -184,18 +184,47 @@ def compute_ci_features(ci_series: pd.Series, doy_series: pd.Series = None) -> p
return features.fillna(0)
def extract_features(data_df: pd.DataFrame, feature_names: List[str], ci_column: str = 'FitData') -> np.ndarray:
"""Extract and return specified features as numpy array."""
def extract_features(data_df: pd.DataFrame, feature_names: List[str], ci_column: str = 'FitData',
season_anchor_day: int = None, lookback_start: int = 0) -> np.ndarray:
"""
Extract and return specified features as numpy array.
Args:
data_df: DataFrame with Date and CI data (may be a window after a harvest)
feature_names: List of feature names to extract
ci_column: Name of CI column
season_anchor_day: Day in FULL sequence where this season started (for DOY reset)
DOY will be recalculated as: 1, 2, 3, ... from this point
lookback_start: Starting index in original full data (for season reset calculation)
Returns:
NumPy array of shape (len(data_df), len(feature_names))
"""
# Compute all CI features
ci_series = data_df[ci_column].astype(float)
doy_series = pd.Series(range(len(data_df)), index=data_df.index) % 365 if 'DOY_normalized' in feature_names else None
# Compute DOY (age/days since season start) - NOT day-of-year!
# DOY is a continuous counter: 1, 2, 3, ..., 475 (doesn't cycle at 365)
# It only resets to 1 after a harvest is detected (new season)
doy_series = None
if 'DOY_normalized' in feature_names:
if season_anchor_day is not None and lookback_start >= season_anchor_day:
# Season was reset after harvest. Recalculate DOY as simple counter from 1
# This is a window starting at or after harvest, so DOY should be: 1, 2, 3, ...
doy_series = pd.Series(np.arange(1, len(data_df) + 1), index=data_df.index)
elif 'DOY' in data_df.columns:
# Use DOY directly from CSV - already calculated as continuous age counter
doy_series = pd.Series(data_df['DOY'].astype(float).values, index=data_df.index)
else:
# Fallback: create continuous age counter (1, 2, 3, ...)
doy_series = pd.Series(np.arange(1, len(data_df) + 1), index=data_df.index)
all_features = compute_ci_features(ci_series, doy_series)
# Select requested features
requested = [f for f in feature_names if f in all_features.columns]
if not requested:
raise ValueError(f"No valid features found. Requested: {feature_names}")
raise ValueError(f"No valid features found. Requested: {feature_names}, Available: {all_features.columns.tolist()}")
return all_features[requested].values
@ -274,59 +303,64 @@ def load_harvest_data(data_file: Path) -> pd.DataFrame:
def run_phase1_growing_window(field_data, model, config, scalers, ci_column, device,
threshold=0.45, consecutive_days=2):
"""
Phase 1: Growing window detection with threshold crossing.
Expand window day-by-day, check last timestep's detected_prob.
When N consecutive days have prob > threshold, harvest detected.
Phase 1: Growing window detection with DOY season reset.
Args:
threshold (float): Probability threshold (default 0.45, tuned for Model 307)
consecutive_days (int): Required consecutive days above threshold (default 2, reduced from 3 for robustness)
Returns list of (harvest_date, harvest_idx) tuples.
For each detected harvest, reset DOY counter for the next season.
This allows the model to detect multiple consecutive harvests in multi-year data.
"""
harvest_dates = []
season_anchor_day = 0
current_pos = 0
while current_pos < len(field_data):
consecutive_above_threshold = 0
min_window_size = 120
for window_end in range(current_pos + 1, len(field_data) + 1):
window_data = field_data.iloc[current_pos:window_end].copy().reset_index(drop=True)
if len(window_data) < min_window_size:
continue
try:
features = extract_features(window_data, config['features'], ci_column=ci_column)
reset_doy = current_pos > season_anchor_day
# Apply scalers
features = extract_features(
window_data,
config['features'],
ci_column=ci_column,
season_anchor_day=season_anchor_day if reset_doy else None,
lookback_start=current_pos
)
features_scaled = features.copy().astype(float)
for fi, scaler in enumerate(scalers):
try:
features[:, fi] = scaler.transform(features[:, fi].reshape(-1, 1)).flatten()
features_scaled[:, fi] = scaler.transform(features[:, fi].reshape(-1, 1)).flatten()
except Exception:
pass
# Run model
with torch.no_grad():
x_tensor = torch.tensor(features, dtype=torch.float32).unsqueeze(0).to(device)
x_tensor = torch.tensor(features_scaled, dtype=torch.float32).unsqueeze(0).to(device)
imminent_probs, detected_probs = model(x_tensor)
detected_probs = detected_probs.squeeze(0).cpu().numpy()
# Check LAST timestep
last_prob = detected_probs[-1]
last_prob = detected_probs[0, -1].item()
if last_prob > threshold:
consecutive_above_threshold += 1
else:
consecutive_above_threshold = 0
# Harvest detected: N consecutive days above threshold
if consecutive_above_threshold >= consecutive_days:
harvest_date = field_data.iloc[current_pos + window_end - consecutive_days]['Date']
harvest_dates.append((harvest_date, current_pos + window_end - consecutive_days))
harvest_idx = current_pos + window_end - consecutive_days
harvest_date = field_data.iloc[harvest_idx]['Date']
# Reset to next day after harvest
current_pos = current_pos + window_end - consecutive_days + 1
harvest_dates.append((harvest_date, harvest_idx))
season_anchor_day = harvest_idx + 1
current_pos = harvest_idx + 1
break
except Exception:
except Exception as e:
continue
else:
break
@ -359,8 +393,21 @@ def run_phase2_refinement(field_data, phase1_harvests, model, config, scalers, c
window_start_date = season_start_date - pd.Timedelta(days=40)
window_end_date = phase1_harvest_date + pd.Timedelta(days=40)
window_start_idx = max(0, (field_data['Date'] >= window_start_date).idxmax() if (field_data['Date'] >= window_start_date).any() else 0)
window_end_idx = min(len(field_data), (field_data['Date'] <= window_end_date).idxmax() + 1 if (field_data['Date'] <= window_end_date).any() else len(field_data))
# FIXED: Use proper index selection
mask_start = field_data['Date'] >= window_start_date
mask_end = field_data['Date'] <= window_end_date
if mask_start.any():
window_start_idx = mask_start.idxmax() # First True index
else:
window_start_idx = 0
if mask_end.any():
# Last True index: find where condition becomes False from the right
true_indices = np.where(mask_end)[0]
window_end_idx = true_indices[-1] + 1 # +1 for slicing (exclusive end)
else:
window_end_idx = len(field_data)
if window_end_idx <= window_start_idx:
refined_harvests.append((phase1_harvest_date, phase1_idx))
@ -414,7 +461,9 @@ def run_two_step_refinement(df: pd.DataFrame, model, config, scalers, device=Non
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
results = []
ci_column = config['data']['ci_column']
# CI column is 'FitData' (interpolated CI) - NOT 'value' (raw with NAs)
# 'FitData' is already gap-filled by R stage 03, ready for ML
ci_column = 'FitData'
# Group by field and count total fields for progress
field_groups = list(df.groupby('field'))
@ -469,6 +518,10 @@ def run_two_step_refinement(df: pd.DataFrame, model, config, scalers, device=Non
print() # New line after progress bar
print(f" ✓ Complete: Found {harvests_found} harvest events across {total_fields} fields")
if results:
print(f" Sample harvest dates: {results[0]['phase2_harvest_date']}")
if len(results) > 1:
print(f" {results[-1]['phase2_harvest_date']}")
return results

View file

@ -0,0 +1,299 @@
"""
Batch Field Visualization Tool - RGB Imagery Around Harvest Date
Purpose: Generate visual validation using RGB satellite imagery samples around
predicted harvest date to verify predictions (bare soil = harvested, green = not harvested)
Shows 12-15 RGB images in a grid, centered around the predicted harvest date
Usage:
python batch_plot_fields_rgb.py field1,field2,field3
python batch_plot_fields_rgb.py 10125,88,97
Or read from CSV:
python batch_plot_fields_rgb.py --file fields_to_check.csv
"""
import pandas as pd
import numpy as np
import torch
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from pathlib import Path
from datetime import datetime, timedelta
import sys
import rasterio
from rasterio.mask import mask
import geopandas as gpd
from harvest_date_pred_utils import load_model_and_config, extract_features
def get_field_centroid(field_id, geojson_path="pivot.geojson"):
    """Return the (x, y) centroid of a field's polygon from the GeoJSON.

    The field is matched on the 'field' attribute by string comparison.
    Returns None when the field is absent or the file cannot be read
    (a warning is printed in that case).
    """
    try:
        boundaries = gpd.read_file(geojson_path)
        matches = boundaries[boundaries['field'] == str(field_id)]
        if not matches.empty:
            center = matches.geometry.iloc[0].centroid
            return (center.x, center.y)
    except Exception as e:
        print(f" Warning: Could not get field centroid - {e}")
    return None
def load_rgb_image(tif_path, field_id=None, geojson_path="pivot.geojson"):
    """
    Load an RGB composite from an 8-band Planet GeoTIFF.

    Band layout (0-indexed): 0=coastal, 1=blue, 2=green, 3=green_i,
    4=yellow, 5=red, 6=rededge, 7=nir.
    RGB = 0-indexed bands 5, 2, 1 (red, green, blue); rasterio uses
    1-indexed band numbers, hence the reads of 6, 3, 2 below.

    NOTE(review): field_id and geojson_path are currently unused — the
    whole scene is returned without clipping to the field boundary.

    Returns:
        np.ndarray of shape (H, W, 3) with values scaled to [0, 1],
        or None if the file cannot be read.
    """
    try:
        with rasterio.open(tif_path) as src:
            # Read RGB bands (bands are 1-indexed in rasterio)
            red = src.read(6)  # Band 6 = red (0-indexed band 5)
            green = src.read(3)  # Band 3 = green (0-indexed band 2)
            blue = src.read(2)  # Band 2 = blue (0-indexed band 1)
            # Stack into RGB image
            rgb = np.stack([red, green, blue], axis=2)
            # Normalize to 0-1 range (8-band data is typically 0-10000).
            # Dividing by 5000 brightens the image; values above 5000 clip.
            rgb = np.clip(rgb / 5000.0, 0, 1)
            return rgb
    except Exception as e:
        print(f" Error loading RGB from {tif_path}: {e}")
        return None
def plot_field_rgb_validation(field_id, ci_data, model, config, scalers, device,
                              tif_folder="../../../laravel_app/storage/app/angata/merged_tif_8b",
                              output_dir="validation_plots_rgb"):
    """
    Create validation plot for a single field:
    - Top: Harvest probability over time with peak marked
    - Bottom: 12-15 RGB images in grid around predicted harvest date

    Args:
        field_id (str): Field identifier, matched against ci_data['field'].
        ci_data (pd.DataFrame): CI observations with 'field' and 'Date' columns.
        model: Torch model returning (imminent, detected) probability tensors.
        config (dict): Config providing ['data']['ci_column'] and ['features'].
        scalers: Feature scalers; applied only when passed as a dict with a
            'features' key (see NOTE below).
        device: Torch device used for inference.
        tif_folder (str): Directory of daily GeoTIFFs named YYYY-MM-DD.tif.
        output_dir (str): Directory where the validation PNG is written.

    Returns:
        bool: True when the plot was saved, False on any failure
        (missing data, extraction failure, no imagery, exception).
    """
    # Create output directory
    Path(output_dir).mkdir(parents=True, exist_ok=True)
    # Filter field data
    field_data = ci_data[ci_data['field'] == field_id].copy()
    if len(field_data) == 0:
        print(f" ✗ Field {field_id}: No CI data found")
        return False
    field_data = field_data.sort_values('Date')
    print(f" ✓ Field {field_id}: {len(field_data)} days of data")
    try:
        # Extract features and run inference
        ci_column = config['data']['ci_column']
        feature_names = config['features']
        feat_array = extract_features(field_data, feature_names, ci_column=ci_column)
        if feat_array is None:
            print(f" ✗ Field {field_id}: Feature extraction failed")
            return False
        # Apply scalers
        # NOTE(review): this expects scalers as {'features': scaler}; other
        # scripts in this commit pass a per-feature list — confirm which
        # format load_model_and_config returns for this script.
        if isinstance(scalers, dict) and 'features' in scalers:
            feat_array = scalers['features'].transform(feat_array)
        # Run inference (detected-head output is unused here)
        with torch.no_grad():
            x_tensor = torch.tensor(feat_array, dtype=torch.float32).unsqueeze(0).to(device)
            out_imm, out_det = model(x_tensor)
            imm_probs = out_imm.squeeze(0).cpu().numpy()
        # Find peak probability (predicted harvest date)
        peak_idx = np.argmax(imm_probs)
        peak_date = field_data['Date'].iloc[peak_idx]
        peak_prob = imm_probs[peak_idx]
        print(f" Peak probability: {peak_prob:.3f} on {peak_date.strftime('%Y-%m-%d')}")
        # Get date range: ±6 days around peak (12-13 images total)
        date_range = field_data['Date'].dt.date  # NOTE(review): unused, kept as-is
        peak_date_only = peak_date.date() if hasattr(peak_date, 'date') else peak_date
        days_before = 6
        days_after = 6
        start_date = peak_date_only - timedelta(days=days_before)
        end_date = peak_date_only + timedelta(days=days_after)
        # Find available TIF files in date range (filenames are YYYY-MM-DD.tif)
        tif_folder_path = Path(tif_folder)
        available_dates = []
        for tif_file in sorted(tif_folder_path.glob("*.tif")):
            date_str = tif_file.stem  # YYYY-MM-DD
            try:
                tif_date = datetime.strptime(date_str, "%Y-%m-%d").date()
                if start_date <= tif_date <= end_date:
                    available_dates.append((tif_date, tif_file))
            except ValueError:
                # Non-date filename: ignore
                pass
        if len(available_dates) == 0:
            print(f" Warning: No TIF files found in {start_date} to {end_date}")
            return False
        print(f" Found {len(available_dates)} RGB images in date range")
        # Load RGB images (failed loads are silently dropped)
        rgb_images = []
        rgb_dates = []
        for tif_date, tif_file in available_dates:
            rgb = load_rgb_image(str(tif_file), field_id)
            if rgb is not None:
                rgb_images.append(rgb)
                rgb_dates.append(tif_date)
        if len(rgb_images) == 0:
            print(f" ✗ No RGB images loaded")
            return False
        print(f" Loaded {len(rgb_images)} RGB images")
        # Create figure with probability plot + RGB grid
        n_images = len(rgb_images)
        n_cols = min(5, n_images)  # Max 5 columns
        n_rows = (n_images + n_cols - 1) // n_cols  # Calculate rows needed
        fig = plt.figure(figsize=(18, 12))
        # Probability plot (top row of the grid, spanning all columns)
        ax_prob = plt.subplot(n_rows + 1, n_cols, (1, n_cols))
        dates_arr = field_data['Date'].values
        ax_prob.plot(dates_arr, imm_probs, '-', color='orange', linewidth=2.5, label='Imminent Probability', alpha=0.8)
        ax_prob.axhline(y=0.5, color='red', linestyle='--', linewidth=1.5, alpha=0.5, label='Threshold (0.5)')
        ax_prob.axvline(x=peak_date, color='darkred', linestyle=':', linewidth=2, alpha=0.7, label='Peak')
        ax_prob.fill_between(dates_arr, 0.5, 1.0, alpha=0.08, color='red')
        ax_prob.set_ylim(-0.05, 1.05)
        ax_prob.set_ylabel('Probability', fontsize=11, fontweight='bold')
        ax_prob.set_xlabel('Date', fontsize=11, fontweight='bold')
        ax_prob.set_title(f'Field {field_id} - Model 307 Harvest Probability', fontsize=12, fontweight='bold')
        ax_prob.grid(True, alpha=0.3)
        ax_prob.legend(loc='upper right', fontsize=9)
        ax_prob.xaxis.set_major_locator(mdates.MonthLocator(interval=1))
        ax_prob.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m-%d'))
        plt.setp(ax_prob.xaxis.get_majorticklabels(), rotation=45, ha='right')
        # Annotate peak with probability value and date
        ax_prob.annotate(f'{peak_prob:.2f}\n{peak_date_only}',
                         xy=(peak_date, peak_prob),
                         xytext=(20, 20), textcoords='offset points',
                         bbox=dict(boxstyle='round,pad=0.5', fc='yellow', alpha=0.8),
                         arrowprops=dict(arrowstyle='->', lw=1.5, color='darkred'))
        # RGB images in grid (below probability plot)
        for i, (rgb, date) in enumerate(zip(rgb_images, rgb_dates)):
            ax = plt.subplot(n_rows + 1, n_cols, n_cols + i + 1)
            ax.imshow(rgb, extent=[0, 100, 0, 100])
            # Highlight peak date with bold dark-red title
            date_label = date.strftime('%m-%d')
            is_peak = date == peak_date_only
            color = 'darkred' if is_peak else 'black'
            weight = 'bold' if is_peak else 'normal'
            size = 11 if is_peak else 9
            ax.set_title(date_label, fontsize=size, fontweight=weight, color=color)
            ax.set_xticks([])
            ax.set_yticks([])
        plt.suptitle(f'Field {field_id} RGB Imagery: {len(rgb_images)} Days Around Peak Harvest Probability\nPeak: {peak_prob:.2f} on {peak_date_only} | Green = Growing | Brown/Bare = Harvested',
                     fontsize=13, fontweight='bold', y=0.995)
        plt.tight_layout()
        # Save
        output_file = Path(output_dir) / f"field_{field_id}_rgb_validation.png"
        plt.savefig(output_file, dpi=100, bbox_inches='tight')
        print(f" ✓ Saved: {output_file}")
        plt.close()
        return True
    except Exception as e:
        # Keep the batch running: report the failure and move on.
        print(f" ✗ Field {field_id}: Error - {e}")
        import traceback
        traceback.print_exc()
        return False
def main():
    """CLI entry point: parse the field list, load data and model, then
    render one RGB validation plot per field."""
    banner = "=" * 80
    print(banner)
    print("BATCH RGB VISUALIZATION TOOL")
    print("Visual check: RGB imagery around predicted harvest date")
    print(banner)
    # --- Argument parsing -------------------------------------------------
    argv = sys.argv
    if len(argv) < 2:
        print("\nUsage:")
        print(" python batch_plot_fields_rgb.py field1,field2,field3")
        print(" python batch_plot_fields_rgb.py --file fields.csv")
        print("\nExample:")
        print(" python batch_plot_fields_rgb.py 10125,88,97,440")
        return
    if argv[1] == "--file":
        if len(argv) < 3:
            print("ERROR: --file requires a CSV filename")
            return
        csv_file = argv[2]
        print(f"\n[1/4] Loading fields from CSV: {csv_file}")
        try:
            field_table = pd.read_csv(csv_file)
            fields_to_plot = field_table['field'].astype(str).str.strip().tolist()
            print(f" ✓ Loaded {len(fields_to_plot)} fields")
        except Exception as e:
            print(f" ✗ Error reading CSV: {e}")
            return
    else:
        # Comma-separated list of field ids on the command line
        fields_to_plot = [f.strip() for f in argv[1].split(',')]
        print(f"\n[1/4] Processing {len(fields_to_plot)} field(s): {', '.join(fields_to_plot)}")
    # --- Load CI data -----------------------------------------------------
    print("\n[2/4] Loading CI data...")
    try:
        ci_data = pd.read_csv("ci_data_for_python.csv")
        ci_data['Date'] = pd.to_datetime(ci_data['Date'])
        ci_data['field'] = ci_data['field'].astype(str).str.strip()
        print(f" ✓ Loaded {len(ci_data)} observations for {ci_data['field'].nunique()} fields")
    except Exception as e:
        print(f" ✗ Error loading CI data: {e}")
        return
    # --- Load model -------------------------------------------------------
    print("\n[3/4] Loading model...")
    try:
        model, config, scalers = load_model_and_config(Path("."))
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        model.eval()
        print(f" ✓ Model loaded on {device}")
    except Exception as e:
        print(f" ✗ Error loading model: {e}")
        return
    # --- Generate plots ---------------------------------------------------
    print("\n[4/4] Generating RGB validation plots...")
    success_count = sum(
        1 for field_id in fields_to_plot
        if plot_field_rgb_validation(field_id, ci_data, model, config, scalers, device)
    )
    # --- Summary ----------------------------------------------------------
    print("\n" + banner)
    print(f"SUMMARY: {success_count}/{len(fields_to_plot)} fields processed successfully")
    print(f"Output directory: validation_plots_rgb/")
    print(banner)
    print("\nInspect the PNG files to verify predictions:")
    print(" ✓ Green imagery BEFORE peak date (field growing)")
    print(" ✓ Brown/Bare imagery AT/AFTER peak date (harvested)")
    print(" ✓ Peak date marked with red title")
# Script entry point: run the batch visualization when invoked directly.
if __name__ == "__main__":
    main()

View file

@ -1,485 +0,0 @@
#!/usr/bin/env python
"""
RGB Visualization Tool for Harvest Date Validation
Creates 3x3 temporal grids showing satellite imagery around registered and predicted harvest dates.
Extracts RGB from 8-band Planet scope data and clips to field boundaries from GeoJSON.
Functions:
- load_field_boundaries(): Load field geometries from GeoJSON
- find_closest_tiff(): Find available TIFF file closest to target date
- load_and_clip_tiff_rgb(): Load TIFF, extract RGB, clip to field boundary
- create_temporal_grid(): Create 3x3 grid (4 pre-harvest, 1 near, 2-3 post-harvest)
- generate_rgb_grids(): Main orchestration function
Usage:
from rgb_visualization import generate_rgb_grids
generate_rgb_grids(field_data, field_id, registered_harvest_dates, predicted_harvest_dates, output_dir, tiff_dir, geojson_path)
"""
import json
import numpy as np
import pandas as pd
from pathlib import Path
from datetime import datetime, timedelta
import matplotlib.pyplot as plt
import matplotlib.patches as patches
from matplotlib.colors import Normalize
import warnings
warnings.filterwarnings('ignore')
try:
import rasterio
from rasterio.mask import mask
import shapely.geometry as shgeom
except ImportError:
print("Warning: rasterio not available. RGB visualization will be skipped.")
rasterio = None
def load_field_boundaries(geojson_path, field_id):
    """
    Look up a field's boundary polygon in a GeoJSON file.

    The feature whose 'field' or 'sub_field' property equals field_id
    (string comparison) is returned. For MultiPolygon geometries only
    the first polygon's exterior ring is used.

    Args:
        geojson_path (Path): Path to pivot.geojson
        field_id (str): Field identifier (e.g., "13973")

    Returns:
        tuple: (GeoJSON feature dict, shapely Polygon) on success, or
        (None, None) when the field is missing or any error occurs
        (a message is printed in either case).
    """
    try:
        with open(geojson_path) as fh:
            collection = json.load(fh)
        wanted = str(field_id)
        for feature in collection.get('features', []):
            props = feature.get('properties', {})
            # Accept a match on either the 'field' or 'sub_field' property.
            if str(props.get('field', '')) != wanted and str(props.get('sub_field', '')) != wanted:
                continue
            geometry = feature.get('geometry')
            if geometry:
                geom_type = geometry.get('type', '')
                coordinates = geometry.get('coordinates', [])
                if geom_type == 'MultiPolygon':
                    # coordinates[i] = [[exterior ring], [holes], ...];
                    # take the first polygon's exterior ring only.
                    if coordinates and len(coordinates) > 0:
                        return feature, shgeom.Polygon(coordinates[0][0])
                elif geom_type == 'Polygon':
                    # coordinates = [[exterior ring], [holes], ...]
                    if coordinates and len(coordinates) > 0:
                        return feature, shgeom.Polygon(coordinates[0])
            # Matched feature without usable geometry: keep scanning.
        print(f" ⚠ Field {field_id} not found in GeoJSON")
        return None, None
    except Exception as e:
        print(f" ✗ Error loading GeoJSON: {e}")
        return None, None
def find_closest_tiff(target_date, tiff_dir, days_window=60, field_boundary=None):
    """
    Find the available TIFF file whose date is closest to target_date.

    TIFF files are named YYYY-MM-DD.tif. Files smaller than ~12 MB are
    skipped without opening them: empty scenes are ~11.56 MB while valid
    ones are ~12.6-13.0 MB.

    Args:
        target_date (pd.Timestamp or str): Target date to find TIFF near.
        tiff_dir (Path or str): Directory containing TIFF files.
        days_window (int): Max days to search before/after target.
        field_boundary (shapely.Polygon): Unused, kept for compatibility.

    Returns:
        tuple: (Path to closest TIFF, signed day offset from target;
        negative = before, positive = after), or (None, None) when no
        valid candidate exists within the window.
    """
    target_date = pd.Timestamp(target_date)
    tiff_dir = Path(tiff_dir)
    available_tiffs = list(tiff_dir.glob('*.tif'))
    if not available_tiffs:
        return None, None
    # Empty files are ~11.56 MB, valid files are ~12.6-13.0 MB
    min_size_mb = 12.0
    candidates = []
    for tiff_path in available_tiffs:
        try:
            # Quick size check to skip obviously empty files
            file_size_mb = tiff_path.stat().st_size / (1024 * 1024)
            if file_size_mb < min_size_mb:
                continue
            tiff_date = pd.Timestamp(tiff_path.stem)  # filename without .tif
            days_diff = (tiff_date - target_date).days
            if abs(days_diff) <= days_window:
                candidates.append((tiff_path, days_diff))
        except (ValueError, OSError):
            # ValueError: filename is not a parseable date.
            # OSError: file vanished between glob and stat.
            # (Previously a bare `except:` which also swallowed
            # KeyboardInterrupt/SystemExit.)
            continue
    if not candidates:
        return None, None
    # Return the candidate with the smallest absolute day offset
    closest_path, closest_diff = min(candidates, key=lambda c: abs(c[1]))
    return closest_path, closest_diff
def load_and_clip_tiff_rgb(tiff_path, field_boundary, rgb_bands=(1, 2, 3)):
    """
    Load TIFF and extract RGB bands clipped to field boundary.

    For merged_final_tif files (cloud-masked and filtered):
    - Band 1: Red
    - Band 2: Green
    - Band 3: Blue
    - Band 4: NIR
    - Band 5: CI
    For older merged_tif_8b files (raw Planet Scope 8-band):
    - Bands 1=Coastal Blue, 2=Blue, 3=Green, 4=Red, 5=Red Edge, 6=NIR, 7=SWIR1, 8=SWIR2
    - RGB would be bands 4,3,2 = Red, Green, Blue

    This function auto-detects the format based on band count and descriptions.
    NOTE(review): the 8-band branch checks src.count == 9 — presumably the
    merged product carries an extra (alpha/mask) band; confirm against the
    actual files.

    Args:
        tiff_path (Path): Path to TIFF file
        field_boundary (shapely.Polygon): Field boundary for clipping
        rgb_bands (tuple): Band indices for RGB (1-indexed, defaults to
            1,2,3 for merged_final_tif); used only when format detection
            falls through.

    Returns:
        np.ndarray: RGB data (height, width, 3) with values 0-1,
        or None if rasterio is unavailable, the boundary is missing,
        or any error occurs.
    """
    if rasterio is None or field_boundary is None:
        return None
    try:
        with rasterio.open(tiff_path) as src:
            # Need at least 3 bands to compose an RGB image
            if src.count < 3:
                print(f" ⚠ TIFF has only {src.count} bands (need at least 3 for RGB)")
                return None
            # Auto-detect format based on band count and descriptions
            if src.count == 5 and hasattr(src, 'descriptions') and src.descriptions:
                # merged_final_tif format: Red, Green, Blue, NIR, CI
                bands_to_read = (1, 2, 3)
            elif src.count == 9:
                # merged_tif_8b format: use bands 4, 3, 2 for Red, Green, Blue
                bands_to_read = (4, 3, 2)
            else:
                # Default: try to use first 3 bands or specified bands
                bands_to_read = rgb_bands
                print(f" Unknown TIFF format ({src.count} bands), using bands {bands_to_read}")
            # Mask and read bands (crop=True reads only the clipped window, not full resolution)
            geom = shgeom.mapping(field_boundary)
            try:
                # Read RGB bands at once, then mask (more efficient than masking 3 times)
                masked_data, _ = mask(src, [geom], crop=True, indexes=list(bands_to_read))
                # masked_data shape is (3, height, width) - one layer per band
                rgb_data = []
                for i, band_idx in enumerate(bands_to_read):
                    band_data = masked_data[i]  # Extract the i-th band from masked stack
                    # Debug output that handles NaN values gracefully
                    valid_data = band_data[~np.isnan(band_data)]
                    if len(valid_data) > 0:
                        print(f" DEBUG: Band {band_idx} valid data range: {valid_data.min():.4f} - {valid_data.max():.4f} ({len(valid_data)} valid pixels)")
                    else:
                        print(f" DEBUG: Band {band_idx} - all NaN!")
                    rgb_data.append(band_data)
                # Stack RGB into (H, W, 3)
                rgb = np.stack(rgb_data, axis=-1)
                # Data is already normalized 0-1 from the merged_final_tif files;
                # just ensure it's float32 and clipped to [0, 1].
                rgb = rgb.astype(np.float32)
                rgb = np.clip(rgb, 0, 1)
                print(f" DEBUG: Final RGB range: {rgb.min():.4f} - {rgb.max():.4f}")
                return rgb
            except ValueError as e:
                # Raised by rasterio.mask when geometry doesn't overlap the raster
                print(f" ⚠ Error clipping to field boundary: {e}")
                return None
    except Exception as e:
        print(f" ✗ Error loading TIFF {tiff_path.name}: {e}")
        return None
def create_temporal_rgb_grid(harvest_date, field_data, field_id, tiff_dir, field_boundary,
                             title, output_dir, harvest_type='registered', model_name=None, harvest_index=None):
    """
    Create a 3x5 temporal grid around a harvest date (15 images, 7-day intervals).

    Layout (rows x columns):
        Row 1: T-56d, T-49d, T-42d, T-35d, T-28d   (pre-harvest)
        Row 2: T-21d, T-14d, T-7d,  T~0d,  T+7d    (near harvest; T~0d framed in red)
        Row 3: T+14d, T+21d, T+28d, T+35d, T+42d   (post-harvest progression)

    Args:
        harvest_date (pd.Timestamp): Target harvest date
        field_data (pd.DataFrame): Field data with Date column
        field_id (str): Field identifier
        tiff_dir (Path): Directory with TIFF files
        field_boundary (shapely.Polygon): Field boundary
        title (str): Plot title
        output_dir (Path): Output directory
        harvest_type (str): 'registered' or 'predicted'
        model_name (str): Model name for predicted harvests (e.g., 'Original', 'Long-Season')
        harvest_index (int): Index of harvest within same model (for multiple harvests)

    Returns:
        Path: Path to saved PNG (cells with no usable TIFF are rendered as 'No Data')
    """
    harvest_date = pd.Timestamp(harvest_date)
    # Grid cell labels and their (row, col) placement in the 3x5 figure.
    positions = [
        ('T-56d', 0, 0), ('T-49d', 0, 1), ('T-42d', 0, 2), ('T-35d', 0, 3), ('T-28d', 0, 4),
        ('T-21d', 1, 0), ('T-14d', 1, 1), ('T-7d', 1, 2), ('T~0d', 1, 3), ('T+7d', 1, 4),
        ('T+14d', 2, 0), ('T+21d', 2, 1), ('T+28d', 2, 2), ('T+35d', 2, 3), ('T+42d', 2, 4),
    ]
    # One target date per grid cell, at 7-day intervals (8 pre, 1 at, 6 post).
    # Kept as a single offset list so the dates can never drift out of sync with
    # `positions` again (a previous version generated 16 dates for 15 cells, so
    # the extra T+56d TIFF was searched and loaded but never displayed).
    day_offsets = [-56, -49, -42, -35, -28, -21, -14, -7, 0, 7, 14, 21, 28, 35, 42]
    target_dates = [harvest_date + timedelta(days=off) for off in day_offsets]
    # Find the closest available TIFF for each target date
    rgb_images = []
    days_offsets = []
    actual_dates = []  # Actual acquisition dates of the TIFFs found
    for target in target_dates:
        tiff_path, days_diff = find_closest_tiff(target, tiff_dir, days_window=60, field_boundary=field_boundary)
        if tiff_path is None:
            rgb_images.append(None)
            days_offsets.append(None)
            actual_dates.append(None)
            print(f" ⚠ No TIFF found within 60 days of {target.strftime('%Y-%m-%d')} with sufficient data")
            continue
        # Extract date from filename: YYYY-MM-DD.tif
        tiff_date = pd.to_datetime(tiff_path.stem)
        rgb = load_and_clip_tiff_rgb(tiff_path, field_boundary)
        rgb_images.append(rgb)
        days_offsets.append(days_diff)
        actual_dates.append(tiff_date)
        if rgb is not None:
            print(f" ✓ Loaded {tiff_path.name} ({days_diff:+d}d from target)")
        else:
            print(f" ⚠ Loaded {tiff_path.name} but RGB data is None")
    # Create 3x5 grid plot (15 images)
    fig, axes = plt.subplots(3, 5, figsize=(25, 15))
    fig.suptitle(f'{title}\nField {field_id} - {harvest_type.upper()} Harvest: {harvest_date.strftime("%Y-%m-%d")}',
                 fontsize=16, fontweight='bold')
    for idx, (label, row, col) in enumerate(positions):
        ax = axes[row, col]
        if idx < len(rgb_images) and rgb_images[idx] is not None:
            rgb_data = rgb_images[idx]
            # Debug: check data range
            data_min, data_max = np.nanmin(rgb_data), np.nanmax(rgb_data)
            print(f" DEBUG: {label} RGB range: {data_min:.4f} - {data_max:.4f}, shape: {rgb_data.shape}")
            # Display with explicit vmin/vmax to handle normalized 0-1 data
            ax.imshow(rgb_data, vmin=0, vmax=1)
            # Build title: label + offset + actual date
            offset_str = f"{days_offsets[idx]:+d}d" if days_offsets[idx] is not None else "?"
            date_str = actual_dates[idx].strftime('%Y-%m-%d') if actual_dates[idx] is not None else "No Date"
            ax.set_title(f'{label}\n{offset_str}\n{date_str}', fontsize=10, fontweight='bold')
        else:
            ax.text(0.5, 0.5, 'No Data', ha='center', va='center', fontsize=12, color='gray')
            ax.set_title(label, fontsize=10)
        # Red frame marks the harvest-date cell (T~0d, row=1, col=3),
        # whether or not imagery was found for it.
        if label == 'T~0d':
            for spine in ax.spines.values():
                spine.set_edgecolor('red')
                spine.set_linewidth(4)
        ax.set_xticks([])
        ax.set_yticks([])
    plt.tight_layout()
    # Save figure with detailed naming: field_ID_harvestdate_model_harvesttype.png
    harvest_date_str = harvest_date.strftime('%Y%m%d')
    if harvest_type == 'registered':
        filename = f'field_{field_id}_{harvest_date_str}_registered_harvest_rgb.png'
    else:
        # For predicted: include model name and harvest index if multiple
        if harvest_index is not None and harvest_index > 0:
            filename = f'field_{field_id}_{harvest_date_str}_{model_name}_harvest{harvest_index}_rgb.png'
        else:
            filename = f'field_{field_id}_{harvest_date_str}_{model_name}_harvest_rgb.png'
    output_path = Path(output_dir) / filename
    plt.savefig(output_path, dpi=100, bbox_inches='tight')
    plt.close()
    # Previously printed a literal "(unknown)" (f-string with no placeholder)
    print(f" ✓ Saved: {output_path.name}")
    return output_path
def generate_rgb_grids(field_data, field_id, registered_harvest_dates, predicted_harvest_dates,
                       output_dir, tiff_dir, geojson_path):
    """
    Main orchestration function for RGB visualization.

    Creates 3x5 temporal grids (15 images each, via create_temporal_rgb_grid) for:
    1. Registered harvest dates (if available)
    2. Predicted harvest dates (if available)

    Args:
        field_data (pd.DataFrame): Field data with Date, CI columns
        field_id (str): Field identifier
        registered_harvest_dates (list): List of registered harvest dates (pd.Timestamp)
        predicted_harvest_dates (list): List of predicted harvest dates; each item is
            either a dict with 'harvest_date' and optional 'model_name' keys, or a
            bare pd.Timestamp
        output_dir (Path): Output directory for plots
        tiff_dir (Path): Directory containing TIFF files
        geojson_path (Path): Path to pivot.geojson

    Returns:
        dict: Summary of generated plots with keys 'registered' and 'predicted'
            (each a list of saved PNG paths; both empty when prerequisites are missing)
    """
    # Bail out gracefully when rasterio failed to import at module load time
    if rasterio is None:
        print(" ⚠ Rasterio not available - skipping RGB visualization")
        return {'registered': [], 'predicted': []}
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)
    tiff_dir = Path(tiff_dir)
    geojson_path = Path(geojson_path)
    # Validate required inputs before doing any work
    if not tiff_dir.exists():
        print(f" ✗ TIFF directory not found: {tiff_dir}")
        return {'registered': [], 'predicted': []}
    if not geojson_path.exists():
        print(f" ✗ GeoJSON not found: {geojson_path}")
        return {'registered': [], 'predicted': []}
    # Load field boundary (used to clip every TIFF to the field extent)
    print(f" Loading field boundary for {field_id}...")
    feature, field_boundary = load_field_boundaries(geojson_path, field_id)
    if field_boundary is None:
        print(f" ✗ Could not load field boundary for {field_id}")
        return {'registered': [], 'predicted': []}
    results = {'registered': [], 'predicted': []}
    # Process registered harvest dates
    if registered_harvest_dates and len(registered_harvest_dates) > 0:
        print(f" Processing {len(registered_harvest_dates)} registered harvest dates...")
        for i, harvest_date in enumerate(registered_harvest_dates):
            if pd.isna(harvest_date):
                continue
            print(f" [{i+1}/{len(registered_harvest_dates)}] {harvest_date.strftime('%Y-%m-%d')}")
            output_path = create_temporal_rgb_grid(
                harvest_date, field_data, field_id, tiff_dir, field_boundary,
                title='Registered Harvest Validation',
                output_dir=output_dir,
                harvest_type='registered',
                model_name=None,
                harvest_index=i
            )
            if output_path:
                results['registered'].append(output_path)
    # Process predicted harvest dates - grouped by model so that filenames can
    # carry a per-model harvest index
    if predicted_harvest_dates and len(predicted_harvest_dates) > 0:
        print(f" Processing {len(predicted_harvest_dates)} predicted harvest dates...")
        # Group by model to track index per model
        harvest_by_model = {}
        for harvest_info in predicted_harvest_dates:
            # Handle both dict and Timestamp formats
            if isinstance(harvest_info, dict):
                harvest_date = harvest_info.get('harvest_date')
                model_name = harvest_info.get('model_name', 'predicted')
            else:
                harvest_date = harvest_info
                model_name = 'predicted'
            if model_name not in harvest_by_model:
                harvest_by_model[model_name] = []
            harvest_by_model[model_name].append(harvest_date)
        # Process each model's harvests; overall_index only drives the progress print
        overall_index = 1
        for model_name, harvest_dates in harvest_by_model.items():
            for model_harvest_idx, harvest_date in enumerate(harvest_dates):
                if pd.isna(harvest_date):
                    continue
                print(f" [{overall_index}/{len(predicted_harvest_dates)}] {harvest_date.strftime('%Y-%m-%d')} ({model_name})")
                output_path = create_temporal_rgb_grid(
                    harvest_date, field_data, field_id, tiff_dir, field_boundary,
                    title=f'Predicted Harvest Validation ({model_name})',
                    output_dir=output_dir,
                    harvest_type='predicted',
                    model_name=model_name,
                    harvest_index=model_harvest_idx
                )
                if output_path:
                    results['predicted'].append(output_path)
                overall_index += 1
    return results
if __name__ == '__main__':
    # Standalone invocation only prints usage hints; the real entry point is
    # generate_rgb_grids(), imported by the comparison pipeline.
    usage = (
        "RGB Visualization Tool",
        "This module is intended to be imported and called from compare_307_models_production.py",
        "\nExample:",
        "  from rgb_visualization import generate_rgb_grids",
        "  generate_rgb_grids(field_data, field_id, registered_dates, predicted_dates, output_dir, tiff_dir, geojson_path)",
    )
    for line in usage:
        print(line)

View file

@ -0,0 +1,175 @@
"""
BATCH TEST: Sample multiple fields to understand model output distribution
Purpose: Determine optimal threshold and consecutive_days parameters
This runs the model on 10 random fields and summarizes the results,
helping us decide what parameters to use in the production script.
"""
import pandas as pd
import numpy as np
import torch
import sys
from pathlib import Path
from harvest_date_pred_utils import (
load_model_and_config,
extract_features,
)
def test_field(ci_data, field_id, model, config, scalers, device):
    """Run the model on a single field and return summary statistics.

    Args:
        ci_data (pd.DataFrame): All-fields CI data with 'field' and 'Date' columns.
        field_id (str): Field identifier to test.
        model: Trained torch model; if it returns a tuple, the second element
            is taken as the detection probabilities.
        config (dict): Model config providing config['data']['ci_column'] and
            config['features'].
        scalers (list): Per-feature scalers aligned with the feature columns.
        device: Torch device for inference.

    Returns:
        dict | None: Per-field summary (max/mean/median probability, longest
        consecutive run above each candidate threshold, number of days), or
        None when the field has no data or inference fails.
    """
    field_data = ci_data[ci_data['field'] == field_id].sort_values('Date').reset_index(drop=True)
    if len(field_data) == 0:
        return None
    try:
        ci_column = config['data']['ci_column']
        features = extract_features(field_data, config['features'], ci_column=ci_column)
        # Normalize each feature column. Best-effort by design: a scaler that
        # does not fit its column leaves the raw values in place rather than
        # disqualifying the whole field. (Was a bare `except:`, which also
        # swallowed KeyboardInterrupt/SystemExit.)
        for fi, scaler in enumerate(scalers):
            try:
                features[:, fi] = scaler.transform(features[:, fi].reshape(-1, 1)).flatten()
            except Exception:
                pass
        # Run model
        features_tensor = torch.FloatTensor(features).unsqueeze(0).to(device)
        with torch.no_grad():
            output = model(features_tensor)
        if isinstance(output, tuple):
            detected_probs = output[1].cpu().numpy().flatten()
        else:
            detected_probs = output.cpu().numpy().flatten()
        # Summary statistics of the per-day detection probabilities
        max_prob = detected_probs.max()
        mean_prob = detected_probs.mean()
        median_prob = np.median(detected_probs)
        # Longest run of consecutive days above each candidate threshold
        consecutive_above = {}
        for thresh in [0.2, 0.3, 0.4, 0.5]:
            above = (detected_probs > thresh).astype(int)
            # Zero-pad both ends so runs touching the boundaries are delimited
            changes = np.diff(np.concatenate(([0], above, [0])))
            starts = np.where(changes == 1)[0]
            ends = np.where(changes == -1)[0]
            runs = ends - starts if len(starts) > 0 else []
            consecutive_above[thresh] = np.max(runs) if len(runs) > 0 else 0
        return {
            'field': field_id,
            'max_prob': max_prob,
            'mean_prob': mean_prob,
            'median_prob': median_prob,
            'consecutive_0.2': consecutive_above[0.2],
            'consecutive_0.3': consecutive_above[0.3],
            'consecutive_0.4': consecutive_above[0.4],
            'consecutive_0.5': consecutive_above[0.5],
            'num_days': len(field_data),
        }
    except Exception:
        # Any failure (feature extraction, inference) disqualifies the field
        return None
def main():
    """Sample random fields, run the model on each, and print threshold recommendations.

    CLI: python <script> [project_name] [num_samples]
    """
    project_name = sys.argv[1] if len(sys.argv) > 1 else "angata"
    num_samples = int(sys.argv[2]) if len(sys.argv) > 2 else 10
    # Locate the per-project CI export produced by the Laravel app
    base_storage = Path("../laravel_app/storage/app") / project_name / "Data"
    ci_data_dir = base_storage / "extracted_ci" / "ci_data_for_python"
    CI_DATA_FILE = ci_data_dir / "ci_data_for_python.csv"
    if not CI_DATA_FILE.exists():
        print(f"ERROR: {CI_DATA_FILE} not found")
        return
    print(f"Loading CI data from {CI_DATA_FILE}...")
    ci_data = pd.read_csv(CI_DATA_FILE, dtype={'field': str})
    ci_data['Date'] = pd.to_datetime(ci_data['Date'])
    # Fixed seed so repeated runs test the same random sample of fields
    all_fields = sorted(ci_data['field'].unique())
    np.random.seed(42)
    sample_fields = np.random.choice(all_fields, size=min(num_samples, len(all_fields)), replace=False)
    print(f"Testing {len(sample_fields)} random fields...")
    # Load model (load_model_and_config is already imported at module level;
    # the previous redundant local re-import was removed)
    print("Loading model...")
    model, config, scalers = load_model_and_config(Path("."))
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # Test each field
    results = []
    for idx, field_id in enumerate(sample_fields, 1):
        result = test_field(ci_data, field_id, model, config, scalers, device)
        if result:
            results.append(result)
            print(f" [{idx:2d}/{len(sample_fields)}] Field {field_id:>6s}: "
                  f"max={result['max_prob']:.3f} mean={result['mean_prob']:.4f} "
                  f"consec@0.3={result['consecutive_0.3']:2d} consec@0.2={result['consecutive_0.2']:2d}")
    # Guard: previously an empty result set crashed the summary section
    # (quantile math on an empty DataFrame)
    if not results:
        print("\nNo fields produced results - cannot compute summary statistics.")
        return
    # Summary statistics
    results_df = pd.DataFrame(results)
    print(f"\n{'='*80}")
    print(f"SUMMARY STATISTICS ({len(results)} fields tested):")
    print(f"{'='*80}")
    print(f"\nMax probability (per field):")
    print(f" Mean: {results_df['max_prob'].mean():.4f}")
    print(f" Median: {results_df['max_prob'].median():.4f}")
    print(f" Min: {results_df['max_prob'].min():.4f}")
    print(f" Max: {results_df['max_prob'].max():.4f}")
    print(f"\nMean probability (per field):")
    print(f" Mean: {results_df['mean_prob'].mean():.4f}")
    print(f" Median: {results_df['mean_prob'].median():.4f}")
    print(f"\nConsecutive days above threshold=0.3:")
    print(f" Mean: {results_df['consecutive_0.3'].mean():.1f}")
    print(f" Median: {results_df['consecutive_0.3'].median():.1f}")
    print(f" Min: {results_df['consecutive_0.3'].min():.0f}")
    print(f" Max: {results_df['consecutive_0.3'].max():.0f}")
    print(f"\nConsecutive days above threshold=0.2:")
    print(f" Mean: {results_df['consecutive_0.2'].mean():.1f}")
    print(f" Median: {results_df['consecutive_0.2'].median():.1f}")
    print(f" Min: {results_df['consecutive_0.2'].min():.0f}")
    print(f" Max: {results_df['consecutive_0.2'].max():.0f}")
    print(f"\n{'='*80}")
    print(f"RECOMMENDATION:")
    print(f"{'='*80}")
    # Recommend the loosest (threshold, consecutive_days) pair the observed
    # run lengths can support, preferring the stricter threshold first
    median_consec_0_3 = results_df['consecutive_0.3'].median()
    median_consec_0_2 = results_df['consecutive_0.2'].median()
    if median_consec_0_3 >= 3:
        print(f"✓ Threshold=0.3 with consecutive_days=3 should work")
        print(f" (median consecutive days: {median_consec_0_3:.0f})")
    elif median_consec_0_3 >= 2:
        print(f"✓ Threshold=0.3 with consecutive_days=2 recommended")
        print(f" (median consecutive days: {median_consec_0_3:.0f})")
    elif median_consec_0_2 >= 3:
        print(f"✓ Threshold=0.2 with consecutive_days=3 recommended")
        print(f" (median consecutive days: {median_consec_0_2:.0f})")
    else:
        print(f"✓ Threshold=0.2 with consecutive_days=2 recommended")
        print(f" (median consecutive days @ 0.2: {median_consec_0_2:.0f})")
    print(f"\nCurrent production settings: threshold=0.45, consecutive_days=2")
    print(f" → These are likely TOO STRICT (only 289 fields detected in batch run)")
    print(f"\nSuggested adjustment:")
    print(f" → Lower threshold to 0.2-0.3")
    print(f" → Reduce consecutive_days to 1-2")
    print(f" → Re-run batch to get ~1000+ fields detected")


if __name__ == "__main__":
    main()

View file

@ -0,0 +1,65 @@
"""
Test DOY reset logic for harvest detection.
Verify that DOY resets to 1, 2, 3, ... after harvest is detected.
"""
import sys
from pathlib import Path
sys.path.insert(0, str(Path.cwd()))
import pandas as pd
import numpy as np
from harvest_date_pred_utils import extract_features, load_model_and_config
import torch
# Load sample data (hardcoded to the 'angata' project export)
ci_data = pd.read_csv('../laravel_app/storage/app/angata/Data/extracted_ci/ci_data_for_python/ci_data_for_python.csv')
ci_data['Date'] = pd.to_datetime(ci_data['Date'])
# Get field 779 data (reference field used throughout these debug scripts)
field_779 = ci_data[ci_data['field'] == '779'].reset_index(drop=True)
print(f"Field 779: {len(field_779)} days of data")
print(f"Date range: {field_779['Date'].min().date()} to {field_779['Date'].max().date()}\n")
# Load model config (only config['features'] is used below; model/scalers are unused)
model, config, scalers = load_model_and_config(Path.cwd())
# Test 1: First season (season_anchor_day = 0)
print("=" * 80)
print("TEST 1: First season (season_anchor_day=0, lookback_start=0)")
print("=" * 80)
window = field_779.iloc[0:20].copy().reset_index(drop=True)
features = extract_features(window, config['features'], ci_column='FitData',
                            season_anchor_day=0, lookback_start=0)
# Extract DOY_normalized column (index 13 or find it)
feature_names = config['features']
doy_idx = feature_names.index('DOY_normalized') if 'DOY_normalized' in feature_names else -1
if doy_idx >= 0:
    # 450: assumed DOY normalization factor used inside extract_features
    # (DOY_normalized = day / 450) — TODO confirm against extract_features
    doy_values = (features[:, doy_idx] * 450).astype(int)  # Denormalize
    print(f"Window size: {len(window)} days")
    print(f"DOY values: {doy_values[:10]}")
    print(f"Expected: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]")
    assert list(doy_values[:10]) == [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], "DOY not incrementing correctly!"
    print("✓ DOY incrementing correctly for first season\n")
# Test 2: After harvest detected at day 100, next season starts
print("=" * 80)
print("TEST 2: After harvest at day 100, new season starts (season_anchor_day=101, lookback_start=101)")
print("=" * 80)
harvest_day = 100
window = field_779.iloc[harvest_day:harvest_day+20].copy().reset_index(drop=True)
# Anchor the season on the day AFTER the detected harvest, so DOY restarts at 1
features = extract_features(window, config['features'], ci_column='FitData',
                            season_anchor_day=harvest_day+1, lookback_start=harvest_day+1)
if doy_idx >= 0:
    doy_values = (features[:, doy_idx] * 450).astype(int)  # Denormalize
    print(f"Window size: {len(window)} days (starting at day {harvest_day})")
    print(f"DOY values: {doy_values[:10]}")
    print(f"Expected: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10] (fresh season)")
    assert list(doy_values[:10]) == [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], "DOY not reset after harvest!"
    print("✓ DOY reset correctly for new season\n")
print("=" * 80)
print("ALL TESTS PASSED! DOY logic is correct.")
print("=" * 80)

View file

@ -0,0 +1,73 @@
#!/usr/bin/env python3
"""
Quick test: Verify feature extraction works
"""
import sys
import pandas as pd
import numpy as np
from pathlib import Path
sys.path.insert(0, str(Path(__file__).parent))
from harvest_date_pred_utils import extract_features, load_model_and_config
# Debug script: verifies extract_features runs on real project data and that
# its output shape matches the configured feature list, for both a fixed
# 50-day slice and several growing-window sizes.
project_name = "angata"
base_storage = Path("../laravel_app/storage/app") / project_name / "Data"
CI_DATA_FILE = base_storage / "extracted_ci" / "ci_data_for_python" / "ci_data_for_python.csv"
print("="*80)
print("DEBUG: Feature Extraction Test")
print("="*80)
# Load model config (only config['features'] is used; model/scalers are loaded but unused)
print("\n[1] Loading model config...")
model, config, scalers = load_model_and_config(Path("."))
print(f" Config features: {config['features']}")
print(f" Number of features: {len(config['features'])}")
# Load CI data
print("\n[2] Loading CI data...")
ci_data = pd.read_csv(CI_DATA_FILE, dtype={'field': str})
ci_data['Date'] = pd.to_datetime(ci_data['Date'])
print(f" Columns: {ci_data.columns.tolist()}")
print(f" Total rows: {len(ci_data)}")
# Test on a single field
test_field = "1"
field_data = ci_data[ci_data['field'] == test_field].sort_values('Date').reset_index(drop=True)
print(f"\n[3] Testing on field {test_field}...")
print(f" Data points: {len(field_data)}")
print(f" Date range: {field_data['Date'].min().date()} to {field_data['Date'].max().date()}")
print(f" Columns in field data: {field_data.columns.tolist()}")
print(f" Sample values:")
print(field_data[['Date', 'value']].head())
# Test feature extraction on first 50 days
print(f"\n[4] Extracting features for first 50 days...")
try:
    subset = field_data.iloc[:50].copy()
    features = extract_features(subset, config['features'], ci_column='value')
    print(f" ✓ Success!")
    print(f" Feature shape: {features.shape}")
    print(f" Expected shape: (50, {len(config['features'])})")
    print(f" Feature values sample (first 5 days):")
    for i in range(min(5, features.shape[0])):
        print(f" Day {i}: {features[i]}")
except Exception as e:
    # Debug tool: report the failure with traceback and keep going
    print(f" ✗ Error: {e}")
    import traceback
    traceback.print_exc()
# Growing windows mimic how production feeds data (one day at a time)
print("\n[5] Testing on growing windows...")
try:
    for window_size in [10, 20, 30, 50]:
        window_data = field_data.iloc[:window_size].copy()
        features = extract_features(window_data, config['features'], ci_column='value')
        print(f" Window size {window_size}: shape={features.shape}, min={features.min():.4f}, max={features.max():.4f}")
except Exception as e:
    print(f" ✗ Error: {e}")
    import traceback
    traceback.print_exc()
print("\n✓ Feature extraction test complete")

View file

@ -0,0 +1,141 @@
#!/usr/bin/env python3
"""
Test ONLY the growing window method (what production actually uses)
Never run model on full sequence - only on expanding windows
This matches real deployment where data arrives daily
"""
import sys
import pandas as pd
import numpy as np
import torch
from pathlib import Path
sys.path.insert(0, str(Path(__file__).parent.parent.parent))
from harvest_date_pred_utils import (
load_model_and_config,
extract_features,
)
# Simulates production's growing-window harvest detection on a single field:
# the model is re-run once per day on the expanding window, and a harvest is
# declared after `consecutive_days` last-timestep probabilities exceed
# `threshold`.
project_name = "angata"
# Find the repo root by walking up until we find laravel_app
script_dir = Path(__file__).parent
root = script_dir
while root != root.parent:  # Stop at filesystem root
    if (root / "laravel_app").exists():
        break
    root = root.parent
base_storage = root / "laravel_app" / "storage" / "app" / project_name / "Data"
CI_DATA_FILE = base_storage / "extracted_ci" / "ci_data_for_python" / "ci_data_for_python.csv"
MODEL_DIR = root / "python_app"
print("="*80)
print("GROWING WINDOW METHOD ONLY (Real Production Simulation)")
print("="*80)
# Load model
print("\n[1] Loading model...")
model, config, scalers = load_model_and_config(MODEL_DIR)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Load CI data
print("\n[2] Loading CI data...")
ci_data = pd.read_csv(CI_DATA_FILE, dtype={'field': str})
ci_data['Date'] = pd.to_datetime(ci_data['Date'])
# Test on field 779
test_field = "779"
field_data = ci_data[ci_data['field'] == test_field].sort_values('Date').reset_index(drop=True)
print(f"\n[3] Field {test_field}: {len(field_data)} data points")
print(f" Date range: {field_data['Date'].min().date()} to {field_data['Date'].max().date()}")
# Simulate growing window (real production)
print(f"\n[4] Simulating growing window (expanding daily)...")
harvest_dates = []
current_pos = 0
threshold = 0.3
consecutive_days = 2
model_runs = 0
while current_pos < len(field_data):
    consecutive_above = 0
    found_harvest = False
    for window_end in range(current_pos + 1, len(field_data) + 1):
        # window_end is an ABSOLUTE index into field_data (exclusive slice end)
        window_data = field_data.iloc[current_pos:window_end].copy().reset_index(drop=True)
        try:
            # Extract features for THIS window only
            features = extract_features(window_data, config['features'], ci_column='value')
            # Normalize
            features_scaled = features.copy().astype(float)
            for fi, scaler in enumerate(scalers):
                features_scaled[:, fi] = scaler.transform(features[:, fi].reshape(-1, 1)).flatten()
            # Run model on expanding window
            with torch.no_grad():
                x_tensor = torch.tensor(features_scaled, dtype=torch.float32).unsqueeze(0).to(device)
                imminent_probs, detected_probs = model(x_tensor)
            model_runs += 1
            # Check LAST timestep only
            last_prob = detected_probs[0, -1].item()
            last_date = window_data.iloc[-1]['Date'].date()
            # Print every 50th window to track progress
            if window_end % 50 == 0 or window_end < 10:
                print(f" Window [{current_pos:3d}:{window_end:3d}] ({last_date}): prob={last_prob:.4f}", end="")
                if last_prob > threshold:
                    print(" ✓ ABOVE", end="")
                print()
            # Check threshold
            if last_prob > threshold:
                consecutive_above += 1
            else:
                consecutive_above = 0
            # Harvest detected
            if consecutive_above >= consecutive_days:
                # window_end is already absolute, so the first day of the
                # consecutive run is window_end - consecutive_days. The old
                # formula added current_pos again, overshooting (and raising
                # swallowed IndexErrors) for every harvest after the first.
                harvest_idx = window_end - consecutive_days
                harvest_date = field_data.iloc[harvest_idx]['Date']
                harvest_dates.append((harvest_date, harvest_idx, last_prob))
                print(f"\n ✓ HARVEST DETECTED at {harvest_date.date()} (index {harvest_idx})")
                print(f" {consecutive_days} consecutive days above {threshold}")
                # Jump past this harvest
                current_pos = harvest_idx + 1
                found_harvest = True
                break
        except Exception as e:
            print(f" ERROR at window [{current_pos}:{window_end}]: {e}")
            continue
    if not found_harvest:
        break
print(f"\n[5] Results:")
print(f" Total model runs: {model_runs}")
print(f" Harvests found: {len(harvest_dates)}")
if harvest_dates:
    print(f"\n Harvest dates:")
    for date, idx, prob in harvest_dates:
        print(f" {date.date()}: index {idx}, last_prob={prob:.4f}")
else:
    print(f"\n No harvests detected")
print(f"\n[6] Analysis:")
print(f" Model runs per day: {model_runs / len(field_data):.2f}x")
print(f" Expected: ~{len(field_data):.0f} runs (one per day)")

View file

@ -0,0 +1,272 @@
"""
COMPARISON TEST: Growing Window vs Single Run approach
Purpose: Compare harvest dates detected by two different methods
Method 1 (Growing Window - Current):
- Day 1: Run model on [day1]
- Day 2: Run model on [day1:2]
- Day 3: Run model on [day1:3]
- ... up to day 477
- This matches real-time production where data arrives daily
- Takes ~477 model runs per field (SLOW)
Method 2 (Single Run - Proposed):
- Run model ONCE on full [day1:477] sequence
- Use these probabilities to scan for harvests
- This is 477x faster but assumes different LSTM context
- May produce different harvest dates
Question: Do these methods detect similar harvest dates or different ones?
"""
import pandas as pd
import numpy as np
import torch
import sys
from pathlib import Path
from harvest_date_pred_utils import (
load_model_and_config,
extract_features,
)
import time
def method_growing_window(field_data, model, config, scalers, ci_column, device, threshold=0.3, consecutive_days=2):
    """Original method: run the model on an expanding window, once per day.

    Mirrors live deployment where data arrives daily. Returns a list of
    (harvest_date, harvest_idx, last_prob) tuples using the SAME index
    convention as method_single_run — harvest_idx is the first day of the
    consecutive above-threshold run — so the two methods are directly
    comparable.
    """
    harvest_dates = []
    current_pos = 0
    while current_pos < len(field_data):
        consecutive_above_threshold = 0
        for window_end in range(current_pos + 1, len(field_data) + 1):
            window_data = field_data.iloc[current_pos:window_end].copy().reset_index(drop=True)
            try:
                features = extract_features(window_data, config['features'], ci_column=ci_column)
                # Apply scalers (best-effort: a failing scaler leaves the raw column)
                for fi, scaler in enumerate(scalers):
                    try:
                        features[:, fi] = scaler.transform(features[:, fi].reshape(-1, 1)).flatten()
                    except Exception:
                        pass
                # Run model
                with torch.no_grad():
                    x_tensor = torch.tensor(features, dtype=torch.float32).unsqueeze(0).to(device)
                    output = model(x_tensor)
                    if isinstance(output, tuple):
                        _, detected_probs = output
                        detected_probs = detected_probs.squeeze(0).cpu().numpy()
                    else:
                        detected_probs = output.squeeze(0).cpu().numpy()
                # Check LAST timestep
                last_prob = detected_probs[-1]
                if last_prob > threshold:
                    consecutive_above_threshold += 1
                else:
                    consecutive_above_threshold = 0
                # Harvest detected
                if consecutive_above_threshold >= consecutive_days:
                    # window_end is an ABSOLUTE index (exclusive slice end), so
                    # the run's first day is window_end - consecutive_days.
                    # The previous formula added current_pos again and
                    # subtracted an extra 1, so the reported dates could never
                    # match method_single_run even when probabilities agreed.
                    harvest_idx = window_end - consecutive_days
                    harvest_date = field_data.iloc[harvest_idx]['Date']
                    harvest_dates.append((harvest_date, harvest_idx, last_prob))
                    # Resume scanning the day after the detected harvest start
                    # (same anchor update as method_single_run)
                    current_pos = harvest_idx + 1
                    break
            except Exception:
                continue
        else:
            # Inner loop exhausted without detecting a harvest: stop scanning
            break
    return harvest_dates
def method_single_run(field_data, model, config, scalers, ci_column, device, threshold=0.3, consecutive_days=2):
    """Proposed method: run the model ONCE on the full sequence, then scan.

    ~NumDays times faster than the growing-window method. Returns a list of
    (harvest_date, harvest_idx, prob) tuples where harvest_idx is the first
    day of the consecutive above-threshold run.
    """
    harvest_dates = []
    current_pos = 0
    try:
        # Extract features ONCE for full dataset
        features = extract_features(field_data, config['features'], ci_column=ci_column)
        # Apply scalers (best-effort: a failing scaler leaves the raw column)
        for fi, scaler in enumerate(scalers):
            try:
                features[:, fi] = scaler.transform(features[:, fi].reshape(-1, 1)).flatten()
            except Exception:
                pass
        # Run model ONCE to get all probabilities
        with torch.no_grad():
            x_tensor = torch.tensor(features, dtype=torch.float32).unsqueeze(0).to(device)
            output = model(x_tensor)
            if isinstance(output, tuple):
                _, detected_probs = output
                detected_probs = detected_probs.squeeze(0).cpu().numpy()
            else:
                detected_probs = output.squeeze(0).cpu().numpy()
        # Scan forward looking for harvests
        while current_pos < len(field_data):
            consecutive_above_threshold = 0
            for pos in range(current_pos, len(field_data)):
                prob = detected_probs[pos]
                if prob > threshold:
                    consecutive_above_threshold += 1
                else:
                    consecutive_above_threshold = 0
                # Harvest detected
                if consecutive_above_threshold >= consecutive_days:
                    harvest_idx = pos - consecutive_days + 1
                    harvest_date = field_data.iloc[harvest_idx]['Date']
                    harvest_dates.append((harvest_date, harvest_idx, prob))
                    # Move anchor point past this harvest
                    current_pos = harvest_idx + 1
                    break
            else:
                # No harvest found in remaining data
                break
    except Exception as e:
        # Was a silent `pass`, which made broken fields indistinguishable from
        # fields with zero harvests; surface the error instead.
        print(f" ⚠ method_single_run failed: {e}")
    return harvest_dates
def compare_field(field_id, ci_data, model, config, scalers, device, threshold=0.3):
    """Run both detection methods on one field and time them.

    Returns a summary dict (harvests and timings for each method, plus the
    speedup factor), or None when the field has fewer than 10 data points.
    """
    rows = ci_data[ci_data['field'] == field_id].sort_values('Date').reset_index(drop=True)
    if len(rows) < 10:
        return None
    ci_column = config['data']['ci_column']

    # Method 1: Growing window (SLOW) — one model run per day of data
    print(f"\n Growing Window method...", end=" ", flush=True)
    t0 = time.time()
    growing = method_growing_window(rows, model, config, scalers, ci_column, device, threshold)
    elapsed_growing = time.time() - t0
    print(f"({elapsed_growing:.2f}s, {len(rows)} model runs)")

    # Method 2: Single run (FAST) — one model run total
    print(f" Single Run method...", end=" ", flush=True)
    t0 = time.time()
    single = method_single_run(rows, model, config, scalers, ci_column, device, threshold)
    elapsed_single = time.time() - t0
    print(f"({elapsed_single:.2f}s, 1 model run)")

    return {
        'field': field_id,
        'num_days': len(rows),
        'growing_harvests': growing,
        'single_harvests': single,
        'time_growing': elapsed_growing,
        'time_single': elapsed_single,
        'speedup': elapsed_growing / elapsed_single if elapsed_single > 0 else 0,
    }
def main():
    """Compare growing-window vs single-run harvest detection on sampled fields.

    CLI: python <script> [project_name] [num_samples] [threshold]
    Prints per-field harvest dates from both methods, timing/speedup, and a
    final verdict on whether the two methods are interchangeable.
    """
    project_name = sys.argv[1] if len(sys.argv) > 1 else "angata"
    num_samples = int(sys.argv[2]) if len(sys.argv) > 2 else 3
    threshold = float(sys.argv[3]) if len(sys.argv) > 3 else 0.3
    # Load data from the per-project Laravel export
    base_storage = Path("../laravel_app/storage/app") / project_name / "Data"
    ci_data_dir = base_storage / "extracted_ci" / "ci_data_for_python"
    CI_DATA_FILE = ci_data_dir / "ci_data_for_python.csv"
    if not CI_DATA_FILE.exists():
        print(f"ERROR: {CI_DATA_FILE} not found")
        return
    print(f"Loading CI data...")
    ci_data = pd.read_csv(CI_DATA_FILE, dtype={'field': str})
    ci_data['Date'] = pd.to_datetime(ci_data['Date'])
    # Get sample of fields (fixed seed for reproducible sampling)
    all_fields = sorted(ci_data['field'].unique())
    np.random.seed(42)
    sample_fields = np.random.choice(all_fields, size=min(num_samples, len(all_fields)), replace=False)
    # Load model
    print(f"Loading model...")
    model, config, scalers = load_model_and_config(Path("."))
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"\n{'='*80}")
    print(f"COMPARING METHODS (threshold={threshold}, consecutive_days=2)")
    print(f"{'='*80}")
    all_results = []
    for idx, field_id in enumerate(sample_fields, 1):
        print(f"\n[{idx}/{len(sample_fields)}] Field {field_id}")
        result = compare_field(field_id, ci_data, model, config, scalers, device, threshold)
        if result:
            all_results.append(result)
            print(f"\n Growing Window found {len(result['growing_harvests'])} harvests:")
            # NOTE: `idx` is re-bound by these inner loops; harmless because the
            # outer `enumerate` reassigns it on the next iteration
            for date, idx, prob in result['growing_harvests']:
                print(f" - {date.date()}: prob={prob:.4f}")
            print(f"\n Single Run found {len(result['single_harvests'])} harvests:")
            for date, idx, prob in result['single_harvests']:
                print(f" - {date.date()}: prob={prob:.4f}")
            print(f"\n ⏱ Speed comparison:")
            print(f" Growing Window: {result['time_growing']:.2f}s ({result['num_days']} days processed)")
            print(f" Single Run: {result['time_single']:.2f}s (1 run)")
            print(f" Speedup: {result['speedup']:.0f}x faster")
            # Compare harvests (dates only; probabilities/indices may differ)
            growing_dates = [d for d, _, _ in result['growing_harvests']]
            single_dates = [d for d, _, _ in result['single_harvests']]
            if growing_dates == single_dates:
                print(f"\n ✓ IDENTICAL: Both methods found the same harvest dates")
            else:
                print(f"\n ✗ DIFFERENT: Methods found different harvest dates")
                print(f" Growing only: {[d for d in growing_dates if d not in single_dates]}")
                print(f" Single only: {[d for d in single_dates if d not in growing_dates]}")
    # Summary
    print(f"\n{'='*80}")
    print(f"SUMMARY ({len(all_results)} fields tested)")
    print(f"{'='*80}")
    identical = sum(1 for r in all_results if [d for d, _, _ in r['growing_harvests']] == [d for d, _, _ in r['single_harvests']])
    different = len(all_results) - identical
    print(f"\nIdentical results: {identical}/{len(all_results)}")
    print(f"Different results: {different}/{len(all_results)}")
    if all_results:
        avg_speedup = np.mean([r['speedup'] for r in all_results])
        print(f"\nAverage speedup: {avg_speedup:.0f}x")
        print(f"\nConclusion:")
        if identical == len(all_results):
            print(f" ✓ Methods are EQUIVALENT - Single run approach is safe to use")
            print(f" Recommend switching to single run for {avg_speedup:.0f}x faster execution")
        else:
            print(f" ✗ Methods produce DIFFERENT results - Need to understand why")
            print(f" Growing window is slower but may be more accurate for live deployment")


if __name__ == "__main__":
    main()

View file

@ -0,0 +1,123 @@
#!/usr/bin/env python3
"""
Complete test: Feature extraction + Model inference + Phase 1 detection
"""
import sys
import pandas as pd
import numpy as np
import torch
from pathlib import Path
# Make the sibling harvest_date_pred_utils module importable when the script
# is run from another working directory.
sys.path.insert(0, str(Path(__file__).parent))
from harvest_date_pred_utils import (
    load_model_and_config,
    extract_features,
    run_phase1_growing_window
)
# Hard-coded project/paths: this is a one-off debug script for the "angata" project.
project_name = "angata"
base_storage = Path("../laravel_app/storage/app") / project_name / "Data"
CI_DATA_FILE = base_storage / "extracted_ci" / "ci_data_for_python" / "ci_data_for_python.csv"
print("="*80)
print("DEBUG: Model Inference + Phase 1 Detection")
print("="*80)
# Load model
print("\n[1] Loading model...")
model, config, scalers = load_model_and_config(Path("."))
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f" Device: {device}")
print(f" Scalers type: {type(scalers)}")
print(f" Number of scalers: {len(scalers) if isinstance(scalers, list) else 'N/A (dict/object)'}")
# Load CI data
print("\n[2] Loading CI data...")
ci_data = pd.read_csv(CI_DATA_FILE, dtype={'field': str})
ci_data['Date'] = pd.to_datetime(ci_data['Date'])
# Test on a known field (field 1)
test_field = "1"
field_data = ci_data[ci_data['field'] == test_field].sort_values('Date').reset_index(drop=True)
print(f"\n[3] Testing on field {test_field}...")
print(f" Data points: {len(field_data)}")
# Test with first 100 days
subset_100 = field_data.iloc[:100].copy().reset_index(drop=True)
print(f"\n[4] Testing model inference on first 100 days...")
try:
    features = extract_features(subset_100, config['features'], ci_column='value')
    print(f" Features shape: {features.shape}")
    print(f" Features dtype: {features.dtype}")
    # Apply scalers
    # Copy to float first so the scaled values are not truncated by the
    # original array's dtype; one scaler per feature column.
    features_scaled = features.copy().astype(float)
    print(f" Applying {len(scalers)} scalers...")
    for fi, scaler in enumerate(scalers):
        try:
            col_data = features[:, fi].reshape(-1, 1)
            scaled_col = scaler.transform(col_data)
            features_scaled[:, fi] = scaled_col.flatten()
            if fi < 3: # Show first 3 scalers
                print(f" Scaler {fi}: transformed {features[0, fi]:.4f}{features_scaled[0, fi]:.4f}")
        except Exception as e:
            print(f" ERROR in scaler {fi}: {e}")
            raise
    # Run model
    # Batch dimension of 1: the model expects (batch, time, features).
    print(f"\n Running model inference...")
    x_tensor = torch.tensor(features_scaled, dtype=torch.float32).unsqueeze(0).to(device)
    print(f" Tensor shape: {x_tensor.shape}")
    with torch.no_grad():
        imminent_probs, detected_probs = model(x_tensor)
    print(f" Imminent probs shape: {imminent_probs.shape}")
    print(f" Detected probs shape: {detected_probs.shape}")
    print(f" Detected probs dtype: {detected_probs.dtype}")
    # Analyze detected probs
    detected_np = detected_probs[0].cpu().numpy() # Get first (only) batch
    print(f"\n Detected head analysis:")
    print(f" Min: {detected_np.min():.4f}")
    print(f" Max: {detected_np.max():.4f}")
    print(f" Mean: {detected_np.mean():.4f}")
    print(f" Median: {np.median(detected_np):.4f}")
    print(f" > 0.1: {(detected_np > 0.1).sum()} days")
    print(f" > 0.3: {(detected_np > 0.3).sum()} days")
    print(f" > 0.5: {(detected_np > 0.5).sum()} days")
    # Show top 5 peaks
    top_indices = np.argsort(detected_np)[-5:][::-1]
    print(f"\n Top 5 detected peaks:")
    for idx in top_indices:
        date = subset_100.iloc[idx]['Date'].date()
        prob = detected_np[idx]
        print(f" Day {idx} ({date}): {prob:.4f}")
except Exception as e:
    print(f" ERROR: {e}")
    import traceback
    traceback.print_exc()
    sys.exit(1)
# Test Phase 1 growing window
print(f"\n[5] Testing Phase 1 growing window (threshold=0.3, consecutive=2)...")
try:
    phase1_results = run_phase1_growing_window(
        subset_100, model, config, scalers, 'value', device,
        threshold=0.3, consecutive_days=2
    )
    print(f" ✓ Phase 1 found {len(phase1_results)} harvest(s):")
    for harvest_date, harvest_idx in phase1_results:
        print(f" {harvest_date.date()}: index {harvest_idx}")
except Exception as e:
    print(f" ERROR: {e}")
    import traceback
    traceback.print_exc()
print("\n✓ Model inference test complete")

View file

@ -0,0 +1,276 @@
"""
TEST SCRIPT: Inspect raw model output distributions for a single field
Purpose: Diagnose why thresholding is failing and harvest dates are wrong
This shows:
1. Distribution of harvest probability scores (0-1 range)
2. How often consecutive_days=3 is actually achievable
3. Actual season boundaries detected
4. Recommendations for threshold adjustment
Usage:
python test_model_output_distribution.py angata [field_id]
Examples:
python test_model_output_distribution.py angata 1
python test_model_output_distribution.py angata 10042
"""
import pandas as pd
import numpy as np
import torch
import sys
from pathlib import Path
from harvest_date_pred_utils import (
load_model_and_config,
extract_features,
)
def analyze_single_field(ci_data, field_id, model, config, scalers, device):
    """Analyze raw model outputs for a single field.

    Prints (does not return) a diagnostic report: probability statistics,
    threshold distributions, consecutive-run analysis, top peaks, a full
    daily timeline, valley/drop detection, and which harvest dates each
    (threshold, consecutive_days) combination would have detected.

    Args:
        ci_data (pd.DataFrame): CI data with 'field', 'Date', 'FitData' and
            the configured CI column.
        field_id (str): Field identifier to filter on.
        model: Trained torch model (two heads: imminent, detected).
        config (dict): Model config with 'data.ci_column' and 'features'.
        scalers: Per-feature fitted scalers, applied in column order.
        device: torch device to run inference on.
    """
    # Filter to field
    field_data = ci_data[ci_data['field'] == field_id].sort_values('Date').reset_index(drop=True)
    if len(field_data) == 0:
        print(f"ERROR: No data for field {field_id}")
        return
    print(f"\n{'='*80}")
    print(f"FIELD: {field_id}")
    print(f"{'='*80}")
    print(f"Date range: {field_data['Date'].min()} to {field_data['Date'].max()}")
    print(f"Total days: {len(field_data)}")
    # Extract features (same as main pipeline)
    try:
        ci_column = config['data']['ci_column']
        features = extract_features(field_data, config['features'], ci_column=ci_column)
        if features is None or len(features) == 0:
            print(f"ERROR: extract_features returned empty for field {field_id}")
            return
    except Exception as e:
        print(f"ERROR extracting features: {e}")
        import traceback
        traceback.print_exc()
        return
    # Apply scalers (CRITICAL - same as production code)
    # NOTE(review): this transforms `features` in place; assumes the array is
    # float-typed (an int array would truncate the scaled values) — confirm
    # what extract_features returns. Other scripts copy + astype(float) first.
    try:
        for fi, scaler in enumerate(scalers):
            try:
                features[:, fi] = scaler.transform(features[:, fi].reshape(-1, 1)).flatten()
            except Exception as e:
                # A single failing scaler leaves that column unscaled.
                print(f" Warning: Scaler {fi} failed: {e}")
                pass
    except Exception as e:
        print(f"ERROR applying scalers: {e}")
        import traceback
        traceback.print_exc()
        return
    # Run model
    features_tensor = torch.FloatTensor(features).unsqueeze(0).to(device)
    with torch.no_grad():
        output = model(features_tensor)
    # Convert to numpy (handle different output formats)
    # Model has TWO heads: imminent_probs, detected_probs
    if isinstance(output, tuple):
        imminent_probs = output[0].cpu().numpy().flatten()
        detected_probs = output[1].cpu().numpy().flatten()
        probs = detected_probs # Use DETECTED head (current harvest), not imminent (future)
        print(f"Model outputs TWO heads: imminent + detected")
        print(f" Using DETECTED head for harvest detection (sparse spikes at harvest)")
    else:
        # Fallback for single output
        probs = output.cpu().numpy().flatten()
        print(f"Model output: single head")
    print(f"\nModel output shape: {probs.shape}")
    print(f"Output range: {probs.min():.4f} to {probs.max():.4f}")
    # Statistics
    print(f"\nHarvest probability statistics:")
    print(f" Mean: {probs.mean():.4f}")
    print(f" Median: {np.median(probs):.4f}")
    print(f" Std Dev: {probs.std():.4f}")
    print(f" Min: {probs.min():.4f}")
    print(f" Max: {probs.max():.4f}")
    # Distribution by threshold
    print(f"\nDistribution by threshold:")
    thresholds = [0.2, 0.3, 0.4, 0.45, 0.5, 0.6, 0.7]
    for thresh in thresholds:
        count = np.sum(probs > thresh)
        pct = 100 * count / len(probs)
        print(f" > {thresh}: {count:4d} days ({pct:5.1f}%)")
    # Consecutive days analysis (key metric)
    print(f"\nConsecutive days above threshold (Phase 1 requirement):")
    for thresh in [0.3, 0.4, 0.45, 0.5]:
        above = (probs > thresh).astype(int)
        # Find consecutive runs
        # Pad with zeros so runs touching either end still produce a
        # matching +1/-1 pair in the diff.
        changes = np.diff(np.concatenate(([0], above, [0])))
        starts = np.where(changes == 1)[0]
        ends = np.where(changes == -1)[0]
        runs = ends - starts
        if len(runs) > 0:
            max_run = np.max(runs)
            print(f" Threshold {thresh}: max consecutive = {max_run} days (need 3 for Phase 1)")
            if len(runs) > 5:
                print(f" {len(runs)} separate runs detected (seasons?)")
        else:
            print(f" Threshold {thresh}: no runs detected")
    # Show top predictions
    print(f"\nTop 10 harvest probability peaks:")
    top_indices = np.argsort(probs)[-10:][::-1]
    for rank, idx in enumerate(top_indices, 1):
        date = field_data.iloc[idx]['Date']
        prob = probs[idx]
        print(f" {rank:2d}. Day {idx:4d} ({date}): {prob:.4f}")
    # Show timeline
    print(f"\nTimeline of probabilities (sampling every 10 days):")
    for idx in range(0, len(probs), max(1, len(probs) // 20)):
        date_str = field_data.iloc[idx]['Date'].strftime("%Y-%m-%d")
        ci_value = field_data.iloc[idx]['FitData']
        prob = probs[idx]
        bar = '-' * int(prob * 35)
        try:
            print(f" {date_str} CI={ci_value:.4f} Prob={prob:.4f} {bar}")
        except UnicodeEncodeError:
            print(f" {date_str} CI={ci_value:.4f} Prob={prob:.4f}")
    # FULL DAILY PROBABILITIES WITH CI VALUES
    print(f"\n{'='*80}")
    print(f"FULL DAILY PROBABILITIES WITH CI VALUES ({len(probs)} days):")
    print(f"{'='*80}")
    print(f"{'Day':>4} {'Date':<12} {'CI':>8} {'Probability':>12} {'Visual':<40}")
    print(f"{'-'*100}")
    for idx in range(len(probs)):
        date_str = field_data.iloc[idx]['Date'].strftime("%Y-%m-%d")
        ci_value = field_data.iloc[idx]['FitData']
        prob = probs[idx]
        bar = '-' * int(prob * 35) # Use dashes instead of█ for Unicode safety
        try:
            print(f"{idx:4d} {date_str} {ci_value:8.4f} {prob:12.4f} {bar}")
        except UnicodeEncodeError:
            # Fallback for Windows encoding issues
            print(f"{idx:4d} {date_str} {ci_value:8.4f} {prob:12.4f}")
    print(f"{'-'*100}")
    # Find valleys (days with low probabilities that could indicate season boundaries)
    print(f"\nDays with LOWEST probabilities (potential season boundaries):")
    valleys_threshold = 0.5 # Days below this might be season breaks
    valley_indices = np.where(probs < valleys_threshold)[0]
    if len(valley_indices) > 0:
        print(f" Found {len(valley_indices)} days below {valleys_threshold}")
        # Get valleys sorted by probability
        valley_data = [(idx, probs[idx], field_data.iloc[idx]['Date']) for idx in valley_indices]
        valley_data.sort(key=lambda x: x[1]) # Sort by probability (lowest first)
        print(f"\n Bottom 20 lowest-probability days:")
        for rank, (idx, prob, date) in enumerate(valley_data[:20], 1):
            print(f" {rank:2d}. Day {idx:3d} ({date}): {prob:.4f}")
    else:
        print(f" None - all days above {valleys_threshold}")
    # Identify likely harvest dates by finding local minima (valleys between growing periods)
    print(f"\nLikely season boundaries (local minima approach):")
    # Find indices where probability suddenly drops (derivative)
    if len(probs) > 7:
        # 7-day centered rolling mean smooths noise before differencing.
        smoothed = pd.Series(probs).rolling(window=7, center=True).mean()
        derivatives = smoothed.diff().fillna(0)
        # Find big drops (where derivative is very negative)
        drops = np.where(derivatives < -0.2)[0] # Significant downward moves
        if len(drops) > 0:
            print(f" Found {len(drops)} significant drops (prob falling by 0.2+):")
            for idx in drops[:10]: # Show first 10
                date = field_data.iloc[idx]['Date']
                before = probs[max(0, idx-1)]
                after = probs[idx]
                print(f" Day {idx:3d} ({date}): {before:.4f}{after:.4f}")
        else:
            print(f" No significant drops detected (probabilities don't dip much)")
    # Show which harvest dates would be detected at different thresholds
    print(f"\nHarvest detection (first day where prob > threshold for N consecutive days):")
    for thresh in [0.2, 0.3, 0.4, 0.5, 0.6]:
        for consec in [1, 2, 3]:
            above = (probs > thresh).astype(int)
            changes = np.diff(np.concatenate(([0], above, [0])))
            starts = np.where(changes == 1)[0]
            if len(starts) > 0:
                # For each harvest start, find where it would trigger with consecutive_days
                detected_harvests = []
                for start_idx in starts:
                    # Check if we have enough consecutive days
                    if start_idx + consec - 1 < len(probs):
                        if all(probs[start_idx:start_idx + consec] > thresh):
                            harvest_date = field_data.iloc[start_idx]['Date']
                            detected_harvests.append((start_idx, harvest_date))
                if detected_harvests:
                    first_idx, first_date = detected_harvests[0]
                    print(f" Threshold={thresh}, consecutive={consec}: {first_date} (day {first_idx})")
                else:
                    print(f" Threshold={thresh}, consecutive={consec}: None detected")
def main():
    """CLI entry point: load CI data and the trained model, then print raw
    model-output diagnostics for a single field via analyze_single_field.

    argv[1] (optional): project name, defaults to "angata".
    argv[2] (optional): field id; defaults to the first field in the CSV.
    """
    project_name = sys.argv[1] if len(sys.argv) > 1 else "angata"
    field_id = sys.argv[2] if len(sys.argv) > 2 else None
    # Paths
    base_storage = Path("../laravel_app/storage/app") / project_name / "Data"
    ci_data_dir = base_storage / "extracted_ci" / "ci_data_for_python"
    CI_DATA_FILE = ci_data_dir / "ci_data_for_python.csv"
    if not CI_DATA_FILE.exists():
        print(f"ERROR: {CI_DATA_FILE} not found")
        return
    print(f"Loading CI data from {CI_DATA_FILE}...")
    # Keep field ids as strings so leading zeros / mixed ids compare correctly.
    ci_data = pd.read_csv(CI_DATA_FILE, dtype={'field': str})
    ci_data['Date'] = pd.to_datetime(ci_data['Date'])
    if field_id is None:
        # Show first 10 fields if none specified
        fields = sorted(ci_data['field'].unique())[:10]
        field_id = fields[0]
        print(f"No field specified. Testing first field: {field_id}")
        print(f"Available fields: {', '.join(fields)}")
    # Load model
    # (load_model_and_config is imported at module level; the redundant
    # function-local re-import was removed.)
    print(f"\nLoading model...")
    model, config, scalers = load_model_and_config(Path("."))
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Device: {device}")
    # Analyze
    analyze_single_field(ci_data, str(field_id), model, config, scalers, device)
    print(f"\n{'='*80}")
    print("INTERPRETATION:")
    print(" If max consecutive < 3 for threshold=0.5:")
    print(" → Lower threshold to 0.3-0.4, or reduce consecutive_days to 1-2")
    print(" If multiple runs detected but harvest_date == first date:")
    print(" → Season detection is failing (check extract_features)")
    print(" If peaks are scattered randomly:")
    print(" → Model may need retraining or data validation")
    print("="*80)


if __name__ == "__main__":
    main()

View file

@ -0,0 +1,178 @@
#!/usr/bin/env python3
"""
Debug script: Test if script 22 logic is working
Tests the two-step refinement on a single known field
"""
import sys
import time
import pandas as pd
import numpy as np
import torch
from pathlib import Path
sys.path.insert(0, str(Path(__file__).parent.parent.parent))
from harvest_date_pred_utils import (
load_model_and_config,
extract_features,
run_phase1_growing_window,
)
# One-off debug script configuration: hard-coded to the "angata" project.
project_name = "angata"
# Find the workspace root by looking for laravel_app folder
script_dir = Path(__file__).parent
root = script_dir
while root != root.parent:
    if (root / "laravel_app").exists():
        break
    root = root.parent
base_storage = root / "laravel_app" / "storage" / "app" / project_name / "Data"
CI_DATA_FILE = base_storage / "extracted_ci" / "ci_data_for_python" / "ci_data_for_python.csv"
MODEL_DIR = root / "python_app"
print("="*80)
print("DEBUG: Script 22 Two-Step Refinement Logic")
print("="*80)
# Load model
print("\n[1] Loading model...")
model, config, scalers = load_model_and_config(MODEL_DIR)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f" Device: {device}")
print(f" Model features: {config['features']}")
# Load CI data
print("\n[2] Loading CI data...")
# Field ids kept as strings so comparisons against test_field work.
ci_data = pd.read_csv(CI_DATA_FILE, dtype={'field': str})
ci_data['Date'] = pd.to_datetime(ci_data['Date'])
print(f" Total rows: {len(ci_data)}")
print(f" Fields: {ci_data['field'].nunique()}")
print(f" Date range: {ci_data['Date'].min().date()} to {ci_data['Date'].max().date()}")
# Test on a known field (field 779 from our previous tests)
test_field = "779"
field_data = ci_data[ci_data['field'] == test_field].sort_values('Date').reset_index(drop=True)
print(f"\n[3] Testing on field {test_field}...")
print(f" Data points: {len(field_data)}")
print(f" Date range: {field_data['Date'].min().date()} to {field_data['Date'].max().date()}")
if len(field_data) == 0:
    print(f" ERROR: No data for field {test_field}")
    sys.exit(1)
# Extract features
print(f"\n[4] Extracting features for field {test_field}...")
try:
    features = extract_features(field_data.reset_index(drop=True), config['features'], ci_column='value')
    print(f" Features shape: {features.shape}")
    print(f" Features dtype: {features.dtype}")
except Exception as e:
    print(f" ERROR: Could not extract features: {e}")
    sys.exit(1)
# Normalize and run model
print(f"\n[5] Running Phase 1 GROWING WINDOW method (threshold=0.5, consecutive=3)...")
print(f" This simulates real production: expanding windows, checking each day")
print(f" Expected: ~477 model runs for 477 days (SLOW)")
import time  # NOTE(review): redundant — time is already imported at the top of the file
start_time = time.time()
# Add instrumentation to see how many model runs are happening
# NOTE(review): original_run is captured but never used below.
original_run = run_phase1_growing_window
def instrumented_run(*args, **kwargs):
    """Instrumented mirror of run_phase1_growing_window that counts model runs.

    Positional args (mirroring the production signature):
        field_data, model, config, scalers, ci_column, device
    Keyword args:
        threshold (float): detected-head probability threshold (default 0.3).
        consecutive_days (int): days above threshold required (default 2).

    Returns:
        list[tuple]: (harvest_date, absolute_row_index) per detected harvest.
    """
    field_data = args[0]
    model = args[1]
    config = args[2]
    scalers = args[3]
    ci_column = args[4]
    device = args[5]
    threshold = kwargs.get('threshold', 0.3)
    consecutive_days = kwargs.get('consecutive_days', 2)
    harvest_dates = []
    current_pos = 0
    model_runs = 0
    print(f" Starting growing window loop...")
    while current_pos < len(field_data):
        consecutive_above_threshold = 0
        for window_end in range(current_pos + 1, len(field_data) + 1):
            # The window covers absolute rows [current_pos, window_end).
            window_data = field_data.iloc[current_pos:window_end].copy().reset_index(drop=True)
            try:
                features = extract_features(window_data, config['features'], ci_column=ci_column)
                # Scale on a float copy so integer features are not truncated.
                features_scaled = features.copy().astype(float)
                for fi, scaler in enumerate(scalers):
                    try:
                        features_scaled[:, fi] = scaler.transform(features[:, fi].reshape(-1, 1)).flatten()
                    except Exception as e:
                        raise ValueError(f"Scaler {fi} failed: {e}") from e
                with torch.no_grad():
                    x_tensor = torch.tensor(features_scaled, dtype=torch.float32).unsqueeze(0).to(device)
                    imminent_probs, detected_probs = model(x_tensor)
                model_runs += 1
                # Only the newest day of the window decides the streak.
                last_prob = detected_probs[0, -1].item()
                if last_prob > threshold:
                    consecutive_above_threshold += 1
                else:
                    consecutive_above_threshold = 0
                if consecutive_above_threshold >= consecutive_days:
                    # BUGFIX: window_end is already an absolute row index (the
                    # range starts at current_pos + 1), so the harvest index is
                    # window_end - consecutive_days. The previous code added
                    # current_pos again, double-counting it for every harvest
                    # after the first (wrong dates / potential IndexError).
                    harvest_idx = window_end - consecutive_days
                    harvest_date = field_data.iloc[harvest_idx]['Date']
                    harvest_dates.append((harvest_date, harvest_idx))
                    current_pos = harvest_idx + 1
                    break
            except Exception:
                # Best-effort mirror of the production loop: a failing window
                # is skipped rather than aborting the whole field.
                pass
        else:
            # Inner loop exhausted the data without detecting a harvest.
            break
    print(f" Model runs performed: {model_runs}")
    return harvest_dates
# Run the instrumented growing-window detection over the full field history.
phase1_results = instrumented_run(
    field_data.reset_index(drop=True),
    model, config, scalers, 'value', device,
    threshold=0.5,
    consecutive_days=3
)
elapsed = time.time() - start_time
print(f"\n Time elapsed: {elapsed:.2f}s")
if phase1_results:
    print(f" ✓ Phase 1 detected {len(phase1_results)} harvest(s):")
    # Get probabilities for display by running model once on full field
    with torch.no_grad():
        X = features.reshape(1, -1, len(config['features']))
        # NOTE(review): np.zeros_like inherits X's dtype — if extract_features
        # returns an integer array the scaled values below would be truncated;
        # confirm the feature matrix is float.
        X_normalized = np.zeros_like(X)
        for fi, scaler in enumerate(scalers):
            X_normalized[0, :, fi] = scaler.transform(X[0, :, fi].reshape(-1, 1)).flatten()
        X_tensor = torch.from_numpy(X_normalized).float().to(device)
        _, detected_probs = model(X_tensor)
        detected_np = detected_probs[0].cpu().numpy()
    for harvest_date, harvest_idx in phase1_results:
        # Guard against indices past the probability sequence length.
        prob = detected_np[harvest_idx] if harvest_idx < len(detected_np) else 0.0
        print(f" {harvest_date.date()}: index {harvest_idx}, probability={prob:.4f}")
else:
    print(f" ✗ Phase 1: No harvest detected")

View file

@ -1,29 +0,0 @@
# Merge all CI RDS files into a single CSV
library(tidyverse)

# Paths
ci_data_dir <- "r_app/experiments/ci_graph_exploration/CI_data"
output_csv <- "python_app/lstm_ci_data_combined.csv"

# Find all RDS files
rds_files <- list.files(ci_data_dir, pattern = "\\.rds$", full.names = TRUE)
print(paste("Found", length(rds_files), "RDS files"))

# Load each RDS file, tag its rows with the client name, and combine once.
# (A single bind_rows over a list avoids the quadratic copying caused by
# growing a tibble with bind_rows inside a for-loop.)
data_list <- lapply(rds_files, function(file) {
  filename <- basename(file)
  client_name <- sub("\\.rds$", "", filename) # Extract client name from filename
  print(paste("Loading:", filename, "- Client:", client_name))
  data <- readRDS(file)
  data$client <- client_name
  data
})
combined_data <- bind_rows(data_list)
print(paste("Total rows:", nrow(combined_data)))
print(paste("Columns:", paste(names(combined_data), collapse = ", ")))

# Write to CSV
write.csv(combined_data, output_csv, row.names = FALSE)
print(paste("✓ Saved to:", output_csv))

View file

@ -0,0 +1,791 @@
#!/usr/bin/env python
"""
RGB Visualization Tool for Harvest Date Validation
Creates 3x3 temporal grids showing satellite imagery around registered and predicted harvest dates.
Extracts RGB from 8-band Planet scope data and clips to field boundaries from GeoJSON.
Functions:
- load_field_boundaries(): Load field geometries from GeoJSON
- find_closest_tiff(): Find available TIFF file closest to target date
- load_and_clip_tiff_rgb(): Load TIFF, extract RGB, clip to field boundary
- create_temporal_grid(): Create 3x3 grid (4 pre-harvest, 1 near, 2-3 post-harvest)
- generate_rgb_grids(): Main orchestration function
Usage:
from rgb_visualization import generate_rgb_grids
generate_rgb_grids(field_data, field_id, registered_harvest_dates, predicted_harvest_dates, output_dir, tiff_dir, geojson_path)
"""
import json
import numpy as np
import pandas as pd
from pathlib import Path
from datetime import datetime, timedelta
import matplotlib
matplotlib.use('Agg') # Use non-interactive backend to avoid display hangs
import matplotlib.pyplot as plt
import matplotlib.patches as patches
from matplotlib.colors import Normalize
import warnings
warnings.filterwarnings('ignore')
# Optional geo dependencies: every rasterio-using function below checks
# `rasterio is None` and degrades to a no-op when the import failed.
# NOTE(review): if shapely (the last import) fails, rasterio is also reset to
# None, so the guard covers shgeom too — confirm that ordering stays intact.
try:
    import rasterio
    from rasterio.mask import mask
    import shapely.geometry as shgeom
except ImportError:
    print("Warning: rasterio not available. RGB visualization will be skipped.")
    rasterio = None
def load_field_boundaries(geojson_path, field_id):
    """
    Load a single field's boundary from a GeoJSON file.

    The feature is matched on its 'field' or 'sub_field' property. For a
    MultiPolygon only the first polygon's exterior ring is kept; for a
    Polygon only the exterior ring is kept (holes are ignored).

    Args:
        geojson_path (Path): Path to pivot.geojson
        field_id (str): Field identifier (e.g., "13973")

    Returns:
        tuple: (GeoJSON feature dict, shapely Polygon) on success,
        (None, None) when the field is missing or the file cannot be read.
    """
    wanted = str(field_id)
    try:
        with open(geojson_path) as fh:
            collection = json.load(fh)
        for feature in collection.get('features', []):
            props = feature.get('properties', {})
            # Accept a match on either identifier property.
            if wanted not in (str(props.get('field', '')), str(props.get('sub_field', ''))):
                continue
            geometry = feature.get('geometry')
            if not geometry:
                continue
            geom_type = geometry.get('type', '')
            coordinates = geometry.get('coordinates', [])
            if not coordinates:
                continue
            if geom_type == 'MultiPolygon':
                # coordinates[i] = [[exterior ring], [hole], ...]; take the
                # first polygon's exterior ring only.
                ring = coordinates[0][0]
            elif geom_type == 'Polygon':
                # coordinates = [[exterior ring], [hole], ...]
                ring = coordinates[0]
            else:
                continue
            return feature, shgeom.Polygon(ring)
        print(f" ⚠ Field {field_id} not found in GeoJSON")
        return None, None
    except Exception as e:
        print(f" ✗ Error loading GeoJSON: {e}")
        return None, None
def find_overlapping_tiles(target_date, tiff_dir, field_boundary, days_window=60, exclude_dates=None, debug=False):
    """
    Find tile files with actual data (not cloud-masked) for target_date or nearest date.
    Searches by increasing distance from target date until finding tiles with data.
    Avoids reusing dates in exclude_dates to ensure temporal diversity in grids.
    Args:
        target_date (pd.Timestamp): Target date to find tiles near
        tiff_dir (Path): Directory containing 5x5 date subdirectories
        field_boundary (shapely.Polygon): Field boundary for overlap detection
        days_window (int): Max days to search before/after target
        exclude_dates (list): List of dates to skip (avoid repetition)
        debug (bool): Enable detailed debugging output
    Returns:
        tuple: (list of tile paths, actual_date, days_diff)

    NOTE(review): days_window is documented as a search limit but is never
    checked in the loop below — the search currently scans ALL available
    dates ordered by distance. Confirm whether that is intended.
    """
    target_date = pd.Timestamp(target_date)
    tiff_dir = Path(tiff_dir)
    exclude_dates = exclude_dates or []
    exclude_dates = [pd.Timestamp(d) for d in exclude_dates]
    if not tiff_dir.exists():
        if debug:
            print(f" [DEBUG] TIFF dir does not exist: {tiff_dir}")
        return [], None, None
    # Build map of all available dates
    # Each subdirectory name is expected to start with an ISO date before '_'.
    available_dates = {}
    date_parse_errors = 0
    for date_dir in tiff_dir.iterdir():
        if not date_dir.is_dir():
            continue
        try:
            dir_name = date_dir.name
            date_str = dir_name.split('_')[0]
            tile_date = pd.Timestamp(date_str)
            tile_files = []
            for tile_file in date_dir.glob('*.tif'):
                # Include ALL tiles, regardless of size
                # Some tiles may be small but still contain valid data for specific fields
                tile_files.append(tile_file)
            if tile_files:
                available_dates[tile_date] = (tile_files, dir_name)
        except Exception as e:
            date_parse_errors += 1
            if debug:
                print(f" [DEBUG] Failed to parse date from {date_dir.name}: {e}")
    if debug:
        print(f" [DEBUG] Found {len(available_dates)} dates with tile files, {date_parse_errors} parse errors")
        print(f" [DEBUG] Date range: {min(available_dates.keys()).strftime('%Y-%m-%d') if available_dates else 'N/A'} to {max(available_dates.keys()).strftime('%Y-%m-%d') if available_dates else 'N/A'}")
    if not available_dates:
        return [], None, None
    # Search dates by increasing distance from target, looking for tiles with actual data
    sorted_dates = sorted(available_dates.keys(), key=lambda d: abs((d - target_date).days))
    for search_date in sorted_dates:
        # Skip if this date was recently used (avoid temporal repetition)
        if search_date in exclude_dates:
            continue
        tiles, dir_name = available_dates[search_date]
        days_diff = (search_date - target_date).days
        # Try to find overlapping tiles at this date
        # Overlap is tested against the tile's bounding box, not its pixels.
        overlapping_tiles = []
        tile_check_errors = 0
        for tile_path in tiles:
            try:
                with rasterio.open(tile_path) as src:
                    tile_bounds = src.bounds
                    tile_geom = shgeom.box(*tile_bounds)
                    # Debug first tile
                    if debug and len(overlapping_tiles) == 0 and tile_check_errors == 0:
                        print(f" [DEBUG] First tile check for {tile_path.name}:")
                        print(f" Tile bounds: {tile_bounds}")
                        print(f" Tile CRS: {src.crs}")
                        print(f" Field bounds: {field_boundary.bounds}")
                        print(f" Field geom type: {field_boundary.geom_type}")
                        print(f" Intersects: {tile_geom.intersects(field_boundary)}")
                    if tile_geom.intersects(field_boundary):
                        overlapping_tiles.append(tile_path)
            except Exception as e:
                tile_check_errors += 1
                if debug:
                    print(f" [DEBUG] Error checking tile {tile_path.name}: {e}")
        if debug:
            print(f" [DEBUG] Date {search_date.strftime('%Y-%m-%d')}: {len(tiles)} tiles, {len(overlapping_tiles)} overlap field, {tile_check_errors} errors")
        if overlapping_tiles:
            # Found overlapping tiles, return them
            print(f" [FIND_TILES] Target: {target_date.strftime('%Y-%m-%d')}, Using: {search_date.strftime('%Y-%m-%d')} ({days_diff:+d}d), Tiles: {[Path(t).name for t in overlapping_tiles]}")
            return overlapping_tiles, search_date, days_diff
    # No overlapping tiles found at all
    if debug:
        print(f" [DEBUG] No overlapping tiles found for {target_date.strftime('%Y-%m-%d')} within {len(sorted_dates)} searched dates")
    return [], None, None
def load_and_clip_tiff_rgb(tiff_path, field_boundary, rgb_bands=(1, 2, 3)):
    """
    Load TIFF and extract RGB bands clipped to field boundary.
    For merged_final_tif files (cloud-masked and filtered):
    - Band 1: Red
    - Band 2: Green
    - Band 3: Blue
    - Band 4: NIR
    - Band 5: CI
    Args:
        tiff_path (Path): Path to TIFF file
        field_boundary (shapely.Polygon): Field boundary for clipping
        rgb_bands (tuple): Band indices for RGB (1-indexed, defaults to 1,2,3 for merged_final_tif)
    Returns:
        np.ndarray: RGB data (height, width, 3) with values 0-1
        or None if error occurs
    """
    # No-op when rasterio is unavailable or there is no boundary to clip to.
    if rasterio is None or field_boundary is None:
        return None
    try:
        with rasterio.open(tiff_path) as src:
            # BUGFIX: honor the rgb_bands parameter — it was previously
            # ignored (bands_to_read hard-coded to (1, 2, 3)) even though the
            # docstring advertises it. The default is unchanged.
            bands_to_read = tuple(rgb_bands)
            # Check the file has the highest requested band
            if src.count < max(bands_to_read):
                return None
            # Mask and read bands - extract ONLY the specific field polygon
            geom = shgeom.mapping(field_boundary)
            try:
                masked_data, _ = mask(src, [geom], crop=True, indexes=list(bands_to_read))
                rgb = np.stack([masked_data[i] for i in range(3)], axis=-1)
            except (ValueError, RuntimeError) as e:
                # Mask failed - field doesn't overlap this tile or geometry issue
                print(f" MASK ERROR on {Path(tiff_path).name}: {str(e)[:50]}")
                return None
            # Convert to float32 if not already
            rgb = rgb.astype(np.float32)
            # Normalize to 0-1 range
            # Data appears to be 8-bit (0-255 range) stored as float32
            # Check actual max value to determine normalization
            max_val = np.nanmax(rgb)
            if max_val > 0:
                # If max is around 255 or less, assume 8-bit
                if max_val <= 255:
                    rgb = rgb / 255.0
                # If max is around 65535, assume 16-bit
                elif max_val <= 65535:
                    rgb = rgb / 65535.0
                # Otherwise divide by max to normalize
                else:
                    rgb = rgb / max_val
            rgb = np.clip(rgb, 0, 1)
            # Check if result is all NaN
            if np.all(np.isnan(rgb)):
                return None
            # Replace any remaining NaN with 0 (cloud/invalid pixels)
            rgb = np.nan_to_num(rgb, nan=0.0)
            return rgb
    except Exception:
        # Any other read/geometry failure is treated as "no image available".
        return None
def load_and_composite_tiles_rgb(tile_paths, field_boundary):
    """
    Load RGB from multiple overlapping tiles and composite them into a single image.
    Args:
        tile_paths (list[Path]): List of tile file paths
        field_boundary (shapely.Polygon): Field boundary for clipping
    Returns:
        np.ndarray: Composited RGB data (height, width, 3) with values 0-1
        or None if error occurs
    """
    # Nothing to do without rasterio, a boundary, or any tiles.
    if rasterio is None or field_boundary is None or not tile_paths:
        return None
    try:
        # Load and composite all tiles
        rgb_arrays = []
        for tile_path in tile_paths:
            rgb = load_and_clip_tiff_rgb(tile_path, field_boundary)
            if rgb is not None:
                rgb_arrays.append(rgb)
        if not rgb_arrays:
            return None
        # If single tile, return it
        if len(rgb_arrays) == 1:
            composited = rgb_arrays[0]
        else:
            # If multiple tiles, need to handle different shapes
            # Find common shape or use max/min approach that handles variable sizes
            try:
                # Try to stack if same shape
                # Per-pixel max merges overlapping tiles (brightest pixel wins).
                stacked = np.stack(rgb_arrays, axis=0)
                composited = np.max(stacked, axis=0)
            except ValueError:
                # Different shapes - use the largest (most complete) tile
                # This happens when tiles are masked to different field areas
                # NOTE(review): this discards data from the smaller tiles
                # rather than mosaicking them — acceptable for visualization?
                composited = max(rgb_arrays, key=lambda x: x.size)
        composited = composited.astype(np.float32)
        # Stretch contrast: normalize to 0-1 range based on actual min/max in the data
        # This makes dim images visible
        valid_data = composited[composited > 0]
        if len(valid_data) > 0:
            data_min = np.percentile(valid_data, 2) # 2nd percentile to handle outliers
            data_max = np.percentile(valid_data, 98) # 98th percentile
            if data_max > data_min:
                composited = (composited - data_min) / (data_max - data_min)
                composited = np.clip(composited, 0, 1)
        return composited.astype(np.float32)
    except Exception as e:
        return None
def has_valid_rgb_data(rgb_data, threshold=0.05):
    """
    Check if RGB image has actual data (not black/empty).
    Returns True if max value > threshold (not all zeros/black).

    Args:
        rgb_data (np.ndarray | None): RGB array in the 0-1 range, or None.
        threshold (float): Minimum brightness for the image to count as lit.

    Returns:
        bool-like: True when the brightest value exceeds ``threshold`` AND
        there is some pixel variation (max - min > 0.01); False otherwise.
    """
    if rgb_data is None:
        return False
    try:
        # Check if there's any variation in the RGB data
        data_max = np.nanmax(rgb_data)
        data_min = np.nanmin(rgb_data)
        # Image is valid if max > threshold AND there's variation
        has_data = data_max > threshold and (data_max - data_min) > 0.01
        return has_data
    except Exception:
        # BUGFIX: was a bare `except:`, which also swallowed SystemExit and
        # KeyboardInterrupt; only unexpected array errors should yield False.
        return False
def create_temporal_rgb_grid(harvest_date, field_data, field_id, tiff_dir, field_boundary,
                             title, output_dir, harvest_type='registered', model_name=None, harvest_index=None):
    """
    Create a 3-row x 5-column temporal grid around a harvest date (15 images).

    Images are anchored on the acquisition closest to the harvest date and
    collected outwards with flexible spacing (nominally ~7-day steps, widening
    progressively when imagery is missing or cloudy).

    Grid layout (cell labels, relative to the anchor image):
        Row 1: T-56d, T-49d, T-42d, T-35d, T-28d   (pre-harvest)
        Row 2: T-21d, T-14d, T-7d,  HARVEST, T+7d  (near harvest)
        Row 3: T+14d, T+21d, T+28d, T+35d, T+42d   (post-harvest progression)

    Args:
        harvest_date (pd.Timestamp): Target harvest date
        field_data (pd.DataFrame): Field data with Date column (not used for
            image selection here; kept for interface compatibility)
        field_id (str): Field identifier
        tiff_dir (Path): Directory with per-date TIFF sub-folders (YYYY-MM-DD)
        field_boundary (shapely.Polygon): Field boundary
        title (str): Plot title
        output_dir (Path): Output directory
        harvest_type (str): 'registered' or 'predicted'
        model_name (str): Model name for predicted harvests (e.g., 'Original', 'Long-Season')
        harvest_index (int): Index of harvest within same model (for multiple harvests)

    Returns:
        Path: Path to saved PNG, or None if the figure could not be saved.
    """
    harvest_date = pd.Timestamp(harvest_date)
    # Pre-allocate 15 image slots; index 8 is the centre (anchor/harvest) cell.
    rgb_images = [None] * 15
    days_offsets = [None] * 15
    actual_dates = [None] * 15
    used_dates = set()  # Use set for efficient lookups
    # STEP 0: Debug - List all available dates
    print(f" [STEP 0] Checking available TIFF dates in {tiff_dir}...")
    available_dates = []
    if tiff_dir.exists():
        for date_folder in sorted(tiff_dir.iterdir()):
            if date_folder.is_dir():
                try:
                    date_obj = datetime.strptime(date_folder.name, '%Y-%m-%d').date()
                    available_dates.append(date_obj)
                except ValueError:
                    # Folder name is not a YYYY-MM-DD date - skip it.
                    # (Narrowed from a bare `except:`.)
                    pass
    print(f" Found {len(available_dates)} dates with data: {available_dates[:5]}... (showing first 5)")
    # STEP 1: Find anchor image (closest to predicted harvest date) FIRST
    # Search within ±14 days of predicted harvest date first, then expand if needed
    print(f" [STEP 1] Finding anchor (closest to harvest {harvest_date.strftime('%Y-%m-%d')}, searching ±14 days)...")
    anchor_tile_paths, anchor_date, anchor_days_diff = find_overlapping_tiles(
        harvest_date, tiff_dir, field_boundary, days_window=14, exclude_dates=[], debug=False
    )
    anchor_rgb = None
    anchor_idx = 8  # Position 8 is the center (T~0d / harvest date position)
    failed_anchor_dates = []  # Track dates that failed validation
    if anchor_tile_paths and anchor_date:
        anchor_rgb = load_and_composite_tiles_rgb(anchor_tile_paths, field_boundary)
        if anchor_rgb is not None and has_valid_rgb_data(anchor_rgb):
            rgb_images[anchor_idx] = anchor_rgb
            days_offsets[anchor_idx] = 0  # Anchor is reference point
            actual_dates[anchor_idx] = anchor_date
            used_dates.add(anchor_date)
            print(f" ✓ ANCHOR FOUND (±14d): {anchor_date.strftime('%Y-%m-%d')} ({anchor_days_diff:+d}d from predicted harvest)")
        else:
            # Tiles were found but the image is empty/black; remember the date
            # so the wider retry cannot pick it again.
            failed_anchor_dates.append(anchor_date)
            print(f" ⚠ Found date {anchor_date.strftime('%Y-%m-%d')} within ±14d, but image has no valid data")
            print(f" [RETRY] Expanding anchor search to ±60 days (excluding failed dates)...")
            anchor_tile_paths, anchor_date, anchor_days_diff = find_overlapping_tiles(
                harvest_date, tiff_dir, field_boundary, days_window=60, exclude_dates=set(failed_anchor_dates), debug=False
            )
            if anchor_tile_paths and anchor_date:
                anchor_rgb = load_and_composite_tiles_rgb(anchor_tile_paths, field_boundary)
                if anchor_rgb is not None and has_valid_rgb_data(anchor_rgb):
                    rgb_images[anchor_idx] = anchor_rgb
                    days_offsets[anchor_idx] = 0  # Anchor is reference point
                    actual_dates[anchor_idx] = anchor_date
                    used_dates.add(anchor_date)
                    print(f" ✓ ANCHOR FOUND (±60d): {anchor_date.strftime('%Y-%m-%d')} ({anchor_days_diff:+d}d from predicted harvest)")
                else:
                    failed_anchor_dates.append(anchor_date)
                    print(f" ✗ No valid anchor found even within ±60 days")
            else:
                print(f" ✗ No tiles found for any date within ±60 days")
    else:
        print(f" ⚠ No tiles found within ±14 days, expanding search...")
        anchor_tile_paths, anchor_date, anchor_days_diff = find_overlapping_tiles(
            harvest_date, tiff_dir, field_boundary, days_window=60, exclude_dates=[], debug=False
        )
        if anchor_tile_paths and anchor_date:
            anchor_rgb = load_and_composite_tiles_rgb(anchor_tile_paths, field_boundary)
            if anchor_rgb is not None and has_valid_rgb_data(anchor_rgb):
                rgb_images[anchor_idx] = anchor_rgb
                days_offsets[anchor_idx] = 0  # Anchor is reference point
                actual_dates[anchor_idx] = anchor_date
                used_dates.add(anchor_date)
                print(f" ✓ ANCHOR FOUND (±60d): {anchor_date.strftime('%Y-%m-%d')} ({anchor_days_diff:+d}d from predicted harvest)")
            else:
                print(f" ✗ No valid anchor found even within ±60 days")
        else:
            print(f" ✗ No tiles found for any date within ±60 days")
    # STEPS 2 & 3 need a concrete anchor date to walk outwards from.
    # Fixed: previously, when no anchor was found at all (anchor_date is
    # None), `last_found_date.year` below raised AttributeError.
    if anchor_date is not None:
        # STEP 2: Dynamically collect images BEFORE anchor date
        # Strategy: Go backwards from anchor with progressively larger search windows
        # Start at 7 days, then try 10, 15, 20, 30+ days apart
        print(f" [STEP 2] Collecting images BEFORE anchor (going backwards, flexible spacing)...")
        before_positions = [7, 6, 5, 4, 3, 2, 1, 0]  # Will fill in reverse order (7→0)
        before_images = []  # (position, date, rgb, offset)
        pos_idx = 0  # Index into before_positions
        last_found_date = anchor_date
        # Progressive search offsets: try these day offsets in order
        search_offsets = [7, 10, 15, 20, 30, 40, 60, 90, 120]  # Days before last found image
        # NOTE(review): the `.year >= 2024` floor hard-codes the earliest
        # season - confirm whether this needs updating for other projects.
        while pos_idx < len(before_positions) and last_found_date.year >= 2024:
            found_this_iteration = False
            # Try each offset until we find a valid image
            for days_offset in search_offsets:
                search_target_date = last_found_date - timedelta(days=days_offset)
                tile_paths, actual_date, days_diff = find_overlapping_tiles(
                    search_target_date, tiff_dir, field_boundary, days_window=60, exclude_dates=used_dates, debug=False
                )
                if tile_paths and actual_date:
                    rgb = load_and_composite_tiles_rgb(tile_paths, field_boundary)
                    if rgb is not None and has_valid_rgb_data(rgb):
                        # Found valid image!
                        overall_max = np.nanmax(rgb)
                        overall_min = np.nanmin(rgb)
                        offset_from_anchor = (actual_date - anchor_date).days
                        before_images.append((before_positions[pos_idx], actual_date, rgb, offset_from_anchor))
                        used_dates.add(actual_date)
                        last_found_date = actual_date  # Move backwards from this date
                        print(f" ✓ Before[{pos_idx}]: {actual_date.strftime('%Y-%m-%d')} ({offset_from_anchor:+d}d from anchor) - RGB: {overall_min:.4f}-{overall_max:.4f}")
                        pos_idx += 1
                        found_this_iteration = True
                        break  # Found one, stop trying larger offsets
            # If nothing found with any offset, we're done collecting before images
            if not found_this_iteration:
                break
        # Store collected before images
        for pos, actual_date, rgb, offset in before_images:
            rgb_images[pos] = rgb
            actual_dates[pos] = actual_date
            days_offsets[pos] = offset
        # STEP 3: Dynamically collect images AFTER anchor date
        # Strategy: Go forwards from anchor with progressively larger search windows
        # Start at 7 days, then try 10, 15, 20, 30+ days apart
        print(f" [STEP 3] Collecting images AFTER anchor (going forwards, flexible spacing)...")
        after_positions = [9, 10, 11, 12, 13, 14]  # Will fill in order (9→14)
        after_images = []  # (position, date, rgb, offset)
        pos_idx = 0  # Index into after_positions
        last_found_date = anchor_date
        max_search_date = anchor_date + timedelta(days=200)  # Don't search beyond 200 days forward
        # Progressive search offsets: try these day offsets in order
        search_offsets = [7, 10, 15, 20, 30, 40, 60, 90, 120]  # Days after last found image
        while pos_idx < len(after_positions) and last_found_date < max_search_date:
            found_this_iteration = False
            # Try each offset until we find a valid image
            for days_offset in search_offsets:
                search_target_date = last_found_date + timedelta(days=days_offset)
                # Don't search beyond max date
                if search_target_date > max_search_date:
                    break
                tile_paths, actual_date, days_diff = find_overlapping_tiles(
                    search_target_date, tiff_dir, field_boundary, days_window=60, exclude_dates=used_dates, debug=False
                )
                if tile_paths and actual_date:
                    rgb = load_and_composite_tiles_rgb(tile_paths, field_boundary)
                    if rgb is not None and has_valid_rgb_data(rgb):
                        # Found valid image!
                        overall_max = np.nanmax(rgb)
                        overall_min = np.nanmin(rgb)
                        offset_from_anchor = (actual_date - anchor_date).days
                        after_images.append((after_positions[pos_idx], actual_date, rgb, offset_from_anchor))
                        used_dates.add(actual_date)
                        last_found_date = actual_date  # Move forwards from this date
                        print(f" ✓ After[{pos_idx}]: {actual_date.strftime('%Y-%m-%d')} ({offset_from_anchor:+d}d from anchor) - RGB: {overall_min:.4f}-{overall_max:.4f}")
                        pos_idx += 1
                        found_this_iteration = True
                        break  # Found one, stop trying larger offsets
            # If nothing found with any offset, we're done collecting after images
            if not found_this_iteration:
                break
        # Store collected after images
        for pos, actual_date, rgb, offset in after_images:
            rgb_images[pos] = rgb
            actual_dates[pos] = actual_date
            days_offsets[pos] = offset
    # Create 5x3 grid plot (15 images)
    fig, axes = plt.subplots(3, 5, figsize=(25, 15))
    # Build title with anchor offset information
    anchor_offset_from_harvest = (actual_dates[8] - harvest_date).days if actual_dates[8] is not None else None
    if anchor_offset_from_harvest is not None and anchor_offset_from_harvest != 0:
        anchor_info = f"(Anchor: {actual_dates[8].strftime('%Y-%m-%d')}, {anchor_offset_from_harvest:+d}d from predicted harvest)"
    else:
        anchor_info = f"(Exact match with anchor: {actual_dates[8].strftime('%Y-%m-%d')})" if actual_dates[8] is not None else ""
    fig.suptitle(f'{title}\nField {field_id} - {harvest_type.upper()} Harvest: {harvest_date.strftime("%Y-%m-%d")} {anchor_info}',
                 fontsize=16, fontweight='bold')
    # Grid positions (5 columns, 3 rows = 15 images)
    positions = [
        ('T-56d', 0, 0), ('T-49d', 0, 1), ('T-42d', 0, 2), ('T-35d', 0, 3), ('T-28d', 0, 4),
        ('T-21d', 1, 0), ('T-14d', 1, 1), ('T-7d', 1, 2), ('HARVEST', 1, 3), ('T+7d', 1, 4),
        ('T+14d', 2, 0), ('T+21d', 2, 1), ('T+28d', 2, 2), ('T+35d', 2, 3), ('T+42d', 2, 4),
    ]
    for idx, (label, row, col) in enumerate(positions):
        ax = axes[row, col]
        if idx < len(rgb_images) and rgb_images[idx] is not None:
            rgb_data = rgb_images[idx]
            # Debug: check data range for ALL bands
            data_min = np.nanmin(rgb_data)
            data_max = np.nanmax(rgb_data)
            data_mean = np.nanmean(rgb_data)
            data_std = np.nanstd(rgb_data)
            # Check per-band stats
            r_min, r_max, r_mean = np.nanmin(rgb_data[:, :, 0]), np.nanmax(rgb_data[:, :, 0]), np.nanmean(rgb_data[:, :, 0])
            g_min, g_max, g_mean = np.nanmin(rgb_data[:, :, 1]), np.nanmax(rgb_data[:, :, 1]), np.nanmean(rgb_data[:, :, 1])
            b_min, b_max, b_mean = np.nanmin(rgb_data[:, :, 2]), np.nanmax(rgb_data[:, :, 2]), np.nanmean(rgb_data[:, :, 2])
            print(f" DEBUG VALID {label} ({actual_dates[idx].strftime('%Y-%m-%d')}): RGB overall {data_min:.4f}-{data_max:.4f} (mean={data_mean:.4f}, std={data_std:.4f})")
            print(f" R: {r_min:.4f}-{r_max:.4f} (μ={r_mean:.4f}), G: {g_min:.4f}-{g_max:.4f} (μ={g_mean:.4f}), B: {b_min:.4f}-{b_max:.4f} (μ={b_mean:.4f})")
            # Display with explicit vmin/vmax to handle normalized 0-1 data
            ax.imshow(rgb_data, vmin=0, vmax=1)
            # Build title: show BOTH anchor offset AND harvest offset
            if days_offsets[idx] is not None:
                offset_from_anchor = days_offsets[idx]
                offset_from_harvest = (actual_dates[idx] - harvest_date).days
                if idx == 8:  # ANCHOR/HARVEST position
                    if offset_from_harvest == 0:
                        offset_str = f"HARVEST\n(Image: {actual_dates[idx].strftime('%Y-%m-%d')})"
                    else:
                        offset_str = f"HARVEST\n(Image: {actual_dates[idx].strftime('%Y-%m-%d')}, {offset_from_harvest:+d}d from predicted)"
                else:
                    # Show both offsets: from anchor and from harvest
                    offset_str = f"{offset_from_anchor:+d}d from anchor\n{offset_from_harvest:+d}d from harvest\n{actual_dates[idx].strftime('%Y-%m-%d')}"
            else:
                offset_str = "No Data"
            ax.set_title(offset_str, fontsize=9, fontweight='bold')
            # Add red box around the ANCHOR IMAGE (position 8 is harvest/anchor)
            if idx == 8:  # Position 8 is the anchor
                for spine in ax.spines.values():
                    spine.set_edgecolor('red')
                    spine.set_linewidth(4)
        else:
            ax.text(0.5, 0.5, 'No Data', ha='center', va='center', fontsize=12, color='gray')
            ax.set_title('No Data', fontsize=10)
            print(f" DEBUG EMPTY {label}: No image data collected")
            # Add red box for anchor position even if no data
            if idx == 8:  # Position 8 is the anchor
                for spine in ax.spines.values():
                    spine.set_edgecolor('red')
                    spine.set_linewidth(4)
        ax.set_xticks([])
        ax.set_yticks([])
    plt.tight_layout()
    # Save figure with detailed naming: field_<ID>_<harvestdate>_<model>_harvest<N>_rgb.png
    harvest_date_str = harvest_date.strftime('%Y%m%d')
    if harvest_type == 'registered':
        filename = f'field_{field_id}_{harvest_date_str}_registered_harvest_rgb.png'
    else:
        # For predicted: include model name and harvest index if multiple
        if harvest_index is not None and harvest_index > 0:
            filename = f'field_{field_id}_{harvest_date_str}_{model_name}_harvest{harvest_index}_rgb.png'
        else:
            filename = f'field_{field_id}_{harvest_date_str}_{model_name}_harvest_rgb.png'
    output_path = Path(output_dir) / filename
    try:
        plt.savefig(output_path, dpi=100, format='png')
        plt.close()
        # Fixed: success message previously printed a literal "(unknown)"
        # instead of the saved file path.
        print(f" ✓ Saved: {output_path}")
        return output_path
    except Exception as e:
        plt.close()
        print(f" ✗ Error saving PNG: {e}")
        return None
def generate_rgb_grids(field_data, field_id, registered_harvest_dates, predicted_harvest_dates,
                       output_dir, tiff_dir, geojson_path):
    """
    Orchestrate temporal RGB-grid creation for a single field.

    Builds one grid per harvest date, for:
      1. Registered harvest dates (if any)
      2. Predicted harvest dates (if any), grouped per model so each model
         keeps its own harvest numbering

    Args:
        field_data (pd.DataFrame): Field data with Date, CI columns
        field_id (str): Field identifier
        registered_harvest_dates (list): Registered harvest dates (pd.Timestamp)
        predicted_harvest_dates (list): Predicted harvest dates (dict or pd.Timestamp)
        output_dir (Path): Output directory for plots
        tiff_dir (Path): Directory containing TIFF files
        geojson_path (Path): Path to pivot.geojson

    Returns:
        dict: {'registered': [...], 'predicted': [...]} lists of saved PNG paths
    """
    if rasterio is None:
        print(" ⚠ Rasterio not available - skipping RGB visualization")
        return {'registered': [], 'predicted': []}

    out_dir = Path(output_dir)
    out_dir.mkdir(parents=True, exist_ok=True)
    tiles_dir = Path(tiff_dir)
    boundary_path = Path(geojson_path)

    # Guard clauses: both input locations must exist before doing any work.
    if not tiles_dir.exists():
        print(f" ✗ TIFF directory not found: {tiles_dir}")
        return {'registered': [], 'predicted': []}
    if not boundary_path.exists():
        print(f" ✗ GeoJSON not found: {boundary_path}")
        return {'registered': [], 'predicted': []}

    # Field boundary drives clipping of every tile in every grid.
    print(f" Loading field boundary for {field_id}...")
    feature, field_boundary = load_field_boundaries(boundary_path, field_id)
    if field_boundary is None:
        print(f" ✗ Could not load field boundary for {field_id}")
        return {'registered': [], 'predicted': []}

    summary = {'registered': [], 'predicted': []}

    # --- Registered harvest dates -------------------------------------------
    if registered_harvest_dates and len(registered_harvest_dates) > 0:
        total_registered = len(registered_harvest_dates)
        print(f" Processing {total_registered} registered harvest dates...")
        for pos, reg_date in enumerate(registered_harvest_dates, start=1):
            if pd.isna(reg_date):
                continue
            print(f" [{pos}/{total_registered}] {reg_date.strftime('%Y-%m-%d')}")
            saved = create_temporal_rgb_grid(
                reg_date, field_data, field_id, tiles_dir, field_boundary,
                title='Registered Harvest Validation',
                output_dir=out_dir,
                harvest_type='registered',
                model_name=None,
                harvest_index=pos - 1
            )
            if saved:
                summary['registered'].append(saved)

    # --- Predicted harvest dates, grouped per model -------------------------
    if predicted_harvest_dates and len(predicted_harvest_dates) > 0:
        total_predicted = len(predicted_harvest_dates)
        print(f" Processing {total_predicted} predicted harvest dates...")
        # Entries may be plain timestamps or dicts carrying a model name;
        # group them so harvest indices restart per model.
        per_model = {}
        for entry in predicted_harvest_dates:
            if isinstance(entry, dict):
                pred_date = entry.get('harvest_date')
                model = entry.get('model_name', 'predicted')
            else:
                pred_date, model = entry, 'predicted'
            per_model.setdefault(model, []).append(pred_date)

        running_count = 1
        for model, model_dates in per_model.items():
            for model_idx, pred_date in enumerate(model_dates):
                if pd.isna(pred_date):
                    continue
                print(f" [{running_count}/{total_predicted}] {pred_date.strftime('%Y-%m-%d')} ({model})")
                saved = create_temporal_rgb_grid(
                    pred_date, field_data, field_id, tiles_dir, field_boundary,
                    title=f'Predicted Harvest Validation ({model})',
                    output_dir=out_dir,
                    harvest_type='predicted',
                    model_name=model,
                    harvest_index=model_idx
                )
                if saved:
                    summary['predicted'].append(saved)
                running_count += 1

    return summary
if __name__ == '__main__':
    # Example usage
    # Running this file directly performs no processing; it only prints a
    # short usage reminder, since the module is meant to be imported.
    print("RGB Visualization Tool")
    print("This module is intended to be imported and called from compare_307_models_production.py")
    print("\nExample:")
    print(" from rgb_visualization import generate_rgb_grids")
    print(" generate_rgb_grids(field_data, field_id, registered_dates, predicted_dates, output_dir, tiff_dir, geojson_path)")

View file

@ -6,14 +6,33 @@
#' 2. Create master 5×5 grid covering all TIFFs
#' 3. Split each daily TIFF into 25 tiles using the master grid
#' 4. Save tiles in date-specific folders: daily_tiles/[DATE]/[DATE]_[TILE_ID].tif
#' & "C:\Program Files\R\R-4.4.3\bin\x64\Rscript.exe" r_app/10_create_master_grid_and_split_tiffs.R 2026-01-13 2026-01-18
library(terra)
library(sf)
# ============================================================================
# CONFIGURATION
# CONFIGURATION & COMMAND-LINE ARGUMENTS
# ============================================================================
# Parse command-line arguments for date filtering
args <- commandArgs(trailingOnly = TRUE)
# Example: Rscript 10_create_master_grid_and_split_tiffs.R 2026-01-13 2026-01-17
start_date <- NULL
end_date <- NULL
if (length(args) >= 1) {
start_date <- as.Date(args[1])
cat("Filtering: start date =", as.character(start_date), "\n")
}
if (length(args) >= 2) {
end_date <- as.Date(args[2])
cat("Filtering: end date =", as.character(end_date), "\n")
}
PROJECT <- "angata"
TIFF_FOLDER <- file.path("laravel_app", "storage", "app", PROJECT, "merged_tif_8b")
@ -40,31 +59,50 @@ cat("Grid subfolder: daily_tiles_split/", GRID_SIZE_LABEL, "/\n", sep = "")
cat("\n[PART 1] Creating Master Grid\n")
# Load field boundaries for overlap checking
cat("\n[1] Loading field boundaries from GeoJSON...\n")
cat("\n[1] Checking for existing master grid...\n")
if (!file.exists(GEOJSON_PATH)) {
stop("GeoJSON file not found at: ", GEOJSON_PATH, "\n",
"Please ensure ", PROJECT, " has a pivot.geojson file.")
}
# Check if master grid already exists
MASTER_GRID_PATH <- file.path(OUTPUT_FOLDER, paste0("master_grid_", GRID_SIZE_LABEL, ".geojson"))
field_boundaries_sf <- st_read(GEOJSON_PATH, quiet = TRUE)
field_boundaries_vect <- terra::vect(GEOJSON_PATH)
cat(" ✓ Loaded ", nrow(field_boundaries_sf), " field(s)\n", sep = "")
# Try to find a name column (could be 'name', 'field', 'field_name', etc.)
field_names <- NA
if ("name" %in% names(field_boundaries_sf)) {
field_names <- field_boundaries_sf$name
} else if ("field" %in% names(field_boundaries_sf)) {
field_names <- field_boundaries_sf$field
} else if ("field_name" %in% names(field_boundaries_sf)) {
field_names <- field_boundaries_sf$field_name
if (file.exists(MASTER_GRID_PATH)) {
cat(" ✓ Found existing master grid at:\n ", MASTER_GRID_PATH, "\n", sep = "")
master_grid_sf <- st_read(MASTER_GRID_PATH, quiet = TRUE)
field_boundaries_sf <- NULL # No need to load pivot.geojson
field_boundaries_vect <- NULL
cat(" ✓ Loaded grid with ", nrow(master_grid_sf), " tiles\n", sep = "")
} else {
field_names <- 1:nrow(field_boundaries_sf) # Fall back to indices
# No existing grid - need to create one from pivot.geojson
cat(" No existing grid found. Creating new one from pivot.geojson...\n")
if (!file.exists(GEOJSON_PATH)) {
stop("GeoJSON file not found at: ", GEOJSON_PATH, "\n",
"Please ensure ", PROJECT, " has a pivot.geojson file, or run this script ",
"from the same directory as a previous successful run (grid already exists).")
}
field_boundaries_sf <- st_read(GEOJSON_PATH, quiet = TRUE)
field_boundaries_vect <- terra::vect(GEOJSON_PATH)
cat(" ✓ Loaded ", nrow(field_boundaries_sf), " field(s) from GeoJSON\n", sep = "")
}
cat(" Fields: ", paste(field_names, collapse = ", "), "\n", sep = "")
# Try to find a name column (only if field_boundaries_sf exists)
if (!is.null(field_boundaries_sf)) {
field_names <- NA
if ("name" %in% names(field_boundaries_sf)) {
field_names <- field_boundaries_sf$name
} else if ("field" %in% names(field_boundaries_sf)) {
field_names <- field_boundaries_sf$field
} else if ("field_name" %in% names(field_boundaries_sf)) {
field_names <- field_boundaries_sf$field_name
} else {
field_names <- 1:nrow(field_boundaries_sf) # Fall back to indices
}
cat(" Fields: ", paste(field_names, collapse = ", "), "\n", sep = "")
}
# Helper function: Check if a tile overlaps with any field (simple bbox overlap)
tile_overlaps_fields <- function(tile_extent, field_geoms) {
@ -105,6 +143,27 @@ cat("\n[2] Checking TIFF extents...\n")
tiff_files <- list.files(TIFF_FOLDER, pattern = "\\.tif$", full.names = FALSE)
tiff_files <- sort(tiff_files)
# Filter by date range if specified
if (!is.null(start_date) || !is.null(end_date)) {
cat("\nApplying date filter...\n")
file_dates <- as.Date(sub("\\.tif$", "", tiff_files))
if (!is.null(start_date) && !is.null(end_date)) {
keep_idx <- file_dates >= start_date & file_dates <= end_date
cat(" Date range: ", as.character(start_date), " to ", as.character(end_date), "\n", sep = "")
} else if (!is.null(start_date)) {
keep_idx <- file_dates >= start_date
cat(" From: ", as.character(start_date), "\n", sep = "")
} else {
keep_idx <- file_dates <= end_date
cat(" Until: ", as.character(end_date), "\n", sep = "")
}
tiff_files <- tiff_files[keep_idx]
cat(" ✓ Filtered to ", length(tiff_files), " file(s)\n", sep = "")
}
if (length(tiff_files) == 0) {
stop("No TIFF files found in ", TIFF_FOLDER)
}
@ -242,26 +301,33 @@ if (file.exists(master_grid_file)) {
cat("\n[PART 2] Creating Filtered Grid (only overlapping tiles)\n")
cat("\n[7] Filtering master grid to only overlapping tiles...\n")
# Check which tiles overlap with any field
overlapping_tile_indices <- c()
for (tile_idx in 1:nrow(master_grid_sf)) {
tile_geom <- master_grid_sf[tile_idx, ]
# If grid was loaded from file, it's already filtered. Skip filtering.
if (!file.exists(MASTER_GRID_PATH)) {
cat("\n[7] Filtering master grid to only overlapping tiles...\n")
# Check overlap with any field
if (tile_overlaps_fields(st_bbox(tile_geom$geometry), field_boundaries_sf$geometry)) {
overlapping_tile_indices <- c(overlapping_tile_indices, tile_idx)
# Check which tiles overlap with any field
overlapping_tile_indices <- c()
for (tile_idx in 1:nrow(master_grid_sf)) {
tile_geom <- master_grid_sf[tile_idx, ]
# Check overlap with any field
if (tile_overlaps_fields(st_bbox(tile_geom$geometry), field_boundaries_sf$geometry)) {
overlapping_tile_indices <- c(overlapping_tile_indices, tile_idx)
}
}
cat(" Found ", length(overlapping_tile_indices), " overlapping tiles out of ", N_TILES, "\n", sep = "")
cat(" Reduction: ", N_TILES - length(overlapping_tile_indices), " empty tiles will NOT be created\n", sep = "")
# Create filtered grid with only overlapping tiles
filtered_grid_sf <- master_grid_sf[overlapping_tile_indices, ]
filtered_grid_sf$tile_id <- sprintf("%02d", overlapping_tile_indices)
} else {
cat("\n[7] Using pre-filtered grid (already loaded from file)...\n")
# Grid was already loaded - it's already filtered
filtered_grid_sf <- master_grid_sf
}
cat(" Found ", length(overlapping_tile_indices), " overlapping tiles out of ", N_TILES, "\n", sep = "")
cat(" Reduction: ", N_TILES - length(overlapping_tile_indices), " empty tiles will NOT be created\n", sep = "")
# Create filtered grid with only overlapping tiles
filtered_grid_sf <- master_grid_sf[overlapping_tile_indices, ]
filtered_grid_sf$tile_id <- sprintf("%02d", overlapping_tile_indices)
# Convert to SpatVector for makeTiles
filtered_grid_vect <- terra::vect(filtered_grid_sf)
@ -314,7 +380,7 @@ for (file_idx in seq_along(tiff_files)) {
dir.create(date_output_folder, recursive = TRUE, showWarnings = FALSE)
}
cat(" Creating ", length(overlapping_tile_indices), " tiles...\n", sep = "")
cat(" Creating ", nrow(filtered_grid_sf), " tiles...\n", sep = "")
# Use makeTiles with FILTERED grid (only overlapping tiles)
tiles_list <- terra::makeTiles(

View file

@ -13,13 +13,13 @@
#
# Examples:
# # Angata 8-band data (with UDM cloud masking)
# & 'C:\Program Files\R\R-4.4.3\bin\x64\Rscript' r_app/02_ci_extraction.R 2026-01-02 7 angata merged_tif_8b
# & 'C:\Program Files\R\R-4.4.3\bin\x64\Rscript' r_app/20_ci_extraction.R 2026-01-02 7 angata merged_tif_8b
#
# # Aura 4-band data
# Rscript 02_ci_extraction.R 2025-11-26 7 aura merged_tif
# Rscript 20_ci_extraction.R 2025-11-26 7 aura merged_tif
#
# # Auto-detects and uses tiles if available:
# Rscript 02_ci_extraction.R 2026-01-02 7 angata (uses tiles if daily_tiles_split/ exists)
# Rscript 20_ci_extraction.R 2026-01-02 7 angata (uses tiles if daily_tiles_split/ exists)
# 1. Load required packages
# -----------------------

View file

@ -8,7 +8,7 @@
#
# Usage: Rscript interpolate_growth_model.R [project_dir]
# - project_dir: Project directory name (e.g., "chemba")
# & 'C:\Program Files\R\R-4.4.3\bin\x64\Rscript' r_app/03_interpolate_growth_model.R angata
# & 'C:\Program Files\R\R-4.4.3\bin\x64\Rscript' r_app/30_interpolate_growth_model.R angata
# 1. Load required packages
# -----------------------

View file

@ -14,8 +14,8 @@
# - tile_size: Tile size in km (default: 5, only used if use_tiles=TRUE)
#
# Examples:
# Rscript 04_mosaic_creation.R 2025-12-21 7 angata
# Rscript 04_mosaic_creation.R 2025-12-21 7 angata week_51.tif TRUE 5 [tile-based]
# & "C:\Program Files\R\R-4.4.3\bin\x64\Rscript.exe" r_app/40_mosaic_creation.R 2026-01-12 7 angata
#
# 1. Load required packages
@ -54,14 +54,12 @@ main <- function() {
if (is.na(end_date)) {
message("Invalid end_date provided. Using current date.")
end_date <- Sys.Date()
end_date <- "2026-01-01" # Default date for testing
}
} else if (exists("end_date_str", envir = .GlobalEnv)) {
end_date <- as.Date(get("end_date_str", envir = .GlobalEnv))
} else {
# Default to current date if no argument is provided
end_date <- Sys.Date()
end_date <- "2026-01-01" # Default date for testing
message("No end_date provided. Using current date: ", format(end_date))
}
@ -142,15 +140,35 @@ main <- function() {
if (use_tile_mosaic) {
# TILE-BASED APPROACH: Create per-tile weekly MAX mosaics
# This is used for projects like Angata with large ROIs requiring spatial partitioning
# Input data comes from merged_final_tif/{grid_size}/{DATE}/{DATE}_XX.tif (5-band tiles from script 20)
tryCatch({
safe_log("Starting per-tile mosaic creation (tile-based approach)...")
# Set output directory for per-tile mosaics
tile_output_base <- file.path(laravel_storage, "weekly_tile_max")
# Detect grid size from merged_final_tif folder structure
# Expected: merged_final_tif/5x5/ or merged_final_tif/10x10/ etc.
merged_final_base <- file.path(laravel_storage, "merged_final_tif")
grid_subfolders <- list.dirs(merged_final_base, full.names = FALSE, recursive = FALSE)
# Look for grid size patterns like "5x5", "10x10", "20x20"
grid_patterns <- grep("^\\d+x\\d+$", grid_subfolders, value = TRUE)
if (length(grid_patterns) == 0) {
stop("No grid size subfolder found in merged_final_tif/ (expected: 5x5, 10x10, etc.)")
}
grid_size <- grid_patterns[1] # Use first grid size found
safe_log(paste("Detected grid size:", grid_size))
# Point to the grid-specific merged_final_tif directory
merged_final_with_grid <- file.path(merged_final_base, grid_size)
# Set output directory for per-tile mosaics, organized by grid size
# Output: weekly_tile_max/{grid_size}/week_WW_YYYY_TT.tif
tile_output_base <- file.path(laravel_storage, "weekly_tile_max", grid_size)
dir.create(tile_output_base, recursive = TRUE, showWarnings = FALSE)
created_tile_files <- create_weekly_mosaic_from_tiles(
dates = dates,
merged_final_dir = merged_final,
merged_final_dir = merged_final_with_grid,
tile_output_dir = tile_output_base,
field_boundaries = field_boundaries
)

View file

@ -354,18 +354,17 @@ create_mosaic <- function(tif_files, cloud_coverage_stats, field_boundaries = NU
}
# Get filenames of best-coverage images
# Match by extracting tile ID from both cloud stats and TIF filenames
# Match by full filename from cloud stats to TIF files
rasters_to_use <- character()
for (idx in best_coverage) {
# Extract tile ID from cloud_coverage_stats filename (e.g., "tile_18" → 18)
# Get the full filename from cloud coverage stats
cc_filename <- cloud_coverage_stats$filename[idx]
cc_tile_id <- gsub(".*_([0-9]+).*", "\\1", cc_filename)
# Find matching TIF file by matching tile ID
# Find matching TIF file by full filename
matching_tif <- NULL
for (tif_file in tif_files) {
tif_tile_id <- gsub(".*_([0-9]+)\\.tif", "\\1", basename(tif_file))
if (tif_tile_id == cc_tile_id) {
tif_basename <- basename(tif_file)
if (tif_basename == cc_filename) {
matching_tif <- tif_file
break
}
@ -373,6 +372,8 @@ create_mosaic <- function(tif_files, cloud_coverage_stats, field_boundaries = NU
if (!is.null(matching_tif)) {
rasters_to_use <- c(rasters_to_use, matching_tif)
} else {
safe_log(paste("Warning: Could not find TIF file matching cloud stats entry:", cc_filename), "WARNING")
}
}
@ -420,42 +421,60 @@ create_mosaic <- function(tif_files, cloud_coverage_stats, field_boundaries = NU
mosaic <- tryCatch({
safe_log(paste("Creating max composite from", length(all_rasters), "images to fill clouds"))
# Get extent from field boundaries if available, otherwise use raster intersection
if (!is.null(field_boundaries)) {
crop_extent <- terra::ext(field_boundaries)
safe_log("Using field boundaries extent for consistent area across all dates")
# Check if all rasters have identical grids (extent and resolution)
# This is likely for per-tile mosaics from the same tiling scheme
reference_raster <- all_rasters[[1]]
ref_ext <- terra::ext(reference_raster)
ref_res <- terra::res(reference_raster)
grids_match <- all(sapply(all_rasters[-1], function(r) {
isTRUE(all.equal(terra::ext(r), ref_ext, tolerance = 1e-6)) &&
isTRUE(all.equal(terra::res(r), ref_res, tolerance = 1e-6))
}))
if (grids_match) {
# All rasters have matching grids - no cropping/resampling needed!
safe_log("All rasters have identical grids - stacking directly for max composite")
raster_collection <- terra::sprc(all_rasters)
max_mosaic <- terra::mosaic(raster_collection, fun = "max")
} else {
# Fallback: use intersection of all raster extents
crop_extent <- terra::ext(all_rasters[[1]])
for (i in 2:length(all_rasters)) {
crop_extent <- terra::intersect(crop_extent, terra::ext(all_rasters[[i]]))
# Grids don't match - need to crop and resample
safe_log("Rasters have different grids - cropping and resampling to common extent")
# Get extent from field boundaries if available, otherwise use raster union
if (!is.null(field_boundaries)) {
crop_extent <- terra::ext(field_boundaries)
safe_log("Using field boundaries extent for consistent area across all dates")
} else {
# Use union of all extents (covers all data)
crop_extent <- terra::ext(all_rasters[[1]])
for (i in 2:length(all_rasters)) {
crop_extent <- terra::union(crop_extent, terra::ext(all_rasters[[i]]))
}
safe_log("Using raster union extent")
}
safe_log("Using raster intersection extent")
# Crop all rasters to common extent
cropped_rasters <- lapply(all_rasters, function(r) {
terra::crop(r, crop_extent)
})
# Resample all cropped rasters to match the first one's grid
reference_grid <- cropped_rasters[[1]]
aligned_rasters <- lapply(cropped_rasters, function(r) {
if (isTRUE(all.equal(terra::ext(r), terra::ext(reference_grid), tolerance = 1e-6)) &&
isTRUE(all.equal(terra::res(r), terra::res(reference_grid), tolerance = 1e-6))) {
return(r) # Already aligned
}
terra::resample(r, reference_grid, method = "near")
})
# Create max composite using mosaic on aligned rasters
raster_collection <- terra::sprc(aligned_rasters)
max_mosaic <- terra::mosaic(raster_collection, fun = "max")
}
# Crop all rasters to common extent
cropped_rasters <- lapply(all_rasters, function(r) {
terra::crop(r, crop_extent)
})
# Resample all cropped rasters to match the first one's grid
# This handles pixel grid misalignment from Python's dynamic extent adjustment
reference_grid <- cropped_rasters[[1]]
safe_log("Resampling rasters to common grid for stacking")
aligned_rasters <- lapply(cropped_rasters, function(r) {
if (identical(terra::ext(r), terra::ext(reference_grid)) &&
identical(terra::res(r), terra::res(reference_grid))) {
return(r) # Already aligned
}
terra::resample(r, reference_grid, method = "near")
})
# Create max composite using mosaic on aligned rasters
# Resample ensures all rasters have matching grids (no resolution mismatch)
raster_collection <- terra::sprc(aligned_rasters)
max_mosaic <- terra::mosaic(raster_collection, fun = "max")
max_mosaic
}, error = function(e) {
safe_log(paste("Max composite creation failed:", e$message), "WARNING")
@ -686,6 +705,16 @@ create_weekly_mosaic_from_tiles <- function(dates, merged_final_dir, tile_output
next
}
# DEBUG: Check mosaic content before saving
safe_log(paste(" DEBUG: Mosaic tile", tile_id, "dimensions:", nrow(tile_mosaic), "x", ncol(tile_mosaic)))
safe_log(paste(" DEBUG: Mosaic tile", tile_id, "bands:", terra::nlyr(tile_mosaic)))
# Check first band values
band1 <- tile_mosaic[[1]]
band1_min <- terra::global(band1, fun = "min", na.rm = TRUE)$min
band1_max <- terra::global(band1, fun = "max", na.rm = TRUE)$max
safe_log(paste(" DEBUG: Band 1 MIN=", round(band1_min, 2), "MAX=", round(band1_max, 2)))
# Step 2c: Save this tile's weekly MAX mosaic
# Filename format: week_WW_YYYY_TT.tif (e.g., week_02_2026_01.tif for week 2, 2026, tile 1)
tile_filename <- paste0("week_", sprintf("%02d", dates$week), "_", dates$year, "_",
@ -763,7 +792,7 @@ count_cloud_coverage_for_tile <- function(tile_files, field_boundaries = NULL) {
missing_pct <- round(100 - ((total_notna / total_pixels) * 100))
aggregated_results[[idx]] <- data.frame(
filename = paste0("tile_", sprintf("%02d", as.integer(gsub(".*_([0-9]+)\\.tif", "\\1", basename(tile_file))))),
filename = basename(tile_file), # Keep full filename: 2026-01-07_03.tif
notNA = total_notna,
total_pixels = total_pixels,
missing_pixels_percentage = missing_pct,

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,258 @@
# 80_REPORT_BUILDING_UTILS.R
# ============================================================================
# UTILITY FUNCTIONS FOR REPORT GENERATION AND EXCEL/CSV EXPORT
#
# This file contains reusable functions for:
# - Field analysis summary generation
# - Excel/CSV/RDS export functionality
# - Farm-level KPI aggregation and summary
# - Tile-based KPI extraction (alternative calculation method)
#
# Used by: 80_calculate_kpis.R, run_full_pipeline.R, other reporting scripts
# ============================================================================
# ============================================================================
# SUMMARY GENERATION
# ============================================================================
generate_field_analysis_summary <- function(field_df) {
  # Build the "Summary" sheet for the weekly field-analysis export: acreage
  # totals per growth phase, per status trigger, and per cloud-coverage
  # category, plus field counts for cloud coverage and the grand total.
  # The Acreage column deliberately mixes numbers and "N fields" strings,
  # so the whole vector coerces to character (as in the exported workbook).
  message("Generating summary statistics...")

  # Sum Acreage over rows selected by `mask`; NA mask entries select NA
  # elements which na.rm drops, matching direct logical subsetting.
  acreage_for <- function(mask) sum(field_df$Acreage[mask], na.rm = TRUE)
  count_for <- function(mask) sum(mask, na.rm = TRUE)

  total_acreage <- sum(field_df$Acreage, na.rm = TRUE)

  phase_levels <- c("Germination", "Tillering", "Grand Growth", "Maturation", "Unknown")
  phase_acreage <- vapply(phase_levels,
                          function(p) acreage_for(field_df$Phase == p),
                          numeric(1))

  trigger_levels <- c("harvest_ready", "stress_detected_whole_field", "strong_recovery",
                      "growth_on_track", "germination_complete", "germination_started")
  trigger_acreage <- vapply(trigger_levels,
                            function(tr) acreage_for(field_df$Status_trigger == tr),
                            numeric(1))
  no_trigger_acreage <- acreage_for(is.na(field_df$Status_trigger))

  cloud_levels <- c("Clear view", "Partial coverage", "No image available")
  cloud_fields <- vapply(cloud_levels,
                         function(cc) count_for(field_df$Cloud_category == cc),
                         numeric(1))
  cloud_acreage <- vapply(cloud_levels,
                          function(cc) acreage_for(field_df$Cloud_category == cc),
                          numeric(1))

  summary_df <- data.frame(
    Category = c(
      "--- PHASE DISTRIBUTION ---",
      "Germination",
      "Tillering",
      "Grand Growth",
      "Maturation",
      "Unknown phase",
      "--- STATUS TRIGGERS ---",
      "Harvest ready",
      "Stress detected",
      "Strong recovery",
      "Growth on track",
      "Germination complete",
      "Germination started",
      "No trigger",
      "--- CLOUD COVERAGE (FIELDS) ---",
      cloud_levels,
      "--- CLOUD COVERAGE (ACREAGE) ---",
      cloud_levels,
      "--- TOTAL ---",
      "Total Acreage"
    ),
    Acreage = c(
      NA,
      round(phase_acreage, 2),
      NA,
      round(trigger_acreage, 2),
      round(no_trigger_acreage, 2),
      NA,
      paste0(cloud_fields, " fields"),
      NA,
      round(cloud_acreage, 2),
      NA,
      round(total_acreage, 2)
    ),
    stringsAsFactors = FALSE
  )
  return(summary_df)
}
# ============================================================================
# EXPORT FUNCTIONS
# ============================================================================
export_field_analysis_excel <- function(field_df, summary_df, project_dir, current_week, year, reports_dir) {
  # Export the per-field analysis to three formats under <reports_dir>/kpis/:
  # an Excel workbook ("Field Data" sheet plus an optional "Summary" sheet),
  # a CSV of the field data, and an RDS bundle with run metadata.
  #
  # Args:
  #   field_df:    per-field analysis data.frame (required).
  #   summary_df:  summary-sheet data.frame, or NULL to omit that sheet.
  #   project_dir: project name, used as file-name prefix.
  #   current_week, year: encoded into the file-name suffix (weekWW_YYYY).
  #   reports_dir: root reports directory; subfolders are created as needed.
  # Returns: list(excel = , rds = , csv = ) with the written file paths.
  message("Exporting per-field analysis to Excel, CSV, and RDS...")
  # All numeric columns rounded to 2 decimals for presentation.
  field_df_rounded <- field_df %>%
    mutate(across(where(is.numeric), ~ round(., 2)))
  # Handle NULL summary_df
  summary_df_rounded <- if (!is.null(summary_df)) {
    summary_df %>%
      mutate(across(where(is.numeric), ~ round(., 2)))
  } else {
    NULL
  }
  output_subdir <- file.path(reports_dir, "kpis", "field_analysis")
  if (!dir.exists(output_subdir)) {
    dir.create(output_subdir, recursive = TRUE)
  }
  excel_filename <- paste0(project_dir, "_field_analysis_week", sprintf("%02d_%d", current_week, year), ".xlsx")
  excel_path <- file.path(output_subdir, excel_filename)
  # Windows-style normalization; mustWork = FALSE because the file does not
  # exist yet at this point.
  excel_path <- normalizePath(excel_path, winslash = "\\", mustWork = FALSE)
  # Build sheets list dynamically
  sheets <- list(
    "Field Data" = field_df_rounded
  )
  if (!is.null(summary_df_rounded)) {
    sheets[["Summary"]] <- summary_df_rounded
  }
  write_xlsx(sheets, excel_path)
  message(paste("✓ Field analysis Excel exported to:", excel_path))
  # The RDS bundle keeps both tables plus metadata for downstream scripts.
  kpi_data <- list(
    field_analysis = field_df_rounded,
    field_analysis_summary = summary_df_rounded,
    metadata = list(
      current_week = current_week,
      year = year,
      project = project_dir,
      created_at = Sys.time()
    )
  )
  rds_filename <- paste0(project_dir, "_kpi_summary_tables_week", sprintf("%02d_%d", current_week, year), ".rds")
  # NOTE(review): written to kpis/ (not the field_analysis subdir); assumes
  # reports_dir/kpis exists, which dir.create above guarantees via recursive.
  rds_path <- file.path(reports_dir, "kpis", rds_filename)
  saveRDS(kpi_data, rds_path)
  message(paste("✓ Field analysis RDS exported to:", rds_path))
  csv_filename <- paste0(project_dir, "_field_analysis_week", sprintf("%02d_%d", current_week, year), ".csv")
  csv_path <- file.path(output_subdir, csv_filename)
  write_csv(field_df_rounded, csv_path)
  message(paste("✓ Field analysis CSV exported to:", csv_path))
  return(list(excel = excel_path, rds = rds_path, csv = csv_path))
}
# ============================================================================
# TILE-BASED KPI EXTRACTION (Alternative calculation method)
# ============================================================================
# [COMMENTED OUT / UNUSED - kept for reference]
# These functions provide tile-based extraction as an alternative to field_statistics approach
# Currently replaced by calculate_field_statistics() in 80_weekly_stats_utils.R
# Uncomment if parallel processing of tiles is needed in future
# calculate_field_kpis_from_tiles <- function(tile_dir, week_num, year, field_boundaries_sf, tile_grid) {
# message("Calculating field-level KPI statistics from tiles...")
#
# tile_pattern <- sprintf("week_%02d_%d_([0-9]{2})\\.tif", week_num, year)
# tile_files <- list.files(tile_dir, pattern = tile_pattern, full.names = TRUE)
#
# if (length(tile_files) == 0) {
# message("No tiles found for week", week_num, year)
# return(NULL)
# }
#
# message(paste("Processing", length(tile_files), "tiles in parallel..."))
#
# field_kpi_list <- furrr::future_map(
# tile_files,
# ~ process_single_kpi_tile(
# tile_file = .,
# field_boundaries_sf = field_boundaries_sf,
# tile_grid = tile_grid
# ),
# .progress = TRUE,
# .options = furrr::furrr_options(seed = TRUE)
# )
#
# field_kpi_stats <- dplyr::bind_rows(field_kpi_list)
#
# if (nrow(field_kpi_stats) == 0) {
# message(" No KPI data extracted from tiles")
# return(NULL)
# }
#
# message(paste(" Extracted KPI stats for", length(unique(field_kpi_stats$field)), "unique fields"))
# return(field_kpi_stats)
# }
# process_single_kpi_tile <- function(tile_file, field_boundaries_sf, tile_grid) {
# # Helper function for calculate_field_kpis_from_tiles
# tryCatch({
# tile_basename <- basename(tile_file)
# tile_raster <- terra::rast(tile_file)
# ci_band <- tile_raster[[1]]
#
# field_bbox <- sf::st_bbox(field_boundaries_sf)
# ci_cropped <- terra::crop(ci_band, terra::ext(field_bbox), snap = "out")
#
# extracted_vals <- terra::extract(ci_cropped, field_boundaries_sf, fun = "mean", na.rm = TRUE)
#
# tile_results <- data.frame()
# tile_id_match <- as.numeric(sub(".*_(\\d{2})\\.tif$", "\\1", tile_basename))
#
# for (field_idx in seq_len(nrow(field_boundaries_sf))) {
# field_id <- field_boundaries_sf$field[field_idx]
# mean_ci <- extracted_vals[field_idx, 2]
#
# if (is.na(mean_ci)) {
# next
# }
#
# tile_results <- rbind(tile_results, data.frame(
# field = field_id,
# tile_id = tile_id_match,
# tile_file = tile_basename,
# mean_ci = round(mean_ci, 4),
# stringsAsFactors = FALSE
# ))
# }
#
# return(tile_results)
#
# }, error = function(e) {
# message(paste(" Warning: Error processing tile", basename(tile_file), ":", e$message))
# return(data.frame())
# })
# }
# calculate_and_export_farm_kpis <- function(report_date, project_dir, field_boundaries_sf,
# harvesting_data, cumulative_CI_vals_dir,
# weekly_CI_mosaic, reports_dir, current_week, year,
# tile_grid, use_tile_mosaic = FALSE, tile_grid_size = "5x5") {
# # Farm-level KPI calculation using tile-based extraction (alternative approach)
# # [Implementation kept as reference for alternative calculation method]
# }

View file

@ -0,0 +1,941 @@
# 80_WEEKLY_STATS_UTILS.R
# ============================================================================
# UTILITY FUNCTIONS FOR WEEKLY STATISTICS CALCULATION
#
# This file contains reusable functions for:
# - Tile grid management
# - Tile loading and merging
# - Field-level statistics calculation from CI rasters
# - Weekly stats caching (RDS/CSV export/import)
# - KPI trend calculations
# - Historical data loading and auto-generation from mosaics
#
# Used by: 80_calculate_kpis.R, run_full_pipeline.R, other reporting scripts
# ============================================================================
# ============================================================================
# TILE-AWARE HELPER FUNCTIONS
# ============================================================================
get_tile_ids_for_field <- function(field_geom, tile_grid, field_id = NULL) {
  # Return ids of all tiles whose bounding boxes overlap the bounding box of
  # `field_geom` (an sf object or a terra SpatVector).
  if (inherits(field_geom, "sf")) {
    bb <- sf::st_bbox(field_geom)
    fx_min <- bb["xmin"]
    fx_max <- bb["xmax"]
    fy_min <- bb["ymin"]
    fy_max <- bb["ymax"]
  } else if (inherits(field_geom, "SpatVector")) {
    bb <- terra::ext(field_geom)
    fx_min <- bb$xmin
    fx_max <- bb$xmax
    fy_min <- bb$ymin
    fy_max <- bb$ymax
  } else {
    stop("field_geom must be sf or terra::vect object")
  }
  # Two axis-aligned boxes overlap exactly when neither lies entirely to one
  # side of the other (De Morgan form of the original exclusion test).
  overlaps <- tile_grid$xmax >= fx_min & tile_grid$xmin <= fx_max &
    tile_grid$ymax >= fy_min & tile_grid$ymin <= fy_max
  return(as.numeric(tile_grid$id[overlaps]))
}
load_tiles_for_field <- function(field_geom, tile_ids, week_num, year, mosaic_dir) {
  # Load the CI band (band 5) of each requested weekly tile and merge the
  # pieces into one raster covering the field.
  #
  # Args:
  #   field_geom: field geometry (not used in this function; kept for
  #               interface symmetry with get_tile_ids_for_field).
  #   tile_ids:   numeric tile ids, typically from get_tile_ids_for_field().
  #   week_num, year: select the week_WW_YYYY_TT.tif files to read.
  #   mosaic_dir: directory holding the weekly tile files.
  # Returns: a SpatRaster (single tile, or a max-mosaic of several), or
  #   NULL when no requested tile could be loaded.
  if (length(tile_ids) == 0) {
    return(NULL)
  }
  tiles_list <- list()
  for (tile_id in sort(tile_ids)) {
    tile_filename <- sprintf("week_%02d_%d_%02d.tif", week_num, year, tile_id)
    tile_path <- file.path(mosaic_dir, tile_filename)
    if (file.exists(tile_path)) {
      tryCatch({
        tile_rast <- terra::rast(tile_path)
        # Band 5 is taken as the CI band -- assumes the tile writer keeps
        # this band ordering; TODO confirm against the Python exporter.
        ci_band <- terra::subset(tile_rast, 5)
        tiles_list[[length(tiles_list) + 1]] <- ci_band
      }, error = function(e) {
        message(paste(" Warning: Could not load tile", tile_id, ":", e$message))
      })
    }
  }
  if (length(tiles_list) == 0) {
    return(NULL)
  }
  if (length(tiles_list) == 1) {
    return(tiles_list[[1]])
  } else {
    tryCatch({
      # Overlapping tile edges resolve to the maximum CI value.
      rsrc <- terra::sprc(tiles_list)
      merged <- terra::mosaic(rsrc, fun = "max")
      return(merged)
    }, error = function(e) {
      # Best-effort fallback: return at least the first loaded tile.
      message(paste(" Warning: Could not merge tiles:", e$message))
      return(tiles_list[[1]])
    })
  }
}
build_tile_grid <- function(mosaic_dir, week_num, year) {
  # Build a lookup table (tile id + bounding box) for every weekly tile file
  # of the given week by reading each tile's extent from disk.
  #
  # Args:
  #   mosaic_dir: directory with week_WW_YYYY_TT.tif files, optionally under
  #               a grid-size subdirectory such as "5x5".
  #   week_num, year: week selector used in the file-name pattern.
  # Returns: list(tile_grid = data.frame(id, xmin, xmax, ymin, ymax),
  #               mosaic_dir = resolved directory,
  #               grid_size = detected subdirectory name or NA).
  # Raises: error when no tile files are found or no extent can be read.
  # Handle grid-size subdirectories (e.g., weekly_tile_max/5x5/)
  detected_grid_size <- NA
  if (dir.exists(mosaic_dir)) {
    subfolders <- list.dirs(mosaic_dir, full.names = FALSE, recursive = FALSE)
    grid_patterns <- grep("^\\d+x\\d+$", subfolders, value = TRUE)
    if (length(grid_patterns) > 0) {
      detected_grid_size <- grid_patterns[1]
      mosaic_dir <- file.path(mosaic_dir, detected_grid_size)
      message(paste(" Using grid-size subdirectory:", detected_grid_size))
    }
  }
  tile_pattern <- sprintf("week_%02d_%d_([0-9]{2})\\.tif", week_num, year)
  tile_files <- list.files(mosaic_dir, pattern = tile_pattern, full.names = TRUE)
  if (length(tile_files) == 0) {
    stop(paste("No tile files found for week", week_num, year, "in", mosaic_dir))
  }
  tile_grid <- data.frame(
    id = integer(),
    xmin = numeric(),
    xmax = numeric(),
    ymin = numeric(),
    ymax = numeric(),
    stringsAsFactors = FALSE
  )
  for (tile_file in tile_files) {
    tryCatch({
      matches <- regmatches(basename(tile_file), regexpr("_([0-9]{2})\\.tif$", basename(tile_file)))
      if (length(matches) > 0) {
        # BUGFIX: sub() replaces only the FIRST match of "_|\\.tif", turning
        # "_03.tif" into "03.tif", which as.integer() converts to NA -- every
        # tile id was NA. gsub() strips both the "_" prefix and the ".tif"
        # suffix, yielding the real id.
        tile_id <- as.integer(gsub("_|\\.tif", "", matches[1]))
        tile_rast <- terra::rast(tile_file)
        tile_ext <- terra::ext(tile_rast)
        tile_grid <- rbind(tile_grid, data.frame(
          id = tile_id,
          xmin = tile_ext$xmin,
          xmax = tile_ext$xmax,
          ymin = tile_ext$ymin,
          ymax = tile_ext$ymax,
          stringsAsFactors = FALSE
        ))
      }
    }, error = function(e) {
      message(paste(" Warning: Could not process tile", basename(tile_file), ":", e$message))
    })
  }
  if (nrow(tile_grid) == 0) {
    stop("Could not extract extents from any tile files")
  }
  return(list(
    tile_grid = tile_grid,
    mosaic_dir = mosaic_dir,
    grid_size = detected_grid_size
  ))
}
# ============================================================================
# STATISTICAL CATEGORIZATION FUNCTIONS
# ============================================================================
categorize_four_week_trend <- function(ci_values_list) {
  # Classify the average week-over-week CI change across up to four weekly
  # mean-CI values into a trend label: "strong growth", "growth",
  # "no growth", "decline", or "strong decline".
  # Band boundaries come from the FOUR_WEEK_TREND_* globals defined
  # elsewhere in the project.
  # Returns NA_character_ when fewer than two non-NA values are supplied.
  if (is.null(ci_values_list) || length(ci_values_list) < 2) {
    return(NA_character_)
  }
  ci_values_list <- ci_values_list[!is.na(ci_values_list)]
  if (length(ci_values_list) < 2) {
    return(NA_character_)  # need at least two observations to form a change
  }
  weekly_changes <- diff(ci_values_list)
  avg_weekly_change <- mean(weekly_changes, na.rm = TRUE)
  if (avg_weekly_change >= FOUR_WEEK_TREND_STRONG_GROWTH_MIN) {
    return("strong growth")
  } else if (avg_weekly_change >= FOUR_WEEK_TREND_GROWTH_MIN &&
             avg_weekly_change < FOUR_WEEK_TREND_GROWTH_MAX) {
    return("growth")
  } else if (abs(avg_weekly_change) <= FOUR_WEEK_TREND_NO_GROWTH_RANGE) {
    return("no growth")
  } else if (avg_weekly_change <= FOUR_WEEK_TREND_DECLINE_MIN &&
             avg_weekly_change > FOUR_WEEK_TREND_STRONG_DECLINE_MAX) {
    return("decline")
  } else if (avg_weekly_change < FOUR_WEEK_TREND_STRONG_DECLINE_MAX) {
    return("strong decline")
  } else {
    # Fallback for changes that fall between the configured bands;
    # treated as flat.
    return("no growth")
  }
}
round_cloud_to_intervals <- function(cloud_pct_clear) {
  # Map a clear-sky percentage onto its reporting bucket label.
  # Buckets are 10 points wide up to 90, then 90-95 and 95-100.
  # NA input yields NA_character_.
  if (is.na(cloud_pct_clear)) {
    return(NA_character_)
  }
  breaks <- c(10, 20, 30, 40, 50, 60, 70, 80, 90, 95)
  labels <- c("0-10%", "10-20%", "20-30%", "30-40%", "40-50%",
              "50-60%", "60-70%", "70-80%", "80-90%", "90-95%", "95-100%")
  # findInterval counts breaks <= x, so x exactly on a boundary lands in the
  # upper bucket -- same as the original strict "<" chain.
  labels[findInterval(cloud_pct_clear, breaks) + 1]
}
get_ci_percentiles <- function(ci_values) {
  # Format the low/high CI percentiles (probabilities taken from the global
  # CI_PERCENTILE_LOW / CI_PERCENTILE_HIGH constants) as a "lo-hi" string
  # with one decimal each. Returns NA_character_ when no non-NA values.
  usable <- ci_values[!is.na(ci_values)]
  if (length(usable) == 0) {
    return(NA_character_)
  }
  bounds <- quantile(usable, c(CI_PERCENTILE_LOW, CI_PERCENTILE_HIGH), na.rm = TRUE)
  sprintf("%.1f-%.1f", bounds[[1]], bounds[[2]])
}
calculate_cv_trend <- function(cv_current, cv_previous) {
  # Week-over-week change in CV, rounded to 4 decimals.
  # NA when either input is missing.
  if (anyNA(c(cv_current, cv_previous))) {
    return(NA_real_)
  }
  round(cv_current - cv_previous, 4)
}
calculate_four_week_trend <- function(mean_ci_values) {
  #' Calculate the four-week CI trend: last available mean CI minus the
  #' first, rounded to 2 decimals. Works with whatever weeks are present
  #' (1-4); returns NA unless at least two non-NA values are supplied.
  observed <- mean_ci_values[!is.na(mean_ci_values)]
  if (length(observed) < 2) {
    return(NA_real_)
  }
  round(observed[length(observed)] - observed[1], 2)
}
categorize_cv_slope <- function(slope) {
  #' Categorize CV slope (8-week regression) into field uniformity interpretation.
  #' Band boundaries come from the CV_SLOPE_* globals defined elsewhere.
  #' Returns NA_character_ for an NA slope.
  if (is.na(slope)) {
    return(NA_character_)
  }
  if (slope <= CV_SLOPE_IMPROVEMENT_MIN) {
    return("Excellent uniformity")
  }
  # CLEANUP: the original had two consecutive branches
  # (slope < CV_SLOPE_HOMOGENOUS_MIN, then slope <= CV_SLOPE_HOMOGENOUS_MAX)
  # that both returned "Homogenous growth"; they are merged with || here,
  # which is behaviorally identical for any constant values.
  if (slope < CV_SLOPE_HOMOGENOUS_MIN || slope <= CV_SLOPE_HOMOGENOUS_MAX) {
    return("Homogenous growth")
  }
  if (slope <= CV_SLOPE_PATCHINESS_MAX) {
    return("Minor patchiness")
  }
  return("Severe fragmentation")
}
calculate_cv_trend_long_term <- function(cv_values) {
  #' Calculate the long-term (8-week) CV trend as the slope of a linear
  #' regression of CV against week index, rounded to 4 decimals.
  #' Returns NA when fewer than two non-NA values or the fit fails.
  usable <- cv_values[!is.na(cv_values)]
  if (length(usable) < 2) {
    return(NA_real_)
  }
  idx <- seq_along(usable)
  tryCatch({
    fit <- lm(usable ~ idx)
    round(unname(coef(fit)[2]), 4)
  }, error = function(e) NA_real_)
}
# ============================================================================
# HELPER FUNCTIONS
# ============================================================================
get_phase_by_age <- function(age_weeks) {
  # Look up the growth phase whose [age_start, age_end] window (rows of the
  # global PHASE_DEFINITIONS table) contains age_weeks. The first matching
  # row wins; "Unknown" when no window matches, NA for an NA age.
  if (is.na(age_weeks)) return(NA_character_)
  hits <- which(PHASE_DEFINITIONS$age_start <= age_weeks &
                  PHASE_DEFINITIONS$age_end >= age_weeks)
  if (length(hits) > 0) {
    return(PHASE_DEFINITIONS$phase[hits[1]])
  }
  return("Unknown")
}
get_status_trigger <- function(ci_values, ci_change, age_weeks) {
  # Derive a single status trigger for a field from its pixel CI values,
  # the week-over-week mean-CI change, and crop age in weeks.
  # Checks are ordered by priority: germination (weeks 0-6) first, then
  # harvest readiness (>= 45 weeks), then stress/recovery/growth signals.
  # Returns NA_character_ when no condition fires or inputs are unusable.
  if (is.na(age_weeks) || length(ci_values) == 0) return(NA_character_)
  ci_values <- ci_values[!is.na(ci_values)]
  if (length(ci_values) == 0) return(NA_character_)
  # Share of pixels strictly above / at-or-above the CI=2 germination level.
  pct_above_2 <- sum(ci_values > 2) / length(ci_values) * 100
  pct_at_or_above_2 <- sum(ci_values >= 2) / length(ci_values) * 100
  # Coefficient of variation; forced to 0 when the mean is non-positive to
  # avoid a meaningless or divide-by-zero ratio.
  ci_cv <- if (mean(ci_values, na.rm = TRUE) > 0) sd(ci_values) / mean(ci_values, na.rm = TRUE) else 0
  mean_ci <- mean(ci_values, na.rm = TRUE)
  # Germination window: first six weeks after planting.
  if (age_weeks >= 0 && age_weeks <= 6) {
    if (pct_at_or_above_2 >= 70) {
      return("germination_complete")
    } else if (pct_above_2 > 10) {
      return("germination_started")
    }
  }
  # From week 45 onward the field is flagged harvest-ready regardless of CI.
  if (age_weeks >= 45) {
    return("harvest_ready")
  }
  # Sharp drop with low CV: the whole field declined together.
  if (age_weeks > 6 && !is.na(ci_change) && ci_change < -1.5 && ci_cv < 0.25) {
    return("stress_detected_whole_field")
  }
  # Sharp rise after the germination window.
  if (age_weeks > 6 && !is.na(ci_change) && ci_change > 1.5) {
    return("strong_recovery")
  }
  # Steady positive growth during the main growing period (weeks 4-38).
  if (age_weeks >= 4 && age_weeks < 39 && !is.na(ci_change) && ci_change > 0.2) {
    return("growth_on_track")
  }
  # Late-season check on absolute CI level (weeks 39-44).
  if (age_weeks >= 39 && age_weeks < 45 && mean_ci > 3.5) {
    return("maturation_progressing")
  }
  return(NA_character_)
}
extract_planting_dates <- function(harvesting_data, field_boundaries_sf = NULL) {
  # Extract planting dates from harvest.xlsx (season_start column)
  # Returns: data.frame with columns (field_id, planting_date)
  # When harvesting_data is NULL/empty: returns all-NA dates for the fields
  # in field_boundaries_sf (if provided), otherwise NULL.
  if (is.null(harvesting_data) || nrow(harvesting_data) == 0) {
    message("Warning: No harvesting data available - planting dates will be NA.")
    if (!is.null(field_boundaries_sf)) {
      return(data.frame(
        field_id = field_boundaries_sf$field,
        planting_date = rep(as.Date(NA), nrow(field_boundaries_sf)),
        stringsAsFactors = FALSE
      ))
    }
    return(NULL)
  }
  tryCatch({
    # Keep the most recent season per field: sort by season_start descending
    # within field, then take the first row for each field.
    planting_dates <- harvesting_data %>%
      arrange(field, desc(season_start)) %>%
      distinct(field, .keep_all = TRUE) %>%
      select(field, season_start) %>%
      rename(field_id = field, planting_date = season_start) %>%
      filter(!is.na(planting_date)) %>%
      as.data.frame()
    message(paste("Extracted planting dates for", nrow(planting_dates), "fields from harvest.xlsx"))
    return(planting_dates)
  }, error = function(e) {
    # Best-effort: log and return NULL so callers can proceed without dates.
    message(paste("Error extracting planting dates:", e$message))
    return(NULL)
  })
}
# ============================================================================
# MODULAR STATISTICS CALCULATION
# ============================================================================
calculate_field_statistics <- function(field_boundaries_sf, week_num, year,
                                       mosaic_dir, report_date = Sys.Date()) {
  # Compute per-field CI statistics (mean, CV, range, percentiles, cloud
  # coverage) for one week by extracting the "CI" band of every weekly tile
  # over the field polygons. The first tile that covers a field wins; later
  # tiles do not overwrite an already-recorded field.
  #
  # Args:
  #   field_boundaries_sf: sf data.frame with a `field` id column.
  #   week_num, year:      select the week_WW_YYYY_TT.tif tile files.
  #   mosaic_dir:          directory containing the weekly tiles.
  #   report_date:         unused here; kept for interface compatibility.
  # Returns: data.frame with one statistics row per field.
  # Raises: error when no tiles exist or no field could be processed.
  message(paste("Calculating statistics for all fields - Week", week_num, year))
  tile_pattern <- sprintf("week_%02d_%d_([0-9]{2})\\.tif", week_num, year)
  tile_files <- list.files(mosaic_dir, pattern = tile_pattern, full.names = TRUE)
  if (length(tile_files) == 0) {
    stop(paste("No tile files found for week", week_num, year, "in", mosaic_dir))
  }
  message(paste(" Found", length(tile_files), "tiles for week", week_num))
  results_list <- list()
  fields_processed <- 0
  for (tile_idx in seq_along(tile_files)) {
    tile_file <- tile_files[tile_idx]
    # BUGFIX: the CI-band check previously lived inside tryCatch() and used
    # return(NULL). In R, return() inside a tryCatch expression exits the
    # ENCLOSING FUNCTION, so one tile without a CI band aborted the whole
    # run and returned NULL. Loading stays guarded, but the skip decision is
    # now made outside the tryCatch so `next` skips only this tile.
    ci_band <- tryCatch({
      terra::rast(tile_file)[["CI"]]
    }, error = function(e) NULL)
    if (is.null(ci_band) || !inherits(ci_band, "SpatRaster")) {
      message(paste(" [SKIP] Tile", basename(tile_file), "- CI band not found"))
      next
    }
    tryCatch({
      # na.rm = FALSE keeps masked pixels so cloud coverage can be counted.
      extracted <- terra::extract(ci_band, field_boundaries_sf, na.rm = FALSE)
      unique_field_ids <- unique(extracted$ID[!is.na(extracted$ID)])
      for (field_poly_idx in unique_field_ids) {
        field_id <- field_boundaries_sf$field[field_poly_idx]
        ci_vals <- extracted$CI[extracted$ID == field_poly_idx]
        ci_vals <- ci_vals[!is.na(ci_vals)]
        if (length(ci_vals) == 0) {
          next  # polygon fully masked / outside this tile
        }
        mean_ci <- mean(ci_vals, na.rm = TRUE)
        ci_std <- sd(ci_vals, na.rm = TRUE)
        cv <- if (mean_ci > 0) ci_std / mean_ci else NA_real_
        range_min <- min(ci_vals, na.rm = TRUE)
        range_max <- max(ci_vals, na.rm = TRUE)
        range_str <- sprintf("%.1f-%.1f", range_min, range_max)
        ci_percentiles_str <- get_ci_percentiles(ci_vals)
        # Count pixels with CI >= 2 (germination threshold)
        GERMINATION_CI_THRESHOLD <- 2.0
        num_pixels_gte_2 <- sum(ci_vals >= GERMINATION_CI_THRESHOLD, na.rm = TRUE)
        num_pixels_total <- length(ci_vals)
        pct_pixels_gte_2 <- if (num_pixels_total > 0) round((num_pixels_gte_2 / num_pixels_total) * 100, 1) else 0
        # Cloud coverage: share of extracted pixels carrying data.
        field_rows <- extracted[extracted$ID == field_poly_idx, ]
        num_total <- nrow(field_rows)
        num_data <- sum(!is.na(field_rows$CI))
        pct_clear <- if (num_total > 0) round((num_data / num_total) * 100, 1) else 0
        cloud_cat <- if (num_data == 0) "No image available"
        else if (pct_clear >= 95) "Clear view"
        else "Partial coverage"
        # Age_week and Phase are now calculated in main script using actual planting dates
        # Germination_progress is calculated in main script after Age_week is known
        # First tile covering a field wins; skip fields already recorded.
        existing_idx <- which(sapply(results_list, function(x) x$Field_id) == field_id)
        if (length(existing_idx) > 0) {
          next
        }
        results_list[[length(results_list) + 1]] <- data.frame(
          Field_id = field_id,
          Mean_CI = round(mean_ci, 2),
          CV = round(cv * 100, 2),
          CI_range = range_str,
          CI_Percentiles = ci_percentiles_str,
          Pct_pixels_CI_gte_2 = pct_pixels_gte_2,
          Cloud_pct_clear = pct_clear,
          Cloud_category = cloud_cat,
          stringsAsFactors = FALSE
        )
        fields_processed <- fields_processed + 1
      }
      message(paste(" Tile", tile_idx, "of", length(tile_files), "processed"))
    }, error = function(e) {
      message(paste(" [ERROR] Tile", basename(tile_file), ":", e$message))
    })
  }
  if (length(results_list) == 0) {
    stop(paste("No fields processed successfully for week", week_num))
  }
  stats_df <- dplyr::bind_rows(results_list)
  message(paste(" ✓ Successfully calculated statistics for", nrow(stats_df), "fields"))
  return(stats_df)
}
# ============================================================================
# CALCULATE KPI TRENDS
# ============================================================================
calculate_kpi_trends <- function(current_stats, prev_stats = NULL,
                                 project_dir = NULL, reports_dir = NULL,
                                 current_week = NULL, year = NULL) {
  # Enrich the current week's per-field statistics with trend columns:
  #   Weekly_ci_change      - current Mean_CI minus previous week's.
  #   CV_Trend_Short_Term   - current CV minus previous week's (4 dp).
  #   Four_week_trend       - CI change over up to 4 cached weeks.
  #   CV_Trend_Long_Term    - regression slope of CV over up to 8 weeks.
  #   nmr_of_weeks_analysed - running count carried over from the most
  #                           recent field_analysis CSV, +1 per week.
  # Historical weeks are read from cached RDS files under
  # <reports_dir>/kpis/field_stats. All trend columns default to NA (and
  # the week counter to 1) when no history is available.
  message("Calculating KPI trends from current and previous week data")
  current_stats$Weekly_ci_change <- NA_real_
  current_stats$CV_Trend_Short_Term <- NA_real_
  current_stats$Four_week_trend <- NA_real_
  current_stats$CV_Trend_Long_Term <- NA_real_
  current_stats$nmr_of_weeks_analysed <- 1L
  if (is.null(prev_stats) || nrow(prev_stats) == 0) {
    message(" No previous week data available - using defaults")
    return(current_stats)
  }
  message(paste(" prev_stats has", nrow(prev_stats), "rows and", ncol(prev_stats), "columns"))
  # Map Field_id -> row index in prev_stats for O(1) lookups in the loop.
  prev_lookup <- setNames(seq_len(nrow(prev_stats)), prev_stats$Field_id)
  # Most recent field_analysis CSV supplies the previous nmr_of_weeks_analysed.
  prev_field_analysis <- NULL
  tryCatch({
    analysis_dir <- file.path(reports_dir, "kpis", "field_analysis")
    if (dir.exists(analysis_dir)) {
      analysis_files <- list.files(analysis_dir, pattern = "_field_analysis_week.*\\.csv$", full.names = TRUE)
      if (length(analysis_files) > 0) {
        # "Most recent" by file modification time, not by week number.
        recent_file <- analysis_files[which.max(file.info(analysis_files)$mtime)]
        prev_field_analysis <- readr::read_csv(recent_file, show_col_types = FALSE,
                                               col_select = c(Field_id, nmr_of_weeks_analysed, Phase))
      }
    }
  }, error = function(e) {
    message(paste(" Note: Could not load previous field_analysis for nmr_weeks tracking:", e$message))
  })
  if (!is.null(prev_field_analysis) && nrow(prev_field_analysis) > 0) {
    message(paste(" Using previous field_analysis to track nmr_of_weeks_analysed"))
  }
  # Load cached weekly stats: up to 4 weeks back for the CI trend and up to
  # 8 weeks back for the CV slope. Week numbers wrap across the year
  # boundary (week < 1 maps to week + 52 of the previous year).
  historical_4weeks <- list()
  historical_8weeks <- list()
  if (!is.null(project_dir) && !is.null(reports_dir) && !is.null(current_week)) {
    message(" Loading historical field_stats for 4-week and 8-week trends...")
    for (lookback in 1:4) {
      target_week <- current_week - lookback
      target_year <- year
      if (target_week < 1) {
        target_week <- target_week + 52
        target_year <- target_year - 1
      }
      rds_filename <- sprintf("%s_field_stats_week%02d_%d.rds", project_dir, target_week, target_year)
      rds_path <- file.path(reports_dir, "kpis", "field_stats", rds_filename)
      if (file.exists(rds_path)) {
        tryCatch({
          stats_data <- readRDS(rds_path)
          # Appended most-recent-first; consumers iterate in reverse to get
          # chronological order.
          historical_4weeks[[length(historical_4weeks) + 1]] <- list(
            week = target_week,
            stats = stats_data
          )
        }, error = function(e) {
          message(paste(" Warning: Could not load week", target_week, ":", e$message))
        })
      }
    }
    # NOTE(review): this loop re-reads weeks 1-4 already loaded above;
    # redundant I/O, but harmless.
    for (lookback in 1:8) {
      target_week <- current_week - lookback
      target_year <- year
      if (target_week < 1) {
        target_week <- target_week + 52
        target_year <- target_year - 1
      }
      rds_filename <- sprintf("%s_field_stats_week%02d_%d.rds", project_dir, target_week, target_year)
      rds_path <- file.path(reports_dir, "kpis", "field_stats", rds_filename)
      if (file.exists(rds_path)) {
        tryCatch({
          stats_data <- readRDS(rds_path)
          historical_8weeks[[length(historical_8weeks) + 1]] <- list(
            week = target_week,
            stats = stats_data
          )
        }, error = function(e) {
          # Silently skip
        })
      }
    }
    if (length(historical_4weeks) > 0) {
      message(paste(" Loaded", length(historical_4weeks), "weeks for 4-week trend"))
    }
    if (length(historical_8weeks) > 0) {
      message(paste(" Loaded", length(historical_8weeks), "weeks for 8-week CV trend"))
    }
  }
  cv_trends_calculated <- 0
  four_week_trends_calculated <- 0
  cv_long_term_calculated <- 0
  for (i in seq_len(nrow(current_stats))) {
    field_id <- current_stats$Field_id[i]
    prev_idx <- prev_lookup[field_id]
    # NOTE(review): all trends (including the multi-week ones) are only
    # computed for fields present in LAST week's stats — confirm intended.
    if (!is.na(prev_idx) && prev_idx > 0 && prev_idx <= nrow(prev_stats)) {
      prev_row <- prev_stats[prev_idx, , drop = FALSE]
      prev_ci <- prev_row$Mean_CI[1]
      if (!is.na(prev_ci) && !is.na(current_stats$Mean_CI[i])) {
        current_stats$Weekly_ci_change[i] <-
          round(current_stats$Mean_CI[i] - prev_ci, 2)
      }
      prev_cv <- prev_row$CV[1]
      if (!is.na(prev_cv) && !is.na(current_stats$CV[i])) {
        current_stats$CV_Trend_Short_Term[i] <-
          calculate_cv_trend(current_stats$CV[i], prev_cv)
        cv_trends_calculated <- cv_trends_calculated + 1
      }
      if (length(historical_4weeks) > 0) {
        # Collect Mean_CI oldest-first, then append the current week.
        ci_values_4week <- numeric()
        for (hist_idx in rev(seq_along(historical_4weeks))) {
          hist_data <- historical_4weeks[[hist_idx]]$stats
          hist_field <- which(hist_data$Field_id == field_id)
          if (length(hist_field) > 0 && !is.na(hist_data$Mean_CI[hist_field[1]])) {
            ci_values_4week <- c(ci_values_4week, hist_data$Mean_CI[hist_field[1]])
          }
        }
        ci_values_4week <- c(ci_values_4week, current_stats$Mean_CI[i])
        if (length(ci_values_4week) >= 2) {
          current_stats$Four_week_trend[i] <- calculate_four_week_trend(ci_values_4week)
          four_week_trends_calculated <- four_week_trends_calculated + 1
        }
      }
      if (length(historical_8weeks) > 0) {
        # Collect CV oldest-first, then append the current week.
        cv_values_8week <- numeric()
        for (hist_idx in rev(seq_along(historical_8weeks))) {
          hist_data <- historical_8weeks[[hist_idx]]$stats
          hist_field <- which(hist_data$Field_id == field_id)
          if (length(hist_field) > 0 && !is.na(hist_data$CV[hist_field[1]])) {
            cv_values_8week <- c(cv_values_8week, hist_data$CV[hist_field[1]])
          }
        }
        cv_values_8week <- c(cv_values_8week, current_stats$CV[i])
        if (length(cv_values_8week) >= 2) {
          slope <- calculate_cv_trend_long_term(cv_values_8week)
          current_stats$CV_Trend_Long_Term[i] <- slope
          cv_long_term_calculated <- cv_long_term_calculated + 1
        }
      }
      if (!is.null(prev_field_analysis) && nrow(prev_field_analysis) > 0) {
        prev_analysis_row <- prev_field_analysis %>%
          dplyr::filter(Field_id == field_id)
        if (nrow(prev_analysis_row) > 0) {
          prev_nmr_weeks_analysis <- prev_analysis_row$nmr_of_weeks_analysed[1]
          # Only increment nmr_of_weeks_analysed if we have previous data
          if (!is.na(prev_nmr_weeks_analysis)) {
            current_stats$nmr_of_weeks_analysed[i] <- prev_nmr_weeks_analysis + 1L
          } else {
            current_stats$nmr_of_weeks_analysed[i] <- 1L
          }
        }
      }
    }
  }
  message(paste(" ✓ Calculated CV_Trend_Short_Term:", cv_trends_calculated, "fields"))
  message(paste(" ✓ Calculated Four_week_trend:", four_week_trends_calculated, "fields"))
  message(paste(" ✓ Calculated CV_Trend_Long_Term:", cv_long_term_calculated, "fields"))
  return(current_stats)
}
# ============================================================================
# LOAD OR CALCULATE WEEKLY STATISTICS
# ============================================================================
load_or_calculate_weekly_stats <- function(week_num, year, project_dir, field_boundaries_sf,
                                           mosaic_dir, reports_dir, report_date = Sys.Date()) {
  # Return cached weekly field statistics when an RDS cache exists;
  # otherwise compute them from the weekly tiles and persist both the RDS
  # (cache) and a CSV copy under <reports_dir>/kpis/field_stats.
  cache_stem <- sprintf("%s_field_stats_week%02d_%d", project_dir, week_num, year)
  stats_dir <- file.path(reports_dir, "kpis", "field_stats")
  rds_path <- file.path(stats_dir, paste0(cache_stem, ".rds"))
  if (file.exists(rds_path)) {
    message(paste("Loading cached statistics from:", basename(rds_path)))
    return(readRDS(rds_path))
  }
  message(paste("Cached RDS not found, calculating statistics from tiles for week", week_num))
  stats_df <- calculate_field_statistics(field_boundaries_sf, week_num, year,
                                         mosaic_dir, report_date)
  if (!dir.exists(stats_dir)) {
    dir.create(stats_dir, recursive = TRUE, showWarnings = FALSE)
  }
  saveRDS(stats_df, rds_path)
  message(paste("Saved weekly statistics RDS:", basename(rds_path)))
  csv_path <- file.path(stats_dir, paste0(cache_stem, ".csv"))
  readr::write_csv(stats_df, csv_path)
  message(paste("Saved weekly statistics CSV:", basename(csv_path)))
  return(stats_df)
}
# Load recent weekly field-analysis data, auto-generating missing weeks.
#
# First tries to read up to `num_weeks` cached field-analysis CSVs (current
# week and the preceding weeks, wrapping week numbers < 1 by +52). If any are
# missing and `auto_generate` is TRUE, it scans the weekly mosaic directory
# (global `weekly_tile_max`, checked via exists()) for tiles covering the last
# 8 calendar weeks and computes statistics for those weeks via
# load_or_calculate_weekly_stats().
#
# Args:
#   project_dir         - project name used as the CSV filename prefix
#   current_week        - week number anchoring the lookback window
#   reports_dir         - reports root (contains kpis/field_analysis)
#   num_weeks           - number of weeks to attempt from cached CSVs
#   auto_generate       - whether to compute stats from mosaics on cache miss
#   field_boundaries_sf - sf polygons; required for auto-generation
#
# Returns: list of entries, each list(week = ..., data = ...) for CSV-loaded
#          weeks or list(week = ..., year = ..., data = ...) for generated
#          weeks (note the shapes differ); NULL when nothing could be loaded.
#
# NOTE(review): `read_csv` is presumably readr::read_csv attached by the
# calling script (the file elsewhere uses readr:: explicitly) — confirm.
load_historical_field_data <- function(project_dir, current_week, reports_dir, num_weeks = 4, auto_generate = TRUE, field_boundaries_sf = NULL) {
  historical_data <- list()
  loaded_weeks <- c()
  missing_weeks <- c()
  # Phase 1: try cached weekly CSVs, newest first.
  for (lookback in 0:(num_weeks - 1)) {
    target_week <- current_week - lookback
    # Wrap into the previous year's numbering (assumes 52-week years).
    if (target_week < 1) target_week <- target_week + 52
    csv_filename <- paste0(project_dir, "_field_analysis_week", sprintf("%02d", target_week), ".csv")
    csv_path <- file.path(reports_dir, "kpis", "field_analysis", csv_filename)
    if (file.exists(csv_path)) {
      tryCatch({
        data <- read_csv(csv_path, show_col_types = FALSE)
        historical_data[[lookback + 1]] <- list(
          week = target_week,
          data = data
        )
        loaded_weeks <- c(loaded_weeks, target_week)
      }, error = function(e) {
        message(paste(" Warning: Could not load week", target_week, ":", e$message))
        # `<<-` reaches the function-level vector through the handler's
        # enclosing environment (a plain `<-` would only bind locally here).
        missing_weeks <<- c(missing_weeks, target_week)
      })
    } else {
      missing_weeks <- c(missing_weeks, target_week)
    }
  }
  # Phase 2: regenerate missing weeks from mosaic tiles, if allowed.
  if (length(missing_weeks) > 0 && auto_generate) {
    message(paste("⚠ Missing weeks:", paste(missing_weeks, collapse = ", ")))
    message("Scanning for ALL available weekly mosaics and calculating stats...\n")
    if (is.null(field_boundaries_sf)) {
      message(" Error: field_boundaries_sf not provided - cannot auto-generate")
      return(historical_data)
    }
    # `weekly_tile_max` is a global path expected from the calling script.
    if (!exists("weekly_tile_max")) {
      message(" ✗ weekly_tile_max path not defined")
      return(historical_data)
    }
    # Prefer the 5x5 grid subfolder; fall back to the root tile directory.
    check_paths <- c(file.path(weekly_tile_max, "5x5"), weekly_tile_max)
    mosaic_scan_dir <- NA
    for (check_path in check_paths) {
      if (dir.exists(check_path)) {
        tif_files <- list.files(check_path, pattern = "week_.*\\.tif$", full.names = TRUE)
        if (length(tif_files) > 0) {
          mosaic_scan_dir <- check_path
          break
        }
      }
    }
    if (is.na(mosaic_scan_dir)) {
      message(" ✗ No mosaic files found in weekly_tile_max")
      return(historical_data)
    }
    # Build the set of expected (week, year) pairs for the last 8 calendar
    # weeks counting back from today.
    # NOTE(review): hard-coded 8 here is independent of `num_weeks` — confirm
    # this divergence is intentional.
    weeks_to_load <- 8
    today <- Sys.Date()
    target_dates <- today - (0:(weeks_to_load - 1)) * 7
    expected_weeks <- data.frame(
      date = target_dates,
      week = as.numeric(format(target_dates, "%V")),  # ISO-8601 week number
      year = as.numeric(format(target_dates, "%Y")),
      stringsAsFactors = FALSE
    )
    expected_weeks <- unique(expected_weeks)
    message(paste(" Expected weeks (last 8 from", format(today, "%Y-%m-%d"), "):"))
    for (i in seq_len(nrow(expected_weeks))) {
      message(paste(" Week", sprintf("%02d", expected_weeks$week[i]), expected_weeks$year[i]))
    }
    message("")
    # Scan tile filenames of the form week_WW_YYYY_TT.tif.
    tif_files <- list.files(mosaic_scan_dir, pattern = "week_([0-9]{2})_([0-9]{4})_[0-9]{2}\\.tif$",
                            full.names = FALSE)
    available_weeks <- data.frame()
    for (filename in tif_files) {
      matches <- regmatches(filename, gregexpr("week_([0-9]{2})_([0-9]{4})", filename))[[1]]
      if (length(matches) > 0) {
        week_year <- strsplit(matches[1], "_")[[1]]  # c("week", WW, YYYY)
        if (length(week_year) == 3) {
          week_num <- as.numeric(week_year[2])
          year_num <- as.numeric(week_year[3])
          # Coarse filter only (week and year tested independently); the
          # merge() below enforces exact (week, year) pairing.
          if (week_num %in% expected_weeks$week && year_num %in% expected_weeks$year) {
            available_weeks <- rbind(available_weeks,
                                     data.frame(week = week_num, year = year_num))
          }
        }
      }
    }
    available_weeks <- unique(available_weeks)
    # Inner join against expected pairs; also attaches the report date.
    available_weeks <- merge(available_weeks, expected_weeks[, c("week", "year", "date")], by = c("week", "year"))
    available_weeks <- available_weeks[order(available_weeks$date, decreasing = TRUE), ]
    if (nrow(available_weeks) == 0) {
      message(" ✗ No matching mosaic files found")
      message(paste(" Scanned directory:", mosaic_scan_dir))
      return(historical_data)
    }
    message(paste(" Found", nrow(available_weeks), "week(s) with available mosaics:"))
    for (i in seq_len(nrow(available_weeks))) {
      week_to_calc <- available_weeks$week[i]
      year_to_calc <- available_weeks$year[i]
      date_to_calc <- available_weeks$date[i]
      # Confirm tiles actually exist for this specific (week, year).
      tile_pattern <- sprintf("week_%02d_%d_([0-9]{2})\\.tif", week_to_calc, year_to_calc)
      tile_files <- list.files(mosaic_scan_dir, pattern = tile_pattern, full.names = TRUE)
      if (length(tile_files) == 0) {
        message(paste(" ✗ Week", sprintf("%02d", week_to_calc), year_to_calc, "- no tiles found"))
        next
      }
      message(paste(" ✓ Week", sprintf("%02d", week_to_calc), year_to_calc, "-", length(tile_files), "mosaics"))
      tryCatch({
        week_stats <- load_or_calculate_weekly_stats(
          week_num = week_to_calc,
          year = year_to_calc,
          project_dir = project_dir,
          field_boundaries_sf = field_boundaries_sf,
          mosaic_dir = mosaic_scan_dir,
          reports_dir = reports_dir,
          report_date = date_to_calc
        )
        if (!is.null(week_stats) && nrow(week_stats) > 0) {
          message(paste(" ✓ Calculated stats for", nrow(week_stats), "fields"))
          # Generated entries carry $year and are appended after CSV entries;
          # loaded_weeks mixes numeric weeks (phase 1) and "WW_YYYY" strings.
          historical_data[[length(historical_data) + 1]] <- list(
            week = week_to_calc,
            year = year_to_calc,
            data = week_stats
          )
          loaded_weeks <- c(loaded_weeks, paste0(week_to_calc, "_", year_to_calc))
        }
      }, error = function(e) {
        message(paste(" ✗ Error:", e$message))
      })
    }
  }
  if (length(historical_data) == 0) {
    message(paste("Error: No historical field data found and could not auto-generate weeks"))
    return(NULL)
  }
  message(paste("✓ Loaded", length(historical_data), "weeks of historical data:",
                paste(loaded_weeks, collapse = ", ")))
  return(historical_data)
}
# ============================================================================
# HELPER: Extract field-level statistics from CI raster
# ============================================================================
extract_field_statistics_from_ci <- function(ci_band, field_boundaries_sf) {
  #' Extract CI statistics for all fields from a single CI raster band
  #'
  #' For each polygon in `field_boundaries_sf`, summarises the pixels of
  #' `ci_band` falling inside it: mean, CV (sd/mean, NA when mean <= 0),
  #' p10/p90 (via globals CI_PERCENTILE_LOW / CI_PERCENTILE_HIGH), min, max,
  #' and valid/total pixel counts. Returns one row per field, bound with
  #' dplyr::bind_rows().
  extract_result <- terra::extract(ci_band, field_boundaries_sf)
  stats_list <- list()
  for (field_idx in seq_len(nrow(field_boundaries_sf))) {
    in_field <- extract_result$ID == field_idx
    field_pixels <- extract_result[in_field, 2]
    pixels <- as.numeric(field_pixels[!is.na(field_pixels)])
    # Total extracted pixels for this field, valid or NA.
    # FIX: previously the no-valid-pixel branch hard-coded pixel_count_total
    # to 0, misreporting fields that are covered by the raster but fully
    # NA/masked as having no coverage at all.
    total_px <- sum(in_field)
    if (length(pixels) == 0) {
      # Field outside the raster (total_px == 0) or all pixels NA.
      stats_list[[field_idx]] <- data.frame(
        field_idx = field_idx,
        mean_ci = NA_real_,
        cv = NA_real_,
        p10 = NA_real_,
        p90 = NA_real_,
        min_ci = NA_real_,
        max_ci = NA_real_,
        pixel_count_valid = 0,
        pixel_count_total = total_px,
        stringsAsFactors = FALSE
      )
      next
    }
    mean_val <- mean(pixels, na.rm = TRUE)
    # CV undefined for non-positive mean; report NA rather than a negative CV.
    cv_val <- if (mean_val > 0) sd(pixels, na.rm = TRUE) / mean_val else NA_real_
    p10_val <- quantile(pixels, probs = CI_PERCENTILE_LOW, na.rm = TRUE)[[1]]
    p90_val <- quantile(pixels, probs = CI_PERCENTILE_HIGH, na.rm = TRUE)[[1]]
    min_val <- min(pixels, na.rm = TRUE)
    max_val <- max(pixels, na.rm = TRUE)
    stats_list[[field_idx]] <- data.frame(
      field_idx = field_idx,
      mean_ci = mean_val,
      cv = cv_val,
      p10 = p10_val,
      p90 = p90_val,
      min_ci = min_val,
      max_ci = max_val,
      pixel_count_valid = length(pixels),
      pixel_count_total = total_px,
      stringsAsFactors = FALSE
    )
  }
  return(dplyr::bind_rows(stats_list))
}
# ============================================================================
# COMMENTED OUT / UNUSED FUNCTIONS (kept for future use)
# ============================================================================
# analyze_single_field <- function(field_idx, field_boundaries_sf, tile_grid, week_num, year,
# mosaic_dir, historical_data = NULL, planting_dates = NULL,
# report_date = Sys.Date(), harvest_imminence_data = NULL,
# harvesting_data = NULL) {
# # [Function kept as reference for parallel field analysis]
# # Currently replaced by calculate_field_statistics() for efficiency
# }

View file

@ -50,7 +50,7 @@ detect_mosaic_mode <- function(merged_final_tif_dir, daily_tiles_split_dir = NUL
}
# PRIORITY 2: File-based detection (fallback if metadata not found)
# Check if merged_final_tif/ contains tile-named files
# Check if merged_final_tif/ contains tile-named files OR grid-size subdirectories
if (!dir.exists(merged_final_tif_dir)) {
return(list(
@ -61,6 +61,30 @@ detect_mosaic_mode <- function(merged_final_tif_dir, daily_tiles_split_dir = NUL
))
}
# First check if there are grid-size subdirectories (5x5, 10x10, etc.)
# This indicates the tiles are organized: merged_final_tif/{grid_size}/{DATE}/{DATE}_XX.tif
grid_subfolders <- list.dirs(merged_final_tif_dir, full.names = FALSE, recursive = FALSE)
grid_patterns <- grep("^\\d+x\\d+$", grid_subfolders, value = TRUE)
if (length(grid_patterns) > 0) {
# Found grid-size subdirectories - tiles exist!
grid_size <- grid_patterns[1]
grid_dir <- file.path(merged_final_tif_dir, grid_size)
# List sample tile files from the grid directory
sample_tiles <- list.files(grid_dir, pattern = "\\.tif$", recursive = TRUE)[1:3]
return(list(
has_tiles = TRUE,
detected_tiles = sample_tiles,
total_files = length(sample_tiles),
source = "grid_subdirectory_detection",
grid_size = grid_size,
grid_path = grid_dir
))
}
# Fall back to checking for tile-named files directly in merged_final_tif
# List all .tif files in merged_final_tif
tif_files <- list.files(merged_final_tif_dir, pattern = "\\.tif$", full.names = FALSE)

View file

@ -1,31 +1,40 @@
# ==============================================================================
# FULL PIPELINE RUNNER
# ==============================================================================
# Runs scripts 02, 03, 04, 09 (KPIs), 09 (Weekly), and 10 (CI Report Simple)
# Mixed Python/R pipeline:
# 1. Python: Download Planet images
# 2. R 10: Create master grid and split TIFFs
# 3. R 20: CI Extraction
# 4. R 21: Convert CI RDS to CSV
# 5. R 30: Interpolate growth model
# 6. Python 31: Harvest imminent weekly
# 7. R 40: Mosaic creation
# 8. R 80: Calculate KPIs
#
# ==============================================================================
# HOW TO RUN THIS SCRIPT
# ==============================================================================
#
# In PowerShell or Command Prompt:
# Run from the smartcane/ directory:
#
# Option 1 (Recommended - shows real-time output):
# Rscript run_full_pipeline.R
# Rscript r_app/run_full_pipeline.R
#
# Option 2 (Full path to Rscript - use & in PowerShell for paths with spaces):
# & "C:\Program Files\R\R-4.4.3\bin\x64\Rscript.exe" run_full_pipeline.R
# & "C:\Program Files\R\R-4.4.3\bin\x64\Rscript.exe" r_app/run_full_pipeline.R
#
# Option 3 (Batch mode - output saved to .Rout file):
# R CMD BATCH --vanilla run_full_pipeline.R
# R CMD BATCH --vanilla r_app/run_full_pipeline.R
#
# ==============================================================================
# ==============================================================================
# *** EDIT THESE VARIABLES ***
end_date <- "2025-12-24" # or specify: "2025-12-02", Sys.Date()
end_date <- as.Date("2026-01-27") # or specify: as.Date("2026-01-27") , Sys.Date()
offset <- 7 # days to look back
project_dir <- "angata" # project name: "esa", "aura", "angata", "chemba"
data_source <- if (project_dir == "angata") "merged_tif_8b" else "merged_tif"
force_rerun <- FALSE # Set to TRUE to force all scripts to run even if outputs exist
# ***************************
# Format dates
@ -35,117 +44,350 @@ end_date_str <- format(as.Date(end_date), "%Y-%m-%d")
pipeline_success <- TRUE
# ==============================================================================
# SCRIPT 02: CI EXTRACTION
# INTELLIGENT CHECKING: What has already been completed?
# ==============================================================================
cat("\n========== RUNNING SCRIPT 02: CI EXTRACTION ==========\n")
tryCatch({
source("r_app/02_ci_extraction.R")
main() # Call the main function
cat("✓ Script 02 completed\n")
}, error = function(e) {
cat("✗ Error in Script 02:", e$message, "\n")
pipeline_success <<- FALSE
})
cat("\n========== CHECKING EXISTING OUTPUTS ==========\n")
# ==============================================================================
# SCRIPT 03: INTERPOLATE GROWTH MODEL
# ==============================================================================
cat("\n========== RUNNING SCRIPT 03: INTERPOLATE GROWTH MODEL ==========\n")
tryCatch({
source("r_app/03_interpolate_growth_model.R")
main() # Call the main function
cat("✓ Script 03 completed\n")
}, error = function(e) {
cat("✗ Error in Script 03:", e$message, "\n")
pipeline_success <<- FALSE
})
# ==============================================================================
# SCRIPT 04: MOSAIC CREATION
# ==============================================================================
cat("\n========== RUNNING SCRIPT 04: MOSAIC CREATION ==========\n")
tryCatch({
source("r_app/04_mosaic_creation.R")
main() # Call the main function
cat("✓ Script 04 completed\n")
}, error = function(e) {
cat("✗ Error in Script 04:", e$message, "\n")
pipeline_success <<- FALSE
})
# ==============================================================================
# SCRIPT 09: CALCULATE KPIs
# ==============================================================================
cat("\n========== RUNNING SCRIPT 09: CALCULATE KPIs ==========\n")
tryCatch({
source("r_app/09_calculate_kpis.R")
main() # Call the main function
cat("✓ Script 09 (KPIs) completed\n")
}, error = function(e) {
cat("✗ Error in Script 09 (KPIs):", e$message, "\n")
pipeline_success <<- FALSE
})
# ==============================================================================
# SCRIPT 09: FIELD ANALYSIS WEEKLY
# ==============================================================================
# Only run field analysis weekly for angata project
if (project_dir == "angata") {
cat("\n========== RUNNING SCRIPT 09: FIELD ANALYSIS WEEKLY ==========\n")
tryCatch({
source("r_app/09_field_analysis_weekly.R")
main() # Call the main function
cat("✓ Script 09 (Weekly) completed\n")
}, error = function(e) {
cat("✗ Error in Script 09 (Weekly):", e$message, "\n")
pipeline_success <<- FALSE
})
# Check Script 10 outputs (tiled splits)
tiles_dir <- file.path("laravel_app", "storage", "app", project_dir, "daily_tiles_split", "5x5")
tiles_dates <- if (dir.exists(tiles_dir)) {
list.dirs(tiles_dir, full.names = FALSE, recursive = FALSE)
} else {
cat("\n========== SKIPPING SCRIPT 09: FIELD ANALYSIS WEEKLY (only runs for angata) ==========\n")
c()
}
cat(sprintf("Script 10: %d dates already tiled\n", length(tiles_dates)))
# Check Script 20 outputs (CI extraction) - daily RDS files
ci_daily_dir <- file.path("laravel_app", "storage", "app", project_dir, "Data", "extracted_ci", "daily_vals")
ci_files <- if (dir.exists(ci_daily_dir)) {
list.files(ci_daily_dir, pattern = "\\.rds$")
} else {
c()
}
cat(sprintf("Script 20: %d CI daily RDS files exist\n", length(ci_files)))
# Check Script 21 outputs (CSV conversion) - note: this gets overwritten each time, so we don't skip based on this
# Instead, check if CI RDS files exist - if they do, 21 should also run
# For now, just note that CSV is time-dependent, not a good skip indicator
cat("Script 21: CSV file exists but gets overwritten - will run if Script 20 runs\n")
# Check Script 40 outputs (mosaics in weekly_tile_max/5x5)
mosaic_dir <- file.path("laravel_app", "storage", "app", project_dir, "weekly_tile_max", "5x5")
mosaic_files <- if (dir.exists(mosaic_dir)) {
list.files(mosaic_dir, pattern = "\\.tif$")
} else {
c()
}
cat(sprintf("Script 40: %d mosaic files exist\n", length(mosaic_files)))
# Check Script 80 outputs (KPIs in reports/kpis/field_stats)
kpi_dir <- file.path("laravel_app", "storage", "app", project_dir, "reports", "kpis", "field_stats")
kpi_files <- if (dir.exists(kpi_dir)) {
list.files(kpi_dir, pattern = "\\.csv$|\\.json$")
} else {
c()
}
cat(sprintf("Script 80: %d KPI files exist\n", length(kpi_files)))
# Determine if scripts should run based on outputs
skip_10 <- length(tiles_dates) > 0 && !force_rerun
skip_20 <- length(ci_files) > 0 && !force_rerun
skip_21 <- length(ci_files) > 0 && !force_rerun # Skip 21 if 20 is skipped
skip_40 <- length(mosaic_files) > 0 && !force_rerun
skip_80 <- FALSE # Always run Script 80 - it calculates KPIs for the current week (end_date), not historical weeks
cat("\nSkipping decisions:\n")
cat(sprintf(" Script 10: %s\n", if(skip_10) "SKIP (tiles exist)" else "RUN"))
cat(sprintf(" Script 20: %s\n", if(skip_20) "SKIP (CI exists)" else "RUN"))
cat(sprintf(" Script 21: %s\n", if(skip_21) "SKIP (CI exists)" else "RUN"))
cat(sprintf(" Script 40: %s\n", if(skip_40) "SKIP (mosaics exist)" else "RUN"))
cat(sprintf(" Script 80: %s\n", if(skip_80) "SKIP (KPIs exist)" else "RUN"))
# ==============================================================================
# SCRIPT 91: CI REPORT ANGATA (only for angata)
# PYTHON: DOWNLOAD PLANET IMAGES (MISSING DATES ONLY)
# ==============================================================================
if (project_dir == "angata") {
cat("\n========== RUNNING SCRIPT 91: CI REPORT ANGATA ==========\n")
if (pipeline_success) {
tryCatch({
rmarkdown::render("r_app/91_CI_report_with_kpis_Angata.Rmd",
output_format = "word_document",
params = list(data_dir = project_dir, report_date = end_date_str))
cat("✓ Script 91 (Report) completed\n")
}, error = function(e) {
cat("✗ Error in Script 91 (Report):", e$message, "\n")
})
} else {
cat("✗ Skipping Script 91: Previous pipeline scripts failed\n")
cat("\n========== DOWNLOADING PLANET IMAGES (MISSING DATES ONLY) ==========\n")
tryCatch({
# Setup paths
base_path <- file.path("laravel_app", "storage", "app", project_dir)
merged_tifs_dir <- file.path(base_path, data_source)
# Get existing dates from raw TIFFs
existing_tiff_files <- list.files(merged_tifs_dir, pattern = "^\\d{4}-\\d{2}-\\d{2}\\.tif$")
existing_tiff_dates <- sub("\\.tif$", "", existing_tiff_files)
# Get existing dates from tiles (better indicator of completion)
existing_tile_dates <- tiles_dates
# Find missing dates in the window
start_date <- end_date - offset
date_seq <- seq(start_date, end_date, by = "day")
target_dates <- format(date_seq, "%Y-%m-%d")
# Only download if tiles don't exist yet (more reliable than checking raw TIFFs)
missing_dates <- target_dates[!(target_dates %in% existing_tile_dates)]
cat(sprintf(" Existing tiled dates: %d\n", length(existing_tile_dates)))
cat(sprintf(" Missing dates in window: %d\n", length(missing_dates)))
# Download each missing date
download_count <- 0
download_failed <- 0
if (length(missing_dates) > 0) {
# Save current directory
original_dir <- getwd()
# Change to python_app directory so relative paths work correctly
setwd("python_app")
for (date_str in missing_dates) {
cmd <- sprintf('python 00_download_8band_pu_optimized.py "%s" --date "%s" --resolution 3 --cleanup', project_dir, date_str)
result <- system(cmd, ignore.stdout = FALSE, ignore.stderr = FALSE)
if (result == 0) {
download_count <- download_count + 1
} else {
download_failed <- download_failed + 1
}
}
# Change back to original directory
setwd(original_dir)
}
cat(sprintf("✓ Downloaded %d dates, %d failed\n", download_count, download_failed))
if (download_failed > 0) {
cat("⚠ Some downloads failed, but continuing pipeline\n")
}
# Force Script 10 to run ONLY if downloads actually succeeded (not just attempted)
if (download_count > 0) {
skip_10 <- FALSE
}
}, error = function(e) {
cat("✗ Error in planet download:", e$message, "\n")
pipeline_success <<- FALSE
})
# ==============================================================================
# SCRIPT 10: CREATE MASTER GRID AND SPLIT TIFFs
# ==============================================================================
if (pipeline_success && !skip_10) {
cat("\n========== RUNNING SCRIPT 10: CREATE MASTER GRID AND SPLIT TIFFs ==========\n")
tryCatch({
# Set environment variables for the script (Script 10 uses these for filtering)
assign("PROJECT", project_dir, envir = .GlobalEnv)
# Suppress verbose per-date output, show only summary
sink(nullfile())
source("r_app/10_create_master_grid_and_split_tiffs.R")
sink()
# Verify output
tiles_dir <- file.path("laravel_app", "storage", "app", project_dir, "daily_tiles_split", "5x5")
if (dir.exists(tiles_dir)) {
subdirs <- list.dirs(tiles_dir, full.names = FALSE, recursive = FALSE)
cat(sprintf("✓ Script 10 completed - created tiles for %d dates\n", length(subdirs)))
} else {
cat("✓ Script 10 completed\n")
}
}, error = function(e) {
sink()
cat("✗ Error in Script 10:", e$message, "\n")
pipeline_success <<- FALSE
})
} else if (skip_10) {
cat("\n========== SKIPPING SCRIPT 10 (tiles already exist) ==========\n")
}
# ==============================================================================
# SCRIPT 10: CI REPORT (SIMPLE)
# SCRIPT 20: CI EXTRACTION
# ==============================================================================
# Only run CI report for non-angata projects
if (pipeline_success && !skip_20) {
cat("\n========== RUNNING SCRIPT 20: CI EXTRACTION ==========\n")
tryCatch({
# Set environment variables for the script
assign("end_date", end_date, envir = .GlobalEnv)
assign("offset", offset, envir = .GlobalEnv)
assign("project_dir", project_dir, envir = .GlobalEnv)
assign("data_source", data_source, envir = .GlobalEnv)
source("r_app/20_ci_extraction.R")
main() # Call main() to execute the script with the environment variables
# Verify CI output was created
ci_daily_dir <- file.path("laravel_app", "storage", "app", project_dir, "Data", "extracted_ci", "daily_vals")
if (dir.exists(ci_daily_dir)) {
files <- list.files(ci_daily_dir, pattern = "\\.rds$")
cat(sprintf("✓ Script 20 completed - generated %d CI files\n", length(files)))
} else {
cat("✓ Script 20 completed\n")
}
}, error = function(e) {
cat("✗ Error in Script 20:", e$message, "\n")
pipeline_success <<- FALSE
})
} else if (skip_20) {
cat("\n========== SKIPPING SCRIPT 20 (CI already extracted) ==========\n")
}
# ==============================================================================
# SCRIPT 21: CONVERT CI RDS TO CSV
# ==============================================================================
if (pipeline_success && !skip_21) {
cat("\n========== RUNNING SCRIPT 21: CONVERT CI RDS TO CSV ==========\n")
tryCatch({
# Set environment variables for the script
assign("end_date", end_date, envir = .GlobalEnv)
assign("offset", offset, envir = .GlobalEnv)
assign("project_dir", project_dir, envir = .GlobalEnv)
source("r_app/21_convert_ci_rds_to_csv.R")
main() # Call main() to execute the script with the environment variables
# Verify CSV output was created
ci_csv_path <- file.path("laravel_app", "storage", "app", project_dir, "ci_extracted")
if (dir.exists(ci_csv_path)) {
csv_files <- list.files(ci_csv_path, pattern = "\\.csv$")
cat(sprintf("✓ Script 21 completed - converted to %d CSV files\n", length(csv_files)))
} else {
cat("✓ Script 21 completed\n")
}
}, error = function(e) {
cat("✗ Error in Script 21:", e$message, "\n")
pipeline_success <<- FALSE
})
} else if (skip_21) {
cat("\n========== SKIPPING SCRIPT 21 (CSV already created) ==========\n")
}
if (project_dir != "angata") {
cat("\n========== RUNNING SCRIPT 10: CI REPORT SIMPLE ==========\n")
if (pipeline_success) {
tryCatch({
rmarkdown::render("r_app/10_CI_report_with_kpis_simple.Rmd",
output_format = "word_document",
params = list(data_dir = project_dir, report_date = end_date_str))
cat("✓ Script 10 (Report) completed\n")
}, error = function(e) {
cat("✗ Error in Script 10 (Report):", e$message, "\n")
})
} else {
cat("✗ Skipping Script 10: Previous pipeline scripts failed\n")
}
} else {
cat("\n========== SKIPPING SCRIPT 10: CI REPORT SIMPLE (not applicable for angata) ==========\n")
}
# ==============================================================================
# SCRIPT 30: INTERPOLATE GROWTH MODEL
# ==============================================================================
if (pipeline_success) {
cat("\n========== RUNNING SCRIPT 30: INTERPOLATE GROWTH MODEL ==========\n")
tryCatch({
# Set environment variables for the script
assign("end_date", end_date, envir = .GlobalEnv)
assign("offset", offset, envir = .GlobalEnv)
assign("project_dir", project_dir, envir = .GlobalEnv)
assign("data_source", data_source, envir = .GlobalEnv)
source("r_app/30_interpolate_growth_model.R")
main() # Call main() to execute the script with the environment variables
# Verify interpolated output
growth_dir <- file.path("laravel_app", "storage", "app", project_dir, "growth_model_interpolated")
if (dir.exists(growth_dir)) {
files <- list.files(growth_dir, pattern = "\\.rds$|\\.csv$")
cat(sprintf("✓ Script 30 completed - generated %d growth model files\n", length(files)))
} else {
cat("✓ Script 30 completed\n")
}
}, error = function(e) {
cat("✗ Error in Script 30:", e$message, "\n")
pipeline_success <<- FALSE
})
}
# ==============================================================================
# PYTHON 31: HARVEST IMMINENT WEEKLY
# ==============================================================================
if (pipeline_success) {
cat("\n========== RUNNING PYTHON 31: HARVEST IMMINENT WEEKLY ==========\n")
tryCatch({
# Run Python script in pytorch_gpu conda environment
# Script expects positional project name (not --project flag)
# Run from smartcane root so conda can find the environment
cmd <- sprintf('conda run -n pytorch_gpu python python_app/31_harvest_imminent_weekly.py %s', project_dir)
cat("DEBUG: Running command:", cmd, "\n")
result <- system(cmd)
if (result == 0) {
# Verify harvest output - check for THIS WEEK's specific file
current_week <- as.numeric(format(end_date, "%V"))
current_year <- as.numeric(format(end_date, "%Y"))
expected_file <- file.path("laravel_app", "storage", "app", project_dir, "reports", "kpis", "field_stats",
sprintf("%s_harvest_imminent_week_%02d_%d.csv", project_dir, current_week, current_year))
if (file.exists(expected_file)) {
cat(sprintf("✓ Script 31 completed - generated harvest imminent file for week %02d\n", current_week))
} else {
cat("✓ Script 31 completed (check if harvest.xlsx is available)\n")
}
} else {
cat("⚠ Script 31 completed with errors (check harvest.xlsx availability)\n")
}
}, error = function(e) {
setwd(original_dir)
cat("⚠ Script 31 error:", e$message, "\n")
})
}
# ==============================================================================
# SCRIPT 40: MOSAIC CREATION
# ==============================================================================
if (pipeline_success && !skip_40) {
cat("\n========== RUNNING SCRIPT 40: MOSAIC CREATION ==========\n")
tryCatch({
# Set environment variables for the script
assign("end_date", end_date, envir = .GlobalEnv)
assign("offset", offset, envir = .GlobalEnv)
assign("project_dir", project_dir, envir = .GlobalEnv)
assign("data_source", data_source, envir = .GlobalEnv)
source("r_app/40_mosaic_creation.R")
main() # Call main() to execute the script with the environment variables
# Verify mosaic output
mosaic_dir <- file.path("laravel_app", "storage", "app", project_dir, "weekly_tile_max", "5x5")
if (dir.exists(mosaic_dir)) {
files <- list.files(mosaic_dir, pattern = "\\.tif$")
cat(sprintf("✓ Script 40 completed - generated %d mosaic files\n", length(files)))
} else {
cat("✓ Script 40 completed\n")
}
}, error = function(e) {
cat("✗ Error in Script 40:", e$message, "\n")
pipeline_success <<- FALSE
})
} else if (skip_40) {
cat("\n========== SKIPPING SCRIPT 40 (mosaics already created) ==========\n")
}
# ==============================================================================
# SCRIPT 80: CALCULATE KPIs
# ==============================================================================
if (pipeline_success) { # Always run Script 80 - it calculates KPIs for the current week
cat("\n========== RUNNING SCRIPT 80: CALCULATE KPIs ==========\n")
tryCatch({
# Set environment variables for the script (Script 80's main() uses these as fallbacks)
# NOTE: end_date is already a Date, just assign directly without as.Date()
assign("end_date", end_date, envir = .GlobalEnv)
assign("end_date_str", end_date_str, envir = .GlobalEnv)
assign("offset", offset, envir = .GlobalEnv)
assign("project_dir", project_dir, envir = .GlobalEnv)
assign("data_source", data_source, envir = .GlobalEnv)
source("r_app/80_calculate_kpis.R")
main() # Call main() to execute the script with the environment variables
# Verify KPI output
kpi_dir <- file.path("laravel_app", "storage", "app", project_dir, "reports", "kpis", "field_stats")
if (dir.exists(kpi_dir)) {
files <- list.files(kpi_dir, pattern = "\\.csv$|\\.json$")
cat(sprintf("✓ Script 80 completed - generated %d KPI files\n", length(files)))
} else {
cat("✓ Script 80 completed\n")
}
}, error = function(e) {
cat("✗ Error in Script 80:", e$message, "\n")
cat("Full error:\n")
print(e)
pipeline_success <<- FALSE
})
}
# ==============================================================================
# SUMMARY
@ -154,4 +396,9 @@ cat("\n========== PIPELINE COMPLETE ==========\n")
cat(sprintf("Project: %s\n", project_dir))
cat(sprintf("End Date: %s\n", end_date_str))
cat(sprintf("Offset: %d days\n", offset))
cat("Scripts executed: 02, 03, 04, 09 (KPIs), 09 (Weekly), 10 (CI Report)\n")
if (pipeline_success) {
cat("Status: ✓ All scripts completed successfully\n")
} else {
cat("Status: ✗ Pipeline failed - check errors above\n")
}
cat("Pipeline sequence: Python Download → R 10 → R 20 → R 21 → R 30 → Python 31 → R 40 → R 80\n")

View file

@ -0,0 +1,447 @@
# SmartCane Processing Pipeline - Complete Script Overview
## Pipeline Execution Order
## Complete Pipeline Mermaid Diagram
```mermaid
graph TD
%% ===== INPUTS =====
API["🔑 Planet API<br/>Credentials"]
GeoJSON["🗺️ pivot.geojson<br/>(Field Boundaries)"]
HarvestIn["📊 harvest.xlsx<br/>(from Stage 23)"]
%% ===== STAGE 00: DOWNLOAD =====
Stage00["<b>Stage 00: Python</b><br/>00_download_8band_pu_optimized.py"]
Out00["📦 merged_tif_8b/<br/>YYYY-MM-DD.tif<br/>(4-band uint16)"]
%% ===== STAGE 10: OPTIONAL TILING =====
Stage10["<b>Stage 10: R</b><br/>10_create_master_grid...<br/>(Optional)"]
Out10["📦 daily_tiles_split/5x5/<br/>YYYY-MM-DD/*.tif<br/>(25 tiles)"]
%% ===== STAGE 20: CI EXTRACTION =====
Stage20["<b>Stage 20: R</b><br/>20_ci_extraction.R"]
Out20a["📦 combined_CI_data.rds<br/>(wide: fields × dates)"]
Out20b["📦 daily RDS files<br/>(per-date stats)"]
%% ===== STAGE 21: RDS → CSV =====
Stage21["<b>Stage 21: R</b><br/>21_convert_ci_rds_to_csv.R"]
Out21["📦 ci_data_for_python.csv<br/>(long format + DOY)"]
%% ===== STAGE 22: BASELINE HARVEST =====
Stage22["<b>Stage 22: Python</b><br/>22_harvest_baseline_prediction.py<br/>(RUN ONCE)"]
Out22["📦 harvest_production_export.xlsx<br/>(baseline predictions)"]
%% ===== STAGE 23: HARVEST FORMAT =====
Stage23["<b>Stage 23: Python</b><br/>23_convert_harvest_format.py"]
Out23["📦 harvest.xlsx<br/>(standard format)<br/>→ Feeds back to Stage 80"]
%% ===== STAGE 30: GROWTH MODEL =====
Stage30["<b>Stage 30: R</b><br/>30_interpolate_growth_model.R"]
Out30["📦 All_pivots_Cumulative_CI...<br/>_quadrant_year_v2.rds<br/>(interpolated daily)"]
%% ===== STAGE 31: WEEKLY HARVEST =====
Stage31["<b>Stage 31: Python</b><br/>31_harvest_imminent_weekly.py<br/>(Weekly)"]
Out31["📦 harvest_imminent_weekly.csv<br/>(probabilities)"]
%% ===== STAGE 40: MOSAIC =====
Stage40["<b>Stage 40: R</b><br/>40_mosaic_creation.R"]
Out40["📦 weekly_mosaic/<br/>week_WW_YYYY.tif<br/>(5-band composite)"]
%% ===== STAGE 80: KPI =====
Stage80["<b>Stage 80: R</b><br/>80_calculate_kpis.R"]
Out80a["📦 field_analysis_week{WW}.xlsx"]
Out80b["📦 kpi_summary_tables_week{WW}.rds"]
%% ===== STAGE 90: REPORT =====
Stage90["<b>Stage 90: R/RMarkdown</b><br/>90_CI_report_with_kpis_simple.Rmd"]
Out90["📦 SmartCane_Report_week{WW}_{YYYY}.docx<br/>(FINAL OUTPUT)"]
%% ===== CONNECTIONS: INPUTS TO STAGE 00 =====
API --> Stage00
GeoJSON --> Stage00
%% ===== STAGE 00 → 10 OR 20 =====
Stage00 --> Out00
Out00 --> Stage10
Out00 --> Stage20
%% ===== STAGE 10 → 20 =====
Stage10 --> Out10
Out10 --> Stage20
%% ===== STAGE 20 → 21, 30, 40 =====
GeoJSON --> Stage20
Stage20 --> Out20a
Stage20 --> Out20b
Out20a --> Stage21
Out20a --> Stage30
Out00 --> Stage40
%% ===== STAGE 21 → 22, 31 =====
Stage21 --> Out21
Out21 --> Stage22
Out21 --> Stage31
%% ===== STAGE 22 → 23 =====
Stage22 --> Out22
Out22 --> Stage23
%% ===== STAGE 23 → 80 & FEEDBACK =====
Stage23 --> Out23
Out23 -.->|"Feeds back<br/>(Season context)"| Stage80
%% ===== STAGE 30 → 80 =====
Stage30 --> Out30
Out30 --> Stage80
%% ===== STAGE 31 (PARALLEL) =====
Stage31 --> Out31
%% ===== STAGE 40 → 80, 90 =====
Stage40 --> Out40
Out40 --> Stage80
Out40 --> Stage90
%% ===== STAGE 80 → 90 =====
Stage80 --> Out80a
Stage80 --> Out80b
Out80a --> Stage90
Out80b --> Stage90
%% ===== STAGE 90 FINAL =====
Stage90 --> Out90
%% ===== ADDITIONAL INPUTS =====
HarvestIn --> Stage30
HarvestIn --> Stage80
GeoJSON --> Stage30
GeoJSON --> Stage40
GeoJSON --> Stage80
%% ===== STYLING =====
classDef input fill:#e3f2fd,stroke:#1976d2,stroke-width:2px
classDef pyStage fill:#fff3e0,stroke:#f57c00,stroke-width:2px
classDef rStage fill:#f3e5f5,stroke:#7b1fa2,stroke-width:2px
classDef output fill:#e8f5e9,stroke:#388e3c,stroke-width:2px
classDef finalOutput fill:#ffebee,stroke:#c62828,stroke-width:3px
class API,GeoJSON,HarvestIn input
class Stage00,Stage22,Stage23,Stage31 pyStage
class Stage10,Stage20,Stage21,Stage30,Stage40,Stage80,Stage90 rStage
class Out00,Out10,Out20a,Out20b,Out21,Out22,Out30,Out31,Out40,Out80a,Out80b output
class Out23,Out90 finalOutput
```
---
## Detailed Stage Descriptions
```
Stage 00: PYTHON - Download Satellite Data
└─ 00_download_8band_pu_optimized.py
INPUT: Planet API credentials, field boundaries (pivot.geojson), date range
OUTPUT: laravel_app/storage/app/{project}/merged_tif_8b/{YYYY-MM-DD}.tif (4-band uint16)
RUN FREQUENCY: Daily or as-needed
NOTES: 8-band includes UDM cloud masking, optimized for PU cost
Stage 10: R - (Optional) Create Master Grid & Split TIFFs into Tiles
└─ 10_create_master_grid_and_split_tiffs.R
INPUT: Daily GeoTIFFs from merged_tif_8b/
OUTPUT: laravel_app/storage/app/{project}/daily_tiles_split/5x5/{YYYY-MM-DD}/*.tif
RUN FREQUENCY: Optional - only if tile-based processing desired
NOTES: Creates 25 tiles per day for memory-efficient processing; 5x5 grid hardcoded
Stage 20: R - Extract Canopy Index (CI) from Daily Imagery
└─ 20_ci_extraction.R
INPUT: Daily GeoTIFFs (merged_tif_8b/ or daily_tiles_split/)
Field boundaries (pivot.geojson)
Data source parameter (merged_tif_8b, merged_tif, merged_final_tif)
OUTPUT: RDS files:
- laravel_app/storage/app/{project}/Data/extracted_ci/daily_vals/extracted_{YYYY-MM-DD}_{suffix}.rds
- laravel_app/storage/app/{project}/Data/extracted_ci/cumulative_vals/combined_CI_data.rds (wide format)
RUN FREQUENCY: Daily or on-demand
COMMAND: Rscript 20_ci_extraction.R [end_date] [offset] [project_dir] [data_source]
EXAMPLE: Rscript 20_ci_extraction.R 2026-01-02 7 angata merged_tif_8b
NOTES: Auto-detects tiles if daily_tiles_split/ exists; outputs cumulative CI (fields × dates)
Stage 21: R - Convert CI RDS to CSV for Python Harvest Detection
└─ 21_convert_ci_rds_to_csv.R
INPUT: combined_CI_data.rds (from Stage 20)
OUTPUT: laravel_app/storage/app/{project}/Data/extracted_ci/ci_data_for_python/ci_data_for_python.csv
RUN FREQUENCY: After Stage 20
COMMAND: Rscript 21_convert_ci_rds_to_csv.R [project_dir]
EXAMPLE: Rscript 21_convert_ci_rds_to_csv.R angata
NOTES: Converts wide RDS (fields × dates) to long CSV; interpolates missing dates; adds DOY column
Stage 22: PYTHON - Baseline Harvest Prediction (LSTM Model 307)
└─ 22_harvest_baseline_prediction.py
INPUT: ci_data_for_python.csv (complete historical CI data)
OUTPUT: laravel_app/storage/app/{project}/Data/HarvestData/harvest_production_export.xlsx
RUN FREQUENCY: ONCE - establishes ground truth baseline for all fields
COMMAND: python 22_harvest_baseline_prediction.py [project_name]
EXAMPLE: python 22_harvest_baseline_prediction.py angata
NOTES: Two-step detection (Phase 1: growing window, Phase 2: ±40 day argmax refinement)
Tuned parameters: threshold=0.3, consecutive_days=2
Uses LSTM Model 307 dual output heads (imminent + detected)
Stage 23: PYTHON - Convert Harvest Format to Standard Structure
└─ 23_convert_harvest_format.py
INPUT: harvest_production_export.xlsx (from Stage 22)
CI data date range (determines season_start for first season)
OUTPUT: laravel_app/storage/app/{project}/Data/harvest.xlsx (standard format)
RUN FREQUENCY: After Stage 22
COMMAND: python 23_convert_harvest_format.py [project_name]
EXAMPLE: python 23_convert_harvest_format.py angata
NOTES: Converts to standard harvest.xlsx format with columns:
field, sub_field, year, season, season_start, season_end, age, sub_area, tonnage_ha
Season format: "Data{year} : {field}"
Only includes completed seasons (with season_end filled)
Stage 30: R - Growth Model Interpolation (Smooth CI Time Series)
└─ 30_interpolate_growth_model.R
INPUT: combined_CI_data.rds (from Stage 20)
harvest.xlsx (optional, for seasonal context)
OUTPUT: laravel_app/storage/app/{project}/Data/extracted_ci/cumulative_vals/
All_pivots_Cumulative_CI_quadrant_year_v2.rds
RUN FREQUENCY: Weekly or after CI extraction updates
COMMAND: Rscript 30_interpolate_growth_model.R [project_dir]
EXAMPLE: Rscript 30_interpolate_growth_model.R angata
NOTES: Linear interpolation across gaps; calculates daily change and cumulative CI
Outputs long-format data (Date, DOY, field, value, season, etc.)
Stage 31: PYTHON - Weekly Harvest Monitoring (Real-Time Alerts)
└─ 31_harvest_imminent_weekly.py
INPUT: ci_data_for_python.csv (recent CI data, last ~300 days)
harvest_production_export.xlsx (optional baseline reference)
OUTPUT: laravel_app/storage/app/{project}/Data/HarvestData/harvest_imminent_weekly.csv
RUN FREQUENCY: Weekly or daily for operational alerts
COMMAND: python 31_harvest_imminent_weekly.py [project_name]
EXAMPLE: python 31_harvest_imminent_weekly.py angata
NOTES: Single-run inference on recent data; outputs probabilities (imminent_prob, detected_prob)
Used for real-time decision support; compared against baseline from Stage 22
Stage 40: R - Create Weekly 5-Band Mosaics
└─ 40_mosaic_creation.R
INPUT: Daily GeoTIFFs (merged_tif_8b/ or daily_tiles_split/)
Field boundaries (pivot.geojson)
OUTPUT: laravel_app/storage/app/{project}/weekly_mosaic/week_{WW}_{YYYY}.tif
RUN FREQUENCY: Weekly
COMMAND: Rscript 40_mosaic_creation.R [end_date] [offset] [project_dir]
EXAMPLE: Rscript 40_mosaic_creation.R 2026-01-14 7 angata
NOTES: Composites daily images using MAX function; 5 bands (R, G, B, NIR, CI)
Automatically selects images with acceptable cloud coverage
Output uses ISO week numbering (week_WW_YYYY)
Stage 80: R - Calculate KPIs & Per-Field Analysis
└─ 80_calculate_kpis.R
INPUT: Weekly mosaic (from Stage 40)
Growth model data (from Stage 30)
Field boundaries (pivot.geojson)
Harvest data (harvest.xlsx)
OUTPUT: laravel_app/storage/app/{project}/reports/
- {project}_field_analysis_week{WW}.xlsx
- {project}_kpi_summary_tables_week{WW}.rds
RUN FREQUENCY: Weekly
COMMAND: Rscript 80_calculate_kpis.R [end_date] [project_dir] [offset_days]
EXAMPLE: Rscript 80_calculate_kpis.R 2026-01-14 angata 7
NOTES: Parallel processing for 1000+ fields; calculates:
- Per-field uniformity (CV), phase assignment, growth trends
- Status triggers (germination, rapid growth, disease, harvest imminence)
- Farm-level KPI metrics (6 high-level indicators)
TEST_MODE=TRUE uses only recent weeks for development
Stage 90: R (RMarkdown) - Generate Executive Report (Word Document)
└─ 90_CI_report_with_kpis_simple.Rmd
INPUT: Weekly mosaic (from Stage 40)
KPI summary data (from Stage 80)
Field analysis (from Stage 80)
Field boundaries & harvest data (for context)
OUTPUT: laravel_app/storage/app/{project}/reports/
SmartCane_Report_week{WW}_{YYYY}.docx (PRIMARY OUTPUT)
SmartCane_Report_week{WW}_{YYYY}.html (optional)
RUN FREQUENCY: Weekly
RENDERING: R/RMarkdown with officer + flextable packages
NOTES: Executive summary with KPI overview, phase distribution, status triggers
Field-by-field detail pages with CI metrics and interpretation guides
Automatic unit conversion (hectares ↔ acres)
```
---
## Data Storage & Persistence
All data persists to the file system. No database writes occur during analysis; the database is accessed read-only, and only for metadata.
```
laravel_app/storage/app/{project}/
├── Data/
│ ├── pivot.geojson # Field boundaries (read-only input)
│ ├── harvest.xlsx # Season dates & yield (standard format from Stage 23)
│ ├── vrt/ # Virtual raster files (daily VRTs from Stage 20)
│ │ └── YYYY-MM-DD.vrt
│ ├── extracted_ci/
│ │ ├── ci_data_for_python/
│ │ │ └── ci_data_for_python.csv # CSV for Python (from Stage 21)
│ │ ├── daily_vals/
│ │ │ └── extracted_YYYY-MM-DD_{suffix}.rds # Daily field CI stats (from Stage 20)
│ │ └── cumulative_vals/
│ │ ├── combined_CI_data.rds # Cumulative CI, wide format (from Stage 20)
│ │ └── All_pivots_Cumulative_CI_quadrant_year_v2.rds # Interpolated daily (from Stage 30)
│ └── HarvestData/
│ ├── harvest_production_export.xlsx # Baseline harvest predictions (from Stage 22)
│ └── harvest_imminent_weekly.csv # Weekly monitoring output (from Stage 31)
├── merged_tif_8b/ # Raw 4-band satellite imagery (Stage 00 output)
│ └── YYYY-MM-DD.tif # 4 bands: R, G, B, NIR (uint16 with UDM cloud masking)
├── daily_tiles_split/ # (Optional) Tile-based processing (Stage 10 output)
│ ├── 5x5/
│ │ ├── tiling_config.json # Metadata about tiling parameters
│ │ └── YYYY-MM-DD/ # Date-specific folder
│ │ └── YYYY-MM-DD_{00-24}.tif # 25 tiles per day
├── weekly_mosaic/ # Weekly composite mosaics (Stage 40 output)
│ └── week_WW_YYYY.tif # 5 bands: R, G, B, NIR, CI (composite)
└── reports/ # Analysis outputs & reports (Stage 80, 90 outputs)
├── SmartCane_Report_week{WW}_{YYYY}.docx # FINAL REPORT (Stage 90)
├── SmartCane_Report_week{WW}_{YYYY}.html # Alternative format
├── {project}_field_analysis_week{WW}.xlsx # Field-by-field data (Stage 80)
├── {project}_kpi_summary_tables_week{WW}.rds # Summary RDS (Stage 80)
└── kpis/
└── week_WW_YYYY/ # Week-specific KPI folder
```
---
## Key File Formats
| Format | Stage | Purpose | Example |
|--------|-------|---------|---------|
| `.tif` (GeoTIFF) | 00, 10, 40 | Geospatial raster imagery | `2026-01-14.tif` (4-band), `week_02_2026.tif` (5-band) |
| `.vrt` (Virtual Raster) | 20 | Virtual pointer to TIFFs | `2026-01-14.vrt` |
| `.rds` (R Binary) | 20, 21, 30, 80 | R serialized data objects | `combined_CI_data.rds`, `All_pivots_Cumulative_CI_quadrant_year_v2.rds` |
| `.csv` (Comma-Separated) | 21, 31 | Tabular data for Python | `ci_data_for_python.csv`, `harvest_imminent_weekly.csv` |
| `.xlsx` (Excel) | 22, 23, 80 | Tabular reports & harvest data | `harvest.xlsx`, `harvest_production_export.xlsx`, field analysis |
| `.docx` (Word) | 90 | Executive report (final output) | `SmartCane_Report_week02_2026.docx` |
| `.json` | 10 | Tiling metadata | `tiling_config.json` |
| `.geojson` | Input | Field boundaries (read-only) | `pivot.geojson` |
---
## Script Dependencies & Utility Files
```
parameters_project.R
├─ Loaded by: 20_ci_extraction.R, 30_interpolate_growth_model.R,
│ 40_mosaic_creation.R, 80_calculate_kpis.R, 90_CI_report_with_kpis_simple.Rmd
└─ Purpose: Initializes project config (paths, field boundaries, harvest data)
harvest_date_pred_utils.py
├─ Used by: 22_harvest_baseline_prediction.py, 23_convert_harvest_format.py, 31_harvest_imminent_weekly.py
└─ Purpose: LSTM model loading, feature extraction, two-step harvest detection
20_ci_extraction_utils.R
├─ Used by: 20_ci_extraction.R
└─ Purpose: CI calculation, field masking, RDS I/O, tile detection
30_growth_model_utils.R
├─ Used by: 30_interpolate_growth_model.R
└─ Purpose: Linear interpolation, daily metrics, seasonal grouping
40_mosaic_creation_utils.R, 40_mosaic_creation_tile_utils.R
├─ Used by: 40_mosaic_creation.R
└─ Purpose: Weekly composite creation, cloud assessment, raster masking
kpi_utils.R
├─ Used by: 80_calculate_kpis.R
└─ Purpose: Per-field statistics, phase assignment, trigger detection
report_utils.R
├─ Used by: 90_CI_report_with_kpis_simple.Rmd
└─ Purpose: Report building, table formatting, Word document generation
```
---
## Command-Line Execution Examples
### Daily/Weekly Workflow
```bash
# Stage 00: Download today's satellite data
cd python_app
python 00_download_8band_pu_optimized.py angata --cleanup
# Stage 20: Extract CI from daily imagery (last 7 days)
cd ../r_app
Rscript 20_ci_extraction.R 2026-01-14 7 angata merged_tif_8b
# Stage 21: Convert CI to CSV for harvest detection
Rscript 21_convert_ci_rds_to_csv.R angata
# Stage 31: Weekly harvest monitoring (real-time alerts)
cd ../python_app
python 31_harvest_imminent_weekly.py angata
# Back to R for mosaic and KPIs
cd ../r_app
Rscript 40_mosaic_creation.R 2026-01-14 7 angata
Rscript 80_calculate_kpis.R 2026-01-14 angata 7
# Stage 90: Generate report
Rscript -e "rmarkdown::render('90_CI_report_with_kpis_simple.Rmd')"
```
### One-Time Setup (Baseline Harvest Detection)
```bash
# Only run ONCE to establish baseline
cd python_app
python 22_harvest_baseline_prediction.py angata
# Convert to standard format
python 23_convert_harvest_format.py angata
```
---
## Processing Notes
### CI Extraction (Stage 20)
- Calculates CI = (NIR - Green) / (NIR + Green)
- Supports both 4-band and 8-band imagery with auto-detection
- Handles cloud masking via UDM band (8-band) or manual thresholding (4-band)
- Outputs cumulative RDS in wide format (fields × dates) for fast lookups
### Growth Model (Stage 30)
- Linear interpolation across missing dates
- Maintains seasonal context for agricultural lifecycle tracking
- Outputs long-format data for trend analysis
### Harvest Detection (Stages 22 & 31)
- **Model 307**: Unidirectional LSTM with dual output heads
- Imminent Head: Probability field will be harvestable in next 28 days
- Detected Head: Probability of immediate harvest event
- **Stage 22 (Baseline)**: Two-step detection on complete historical data
- Phase 1: Growing window expansion (real-time simulation)
- Phase 2: ±40 day refinement (argmax harvest signal)
- **Stage 31 (Weekly)**: Single-run inference on recent data (~300 days)
- Compares against baseline for anomaly detection
### KPI Calculation (Stage 80)
- **Per-field metrics**: Uniformity (CV), phase, growth trends, 4-week trends
- **Status triggers**: Germination, rapid growth, slow growth, non-uniform, weed pressure, harvest imminence
- **Farm-level KPIs**: 6 high-level indicators for executive summary
- **Parallel processing**: ~1000+ fields processed in <5 minutes
---
## Future Enhancements
- **Real-Time Monitoring**: Daily harvest probability updates integrated into web dashboard
- **SAR Integration**: Radar satellite data (Sentinel-1) for all-weather monitoring
- **IoT Sensors**: Ground-based soil moisture and weather integration
- **Advanced Yield Models**: Enhanced harvest forecasting with satellite + ground truth
- **Automated Alerts**: WhatsApp/email dispatch of critical agricultural advice

View file

@ -1,9 +1,9 @@
<!-- filepath: c:\Users\timon\Resilience BV\4020 SCane ESA DEMO - Documenten\General\4020 SCDEMO Team\4020 TechnicalData\WP3\smartcane\r_app\system_architecture.md -->
# SmartCane System Architecture - R Pipeline & File-Based Processing
# SmartCane System Architecture - Python + R Pipeline & File-Based Processing
## Overview
The SmartCane system is a file-based agricultural intelligence platform that processes satellite imagery through a multi-stage R-script pipeline. Raw satellite imagery flows through sequential processing steps (CI extraction, growth model interpolation, mosaic creation, KPI analysis) with outputs persisted as GeoTIFFs, RDS files, and Excel/Word reports.
The SmartCane system is a file-based agricultural intelligence platform that processes satellite imagery through sequential Python and R scripts. Raw satellite imagery is downloaded via Planet API (Python), then flows through R processing stages (CI extraction, growth model interpolation, mosaic creation, KPI analysis) and Python harvest-detection stages, with outputs persisted as GeoTIFFs, RDS files, Excel sheets, and Word reports. Harvest monitoring is performed via ML-based harvest detection using LSTM models trained on historical CI sequences.
## Processing Pipeline Overview