Refactor full pipeline script to include intelligent checking of existing outputs and dynamic execution of scripts. Added new Python scripts for RGB validation and evaluation template creation. Enhanced error handling and logging throughout the pipeline.

This commit is contained in:
Timon 2026-01-27 08:58:06 +01:00
parent 4143bdf4d7
commit 9a55d2fcf8
14 changed files with 1035 additions and 568 deletions

View file

@ -1,4 +1,4 @@
""" r"""
Script: 02_harvest_imminent_weekly.py Script: 02_harvest_imminent_weekly.py
Purpose: WEEKLY MONITORING - Run WEEKLY/DAILY to get real-time harvest status for all fields Purpose: WEEKLY MONITORING - Run WEEKLY/DAILY to get real-time harvest status for all fields
@ -38,12 +38,12 @@ Use Cases:
- Feed into 09b script for weekly dashboard reports - Feed into 09b script for weekly dashboard reports
Usage: Usage:
python 02_harvest_imminent_weekly.py [project_name] python python_app/31_harvest_imminent_weekly.py angata
Examples: Examples:
python 02_harvest_imminent_weekly.py angata python python_app/31_harvest_imminent_weekly.py angata
python 02_harvest_imminent_weekly.py esa python python_app/31_harvest_imminent_weekly.py esa
python 02_harvest_imminent_weekly.py chemba python python_app/31_harvest_imminent_weekly.py chemba
If no project specified, defaults to 'angata' If no project specified, defaults to 'angata'
""" """
@ -264,7 +264,7 @@ def main():
# [3] Load model (from python_app directory) # [3] Load model (from python_app directory)
print("\n[3/5] Loading Model 307...") print("\n[3/5] Loading Model 307...")
model_dir = Path(".") # Current directory is python_app/, contains model.pt, config.json, scalers.pkl model_dir = Path("python_app") # Model files located in python_app/ directory
model, config, scalers = load_model_and_config(model_dir) model, config, scalers = load_model_and_config(model_dir)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu") device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f" Device: {device}") print(f" Device: {device}")

View file

@ -1,23 +1,25 @@
#!/usr/bin/env python #!/usr/bin/env python
""" """
Batch RGB Validation for Top 50 Largest Fields Batch RGB Validation for Top 100 Largest Fields - V3
Generates 5x3 RGB temporal grids for the latest complete harvest season of the 50 largest fields. Same as v1 but with dynamic image selection (checks for actual data, skips empty/black images).
Generates 5x3 RGB temporal grids for the latest complete harvest season of the 100 largest fields.
Uses actual season_end dates from harvest.xlsx for visual validation of field conditions at harvest. Uses actual season_end dates from harvest.xlsx for visual validation of field conditions at harvest.
Configuration: Configuration:
- GeoJSON: pivot.geojson (defines field boundaries and sizes) - GeoJSON: pivot.geojson (defines field boundaries and sizes)
- Harvest data: harvest.xlsx (season_end dates for completed harvests) - Harvest data: harvest.xlsx (season_end dates for completed harvests)
- CI data: ci_data_for_python.csv
- Output: RGB directory with field_name_YYYYMMDD_harvest_rgb.png - Output: RGB directory with field_name_YYYYMMDD_harvest_rgb.png
Usage: Usage:
python batch_rgb_validation_top_fields.py python batch_rgb_validation_top_fields_v3.py --field 1
python batch_rgb_validation_top_fields_v3.py
Output: Output:
- Saves 5x3 RGB grids to: laravel_app/storage/app/angata/RGB/ - Saves 5x3 RGB grids to: laravel_app/storage/app/angata/RGB/
- Filenames: field_<NAME>_<YYYYMMDD>_harvest_rgb.png - Filenames: field_<NAME>_<YYYYMMDD>_harvest_rgb.png
- Each grid shows 15 images at 7-day intervals around the season_end date - Each grid shows 15 images around the harvest date (dynamic date selection, skips empty images)
""" """
import json import json
@ -26,6 +28,7 @@ import pandas as pd
from pathlib import Path from pathlib import Path
from datetime import datetime, timedelta from datetime import datetime, timedelta
import sys import sys
import argparse
# Add parent directory to path for imports # Add parent directory to path for imports
sys.path.insert(0, str(Path(__file__).parent)) sys.path.insert(0, str(Path(__file__).parent))
@ -189,16 +192,26 @@ def load_harvest_dates_from_xlsx(harvest_xlsx_path, top_50_fields_df):
def main(): def main():
parser = argparse.ArgumentParser(description='RGB validation of harvest dates using satellite imagery (v3 - dynamic)')
parser.add_argument('--field', type=str, default=None, help='Specific field ID to validate (e.g., "1" or "10022")')
parser.add_argument('--project', type=str, default='angata', help='Project name (default: angata)')
args = parser.parse_args()
print("="*90) print("="*90)
print("BATCH RGB VALIDATION - TOP 50 LARGEST FIELDS") if args.field:
print("Visual inspection of latest harvest dates from harvest.xlsx using RGB imagery") print(f"RGB VALIDATION V3 - SINGLE FIELD: {args.field}")
else:
print("RGB VALIDATION V3 - TOP 50 LARGEST FIELDS")
print("Visual inspection of harvest dates from harvest.xlsx using RGB imagery (dynamic selection)")
print("="*90) print("="*90)
# Configuration # Configuration
geojson_path = Path("laravel_app/storage/app/angata/Data/pivot.geojson") project = args.project
harvest_xlsx = Path("laravel_app/storage/app/angata/Data/harvest.xlsx") geojson_path = Path(f"laravel_app/storage/app/{project}/Data/pivot.geojson")
output_dir = Path("laravel_app/storage/app/angata/RGB") harvest_xlsx = Path(f"laravel_app/storage/app/{project}/Data/harvest.xlsx")
tiff_dir = Path("laravel_app/storage/app/angata/merged_final_tif/5x5") output_dir = Path(f"laravel_app/storage/app/{project}/RGB")
tiff_dir = Path(f"laravel_app/storage/app/{project}/merged_final_tif/5x5")
# Verify paths # Verify paths
if not geojson_path.exists(): if not geojson_path.exists():
@ -213,18 +226,83 @@ def main():
output_dir.mkdir(parents=True, exist_ok=True) output_dir.mkdir(parents=True, exist_ok=True)
# Step 1: Load GeoJSON and get top 50 largest fields # Handle single field mode
print("\n[1/4] Loading GeoJSON and identifying top 50 largest fields...") if args.field:
print(f"\n[1/3] Loading harvest data for field {args.field}...")
harvest_df = pd.read_excel(harvest_xlsx)
harvest_df['season_end'] = pd.to_datetime(harvest_df['season_end'], errors='coerce')
harvest_df['field'] = harvest_df['field'].astype(str).str.strip()
field_records = harvest_df[harvest_df['field'] == args.field]
field_records = field_records[field_records['season_end'].notna()]
if len(field_records) == 0:
print(f"✗ No harvest data found for field {args.field}")
return
# Get latest harvest for this field
latest_idx = field_records['season_end'].idxmax()
latest_row = field_records.loc[latest_idx]
harvest_date = latest_row['season_end']
print(f" ✓ Found harvest: {harvest_date.strftime('%Y-%m-%d')}")
# Load field name from GeoJSON
print(f"\n[2/3] Loading field name from GeoJSON...")
with open(geojson_path) as f:
geojson_data = json.load(f)
field_name = f"field_{args.field}"
for feature in geojson_data.get('features', []):
props = feature.get('properties', {})
if str(props.get('field', '')) == args.field:
field_name = props.get('name', field_name)
break
print(f" ✓ Field name: {field_name}")
# Generate RGB grid
print(f"\n[3/3] Generating RGB validation grid (v3 dynamic)...")
results = generate_rgb_grids(
field_data=None,
field_id=args.field,
registered_harvest_dates=[],
predicted_harvest_dates=[
{
'harvest_date': harvest_date,
'model_name': 'harvest_xlsx'
}
],
output_dir=str(output_dir),
tiff_dir=str(tiff_dir),
geojson_path=str(geojson_path)
)
print("\n" + "="*90)
if results['predicted']:
print(f"✓ RGB grid generated successfully!")
print(f" Field: {field_name} (ID: {args.field})")
print(f" Harvest date: {harvest_date.strftime('%Y-%m-%d')}")
print(f" Output: {output_dir}")
else:
print(f"⚠ No RGB grid generated (no imagery available)")
print("="*90)
return
# Batch mode for top 100 fields
print(f"\n[1/4] Loading GeoJSON and identifying top 100 largest fields...")
fields_df = load_geojson_and_calculate_areas(geojson_path) fields_df = load_geojson_and_calculate_areas(geojson_path)
if fields_df is None: if fields_df is None:
return return
top_50_fields = fields_df.head(50) top_100_fields = fields_df.head(100)
print(f" ✓ Selected {len(top_50_fields)} largest fields for processing") print(f" ✓ Selected {len(top_100_fields)} largest fields for processing")
# Step 2: Load harvest dates from Excel # Step 2: Load harvest dates from Excel
print("\n[2/4] Loading harvest dates from Excel (latest complete seasons)...") print("\n[2/4] Loading harvest dates from Excel (latest complete seasons)...")
harvest_dates = load_harvest_dates_from_xlsx(harvest_xlsx, top_50_fields) harvest_dates = load_harvest_dates_from_xlsx(harvest_xlsx, top_100_fields)
if len(harvest_dates) == 0: if len(harvest_dates) == 0:
print("✗ No harvest dates found in Excel file") print("✗ No harvest dates found in Excel file")
@ -237,7 +315,7 @@ def main():
print(f" ... and {len(harvest_dates) - 5} more") print(f" ... and {len(harvest_dates) - 5} more")
# Step 3: Generate RGB grids for each field # Step 3: Generate RGB grids for each field
print("\n[3/4] Generating RGB validation grids...") print("\n[3/4] Generating RGB validation grids (v3 dynamic)...")
rgb_count = 0 rgb_count = 0
for idx, (field_id, harvest_info) in enumerate(harvest_dates.items(), 1): for idx, (field_id, harvest_info) in enumerate(harvest_dates.items(), 1):

View file

@ -0,0 +1,193 @@
"""
Create an Excel evaluation template for RGB harvest date predictions.
Parses field names and dates directly from RGB image filenames.
"""
import os
import glob
import pandas as pd
from openpyxl import Workbook
from openpyxl.styles import Font, PatternFill, Alignment, Border, Side
from openpyxl.utils import get_column_letter
import re
from datetime import datetime
# Configuration
# Two `dirname` calls up from this script's absolute path — presumably the
# repository root (TODO confirm the script lives one directory below it).
BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
# Input: directory holding the generated RGB validation grid images.
RGB_DIR = os.path.join(BASE_DIR, "laravel_app", "storage", "app", "angata", "RGB")
# Output: where the Excel evaluation template workbook is written.
OUTPUT_PATH = os.path.join(BASE_DIR, "laravel_app", "storage", "app", "angata", "RGB_Evaluation_Template.xlsx")
# Evaluators — one response column per name is added to the evaluation sheet.
EVALUATORS = ["Joey", "Daniel", "Nik", "Dimitra", "Timon"]
def parse_rgb_filenames(rgb_dir=None):
    """
    Parse field IDs and harvest dates from RGB image filenames.

    Expected filename format:
        field_{field_id or name}_{YYYYMMDD}_harvest_xlsx_harvest_rgb.png

    Args:
        rgb_dir: Directory to scan for RGB images. Defaults to the
            module-level RGB_DIR when None (backward compatible).

    Returns:
        list[dict]: one dict per matching image with keys "field_id" (str),
            "harvest_date" ("YYYY-MM-DD", or the raw 8-digit string if it
            does not parse), and "filename". Sorted by field ID,
            numerically where the ID starts with digits.
    """
    if rgb_dir is None:
        rgb_dir = RGB_DIR

    fields_data = []
    # Find all RGB images
    for filepath in sorted(glob.glob(os.path.join(rgb_dir, "field_*.png"))):
        filename = os.path.basename(filepath)
        # Pattern: field_{field_id}_{YYYYMMDD}_harvest_xlsx_harvest_rgb.png
        match = re.match(r"field_(.+?)_(\d{8})_harvest_xlsx_harvest_rgb\.png", filename)
        if not match:
            continue
        field_id = match.group(1)  # e.g., "1000" or "91&92"
        date_str = match.group(2)  # e.g., "20250814"
        # Normalize the date to YYYY-MM-DD; keep the raw string if malformed.
        try:
            harvest_date = datetime.strptime(date_str, "%Y%m%d").strftime("%Y-%m-%d")
        except ValueError:
            harvest_date = date_str
        fields_data.append({
            "field_id": field_id,
            "harvest_date": harvest_date,
            "filename": filename,
        })

    # FIX: the previous key (stripping a hard-coded "&92" and string-sorting)
    # ordered "10" before "2". Sort IDs with a leading number numerically;
    # purely non-numeric IDs come after, in lexicographic order.
    def _sort_key(entry):
        m = re.match(r"(\d+)", entry["field_id"])
        if m:
            return (0, int(m.group(1)), entry["field_id"])
        return (1, 0, entry["field_id"])

    fields_data.sort(key=_sort_key)
    return fields_data
def _evaluation_styles():
    # Build the shared openpyxl style objects used on the "Evaluation" sheet.
    header_fill = PatternFill(start_color="4472C4", end_color="4472C4", fill_type="solid")
    header_font = Font(bold=True, color="FFFFFF", size=11)
    border = Border(
        left=Side(style='thin'),
        right=Side(style='thin'),
        top=Side(style='thin'),
        bottom=Side(style='thin')
    )
    center_align = Alignment(horizontal="center", vertical="center", wrap_text=True)
    left_align = Alignment(horizontal="left", vertical="center", wrap_text=True)
    return header_fill, header_font, border, center_align, left_align


def _write_evaluation_sheet(ws_eval, fields_data):
    """Fill the "Evaluation" sheet: a styled header row, one data row per
    RGB image, and one empty response column per evaluator."""
    header_fill, header_font, border, center_align, left_align = _evaluation_styles()

    # Column headers: field metadata first, then one column per evaluator.
    headers = ["Field ID", "Predicted Harvest Date"] + EVALUATORS
    for col_idx, header in enumerate(headers, start=1):
        cell = ws_eval.cell(row=1, column=col_idx, value=header)
        cell.fill = header_fill
        cell.font = header_font
        cell.alignment = center_align
        cell.border = border

    # Set column widths
    ws_eval.column_dimensions['A'].width = 15
    ws_eval.column_dimensions['B'].width = 20
    for col_idx in range(3, 3 + len(EVALUATORS)):
        ws_eval.column_dimensions[get_column_letter(col_idx)].width = 12

    # Add data rows
    for row_idx, field in enumerate(fields_data, start=2):
        ws_eval.cell(row=row_idx, column=1, value=field["field_id"])
        ws_eval.cell(row=row_idx, column=2, value=field["harvest_date"])
        # Empty, pre-styled cells for the evaluator responses.
        for col_idx in range(3, 3 + len(EVALUATORS)):
            cell = ws_eval.cell(row=row_idx, column=col_idx)
            cell.alignment = center_align
            cell.border = border
        # Light alternating row color for readability.
        if row_idx % 2 == 0:
            for col_idx in range(1, 3 + len(EVALUATORS)):
                ws_eval.cell(row=row_idx, column=col_idx).fill = PatternFill(
                    start_color="D9E8F5", end_color="D9E8F5", fill_type="solid"
                )
        # Borders on all data cells; left-align the two metadata columns.
        for col_idx in range(1, 3 + len(EVALUATORS)):
            ws_eval.cell(row=row_idx, column=col_idx).border = border
            if col_idx == 1 or col_idx == 2:
                ws_eval.cell(row=row_idx, column=col_idx).alignment = left_align

    # Keep the header row and the two metadata columns visible while scrolling.
    ws_eval.freeze_panes = "C2"


def _write_instructions_sheet(ws_instr):
    """Fill the "Instructions" sheet with the review guidelines."""
    instr_content = [
        ["RGB Evaluation Instructions", ""],
        ["", ""],
        ["Overview:", ""],
        ["The generated RGB images visualize the predicted harvest dates for each field.", ""],
        # FIX: grids are 5x3 (15 images), not 3x3 — matches the generator's output.
        ["The images are 5x3 grids showing satellite imagery from different dates", ""],
        ["centered on the predicted harvest date (the center/red-box image).", ""],
        ["", ""],
        ["What to Evaluate:", ""],
        ["For each field, determine if the predicted harvest date is CORRECT:", ""],
        ["", ""],
        ["Instructions for Reviewing:", ""],
        ["1. Look at the CENTER image (red box) - this is the predicted harvest date", ""],
        ["2. Compare to surrounding dates (before and after)", ""],
        ["3. Look for change in field color/status:", ""],
        ["   • BEFORE: Field appears GREEN (growing/healthy crop)", ""],
        ["   • AT PREDICTED DATE: Field shows BROWN/YELLOW (soil visible, ripe for harvest)", ""],
        ["   • AFTER: Field continues BROWN (post-harvest or dormant)", ""],
        ["", ""],
        ["How to Enter Your Assessment:", ""],
        ["Enter one of the following in your evaluator column for each field:", ""],
        ["   • YES = Predicted date is CORRECT (brown/harvest-ready at center date)", ""],
        ["   • NO = Predicted date is INCORRECT (not ready or already harvested)", ""],
        ["   • ? or MAYBE = Uncertain (cloudy images, unclear field status)", ""],
        ["", ""],
        ["Workflow Options:", ""],
        ["Option A (Divide Work): Assign 2-3 fields per evaluator (rows divided by column)", ""],
        ["Option B (Full Review): Each evaluator reviews all fields (everyone fills all rows)", ""],
        ["Option C (Spot Check): Each evaluator samples 5-10 random fields", ""],
        ["", ""],
        ["Image Location:", ""],
        ["All RGB images are in: /laravel_app/storage/app/angata/RGB/", ""],
        # FIX: the actual filenames embed the date as YYYYMMDD (8 digits),
        # not YYYY-MM-DD — see the parsing regex in parse_rgb_filenames.
        ["Format: field_{FIELD_ID}_{YYYYMMDD}_harvest_xlsx_harvest_rgb.png", ""],
        ["", ""],
        ["Notes:", ""],
        ["• Cloud cover may obscure ground truth - use best judgment", ""],
        ["• Fields with multiple bands or irregular shapes: focus on dominant area", ""],
        ["• Use context from previous/next dates to validate your assessment", ""],
    ]

    # Write the text; bold the title and the section headings.
    for row_idx, row_data in enumerate(instr_content, start=1):
        for col_idx, value in enumerate(row_data, start=1):
            cell = ws_instr.cell(row=row_idx, column=col_idx, value=value)
            if row_idx == 1:
                cell.font = Font(bold=True, size=14)
            elif any(keyword in str(value) for keyword in ["Overview:", "Instructions", "Workflow", "Image Location", "Notes"]):
                cell.font = Font(bold=True, size=11)
            cell.alignment = Alignment(horizontal="left", vertical="top", wrap_text=True)

    ws_instr.column_dimensions['A'].width = 50
    ws_instr.column_dimensions['B'].width = 80


def create_evaluation_template():
    """Create the Excel evaluation template.

    Builds a workbook with two sheets:
      * "Evaluation"   - one row per RGB image, one response column per
                         evaluator in EVALUATORS.
      * "Instructions" - how to review the grids and fill in the form.

    Reads filenames from RGB_DIR and writes the workbook to OUTPUT_PATH.
    Returns None; prints progress to stdout.
    """
    print("Loading field data from RGB images...")
    fields_data = parse_rgb_filenames()
    if not fields_data:
        print("ERROR: No RGB images found in", RGB_DIR)
        return
    print(f"Found {len(fields_data)} RGB images")

    wb = Workbook()
    ws_eval = wb.active
    ws_eval.title = "Evaluation"
    _write_evaluation_sheet(ws_eval, fields_data)
    _write_instructions_sheet(wb.create_sheet("Instructions"))

    # Save workbook
    wb.save(OUTPUT_PATH)
    print(f"✓ Evaluation template created: {OUTPUT_PATH}")
    # FIX: added the ✓ marker for consistency with the other summary lines.
    print(f"✓ {len(fields_data)} fields added to evaluation form")
    print(f"✓ Evaluators: {', '.join(EVALUATORS)}")
# Entry point: build the template when run as a standalone script.
if __name__ == "__main__":
    create_evaluation_template()

View file

@ -1,107 +0,0 @@
#!/usr/bin/env python
"""
Debug script to find all tiles for a date and check which overlap with field boundary.

Loads the boundary polygon for one hard-coded field (field 79) from the
project GeoJSON, scans the merged 5x5 TIFF directory for tiles captured on a
hard-coded target date, and reports per tile: its bounds, whether its bounding
box intersects the field, and whether rasterio can mask the field out of it
(with per-band non-zero pixel counts).
"""
import json
import rasterio
from rasterio.mask import mask
from pathlib import Path
import numpy as np
import shapely.geometry as shgeom
import pandas as pd

# --- Load field 79 boundary from the project GeoJSON ------------------------
geojson_path = Path("laravel_app/storage/app/angata/Data/pivot.geojson")
field_id = "79"

print(f"Loading field {field_id} from GeoJSON...")
with open(geojson_path) as f:
    geojson_data = json.load(f)

field_boundary = None
for feature in geojson_data.get('features', []):
    props = feature.get('properties', {})
    if str(props.get('field', '')) == str(field_id):
        geometry = feature.get('geometry')
        if geometry:
            geom_type = geometry.get('type', '')
            coordinates = geometry.get('coordinates', [])
            if geom_type == 'MultiPolygon':
                # NOTE: only the exterior ring of the FIRST polygon is used;
                # holes and additional polygons are ignored for this debug check.
                if coordinates and len(coordinates) > 0:
                    coords = coordinates[0][0]
                    field_boundary = shgeom.Polygon(coords)
            elif geom_type == 'Polygon':
                if coordinates and len(coordinates) > 0:
                    coords = coordinates[0]
                    field_boundary = shgeom.Polygon(coords)
        break

if field_boundary is None:
    print(f"Field {field_id} not found")
    exit(1)

print(f"Field boundary bounds: {field_boundary.bounds}")
print(f"Field boundary area: {field_boundary.area}")

# --- Find date directories matching the target date -------------------------
tiff_dir = Path("laravel_app/storage/app/angata/merged_final_tif/5x5")
target_date = pd.Timestamp("2026-01-15")  # Use a recent date that exists

date_dirs = []
for date_dir in tiff_dir.iterdir():
    if date_dir.is_dir():
        try:
            # Directory names look like YYYY-MM-DD or YYYY-MM-DD_HH;
            # the part before the first underscore is the date.
            dir_name = date_dir.name
            date_str = dir_name.split('_')[0]
            tile_date = pd.Timestamp(date_str)
            if tile_date == target_date:
                date_dirs.append(date_dir)
        except (ValueError, TypeError):
            # FIX: was a bare `except: pass`, which also swallowed
            # KeyboardInterrupt/SystemExit. Only unparsable directory
            # names should be skipped.
            pass

if not date_dirs:
    print(f"No tiles found for {target_date}")
    exit(1)

print(f"\nFound {len(date_dirs)} date directory(ies) for {target_date}")

# --- Check each tile in each matching directory -----------------------------
for date_dir in date_dirs:
    print(f"\n=== Checking date directory: {date_dir.name} ===")
    tiles = list(date_dir.glob("*.tif"))
    print(f"Found {len(tiles)} tiles in this directory")
    for tile_path in sorted(tiles):
        try:
            with rasterio.open(tile_path) as src:
                tile_bounds = src.bounds
                tile_geom = shgeom.box(*tile_bounds)
                intersects = field_boundary.intersects(tile_geom)
                intersection = field_boundary.intersection(tile_geom) if intersects else None
                intersection_area = intersection.area if intersection else 0
                print(f"\n{tile_path.name}")
                print(f" Tile bounds: {tile_bounds}")
                print(f" Intersects field: {intersects}")
                if intersects:
                    print(f" Intersection area: {intersection_area:.8f}")
                    # Try to mask this tile with the field polygon (RGB bands 1-3).
                    geom = shgeom.mapping(field_boundary)
                    try:
                        masked_data, _ = mask(src, [geom], crop=True, indexes=[1, 2, 3])
                        print(f" ✓ Successfully masked! Shape: {masked_data.shape}")
                        # Check the data in each band
                        for i, band_idx in enumerate([1, 2, 3]):
                            band_data = masked_data[i]
                            non_zero = (band_data != 0).sum()
                            print(f" Band {band_idx}: {non_zero} non-zero pixels out of {band_data.size}")
                    except ValueError as e:
                        print(f" ✗ Masking failed: {e}")
        except Exception as e:
            # Broad on purpose: a corrupt tile should not abort the debug sweep.
            print(f" Error reading tile: {e}")

View file

@ -1,102 +0,0 @@
#!/usr/bin/env python
"""
Debug script to diagnose why field boundary masking produces no data.

Loads the boundary of one hard-coded field (79), picks the first sufficiently
large TIFF tile found under the merged 5x5 directory, then checks whether the
field intersects the tile and what rasterio's mask() actually returns for it
(per-band min/max/mean, valid-pixel and NaN percentages, sample values).
"""
import json
import rasterio
from rasterio.mask import mask
from pathlib import Path
import numpy as np
import shapely.geometry as shgeom

# Load a sample field boundary
geojson_path = Path("laravel_app/storage/app/angata/Data/pivot.geojson")
field_id = "79"  # A field that had issues

print(f"Loading field {field_id} from GeoJSON...")
with open(geojson_path) as f:
    geojson_data = json.load(f)

field_boundary = None
for feature in geojson_data.get('features', []):
    props = feature.get('properties', {})
    if str(props.get('field', '')) == str(field_id):
        geometry = feature.get('geometry')
        if geometry:
            geom_type = geometry.get('type', '')
            coordinates = geometry.get('coordinates', [])
            if geom_type == 'MultiPolygon':
                # NOTE: only the exterior ring of the FIRST polygon is used.
                if coordinates and len(coordinates) > 0:
                    coords = coordinates[0][0]
                    field_boundary = shgeom.Polygon(coords)
            elif geom_type == 'Polygon':
                if coordinates and len(coordinates) > 0:
                    coords = coordinates[0]
                    field_boundary = shgeom.Polygon(coords)
        break

if field_boundary is None:
    print(f"Field {field_id} not found")
    exit(1)

print(f"Field boundary bounds: {field_boundary.bounds}")
print(f"Field boundary area: {field_boundary.area}")

# Load a sample TIFF tile
tiff_dir = Path("laravel_app/storage/app/angata/merged_final_tif/5x5")
tile_file = None
for date_dir in sorted(tiff_dir.iterdir()):
    if date_dir.is_dir():
        for tif in date_dir.glob("*.tif"):
            # Files below ~12 MB are treated as empty/cloud-masked and skipped
            # (presumably matching the pipeline's empty-tile size — confirm).
            if tif.stat().st_size > 12e6:
                tile_file = tif
                break
    if tile_file:
        break

if not tile_file:
    print("No suitable TIFF found")
    exit(1)

print(f"\nTesting with TIFF: {tile_file.name}")
with rasterio.open(tile_file) as src:
    print(f"TIFF Bounds: {src.bounds}")
    print(f"TIFF CRS: {src.crs}")
    # Check if field boundary is within tile bounds
    tile_box = shgeom.box(*src.bounds)
    intersects = field_boundary.intersects(tile_box)
    print(f"Field boundary intersects tile: {intersects}")
    if intersects:
        intersection = field_boundary.intersection(tile_box)
        print(f"Intersection area: {intersection.area}")
        print(f"Intersection bounds: {intersection.bounds}")
    # Try to mask and see what we get
    print("\nAttempting to mask...")
    geom = shgeom.mapping(field_boundary)
    try:
        # crop=True trims the output raster to the polygon's bounding box;
        # bands 1-3 are read (RGB in the merged tiles).
        masked_data, _ = mask(src, [geom], crop=True, indexes=[1, 2, 3])
        print(f"Masked data shape: {masked_data.shape}")
        print(f"Masked data dtype: {masked_data.dtype}")
        # Check the data
        for i, band_idx in enumerate([1, 2, 3]):
            band_data = masked_data[i]
            print(f"\nBand {band_idx}:")
            print(f" min: {np.nanmin(band_data):.6f}")
            print(f" max: {np.nanmax(band_data):.6f}")
            print(f" mean: {np.nanmean(band_data):.6f}")
            print(f" % valid (non-zero): {(band_data != 0).sum() / band_data.size * 100:.2f}%")
            print(f" % NaN: {np.isnan(band_data).sum() / band_data.size * 100:.2f}%")
            # Show sample values
            valid_pixels = band_data[band_data != 0]
            if len(valid_pixels) > 0:
                print(f" Sample valid values: {valid_pixels[:10]}")
    except ValueError as e:
        # rasterio raises ValueError when the geometry does not overlap the raster.
        print(f"Error during masking: {e}")

View file

@ -1,47 +0,0 @@
#!/usr/bin/env python
"""
Debug script to inspect TIFF file structure and data.

Picks the first sufficiently large tile under the merged 5x5 directory and
prints its metadata (band count, dtype, shape, CRS, bounds) plus basic
statistics for up to the first five bands.
"""
import rasterio
from pathlib import Path
import numpy as np

# Pick a tile file to inspect
tiff_dir = Path("laravel_app/storage/app/angata/merged_final_tif/5x5")

# Find first available tile
tile_file = None
for date_dir in sorted(tiff_dir.iterdir()):
    if date_dir.is_dir():
        for tif in date_dir.glob("*.tif"):
            if tif.stat().st_size > 12e6:  # Skip empty files
                tile_file = tif
                break
    if tile_file:
        break

if not tile_file:
    print("No suitable TIFF files found")
    exit(1)

print(f"Inspecting: {tile_file.name}")
print("=" * 80)
with rasterio.open(tile_file) as src:
    print(f"Band count: {src.count}")
    print(f"Data type: {src.dtypes[0]}")
    print(f"Shape: {src.height} x {src.width}")
    print(f"CRS: {src.crs}")
    print(f"Bounds: {src.bounds}")
    print()
    # Read each band (at most the first 5, fewer if the file has less)
    for band_idx in range(1, min(6, src.count + 1)):
        data = src.read(band_idx)
        print(f"Band {band_idx}:")
        print(f" dtype: {data.dtype}")
        print(f" range: {data.min():.6f} - {data.max():.6f}")
        print(f" mean: {data.mean():.6f}")
        print(f" % valid (non-zero): {(data != 0).sum() / data.size * 100:.1f}%")
        print()

View file

@ -4,7 +4,7 @@ Purpose: Download Planet satellite data for missing dates only (skip existing fi
Can be called from batch scripts or other Python scripts. Can be called from batch scripts or other Python scripts.
Usage: Usage:
python download_planet_missing_dates.py --start 2022-01-01 --end 2025-12-15 --project angata python download_planet_missing_dates.py --start 2026-01-17 --end 2026-12-20 --project angata
python download_planet_missing_dates.py --start 2023-06-01 --end 2023-06-30 --project angata --dry-run python download_planet_missing_dates.py --start 2023-06-01 --end 2023-06-30 --project angata --dry-run
Environment variables (alternative to CLI args): Environment variables (alternative to CLI args):

View file

@ -1,29 +0,0 @@
# Merge all CI RDS files into a single CSV
#
# Reads every .rds file under ci_data_dir, tags each row with the client name
# (derived from the file name), concatenates everything, and writes one CSV
# for the Python pipeline to consume.
library(tidyverse)

# Paths
ci_data_dir <- "r_app/experiments/ci_graph_exploration/CI_data"
output_csv <- "python_app/lstm_ci_data_combined.csv"

# Find all RDS files
rds_files <- list.files(ci_data_dir, pattern = "\\.rds$", full.names = TRUE)
print(paste("Found", length(rds_files), "RDS files"))

# Load one RDS file and tag its rows with the client name (file name sans extension).
load_client_rds <- function(file) {
  filename <- basename(file)
  client_name <- sub("\\.rds$", "", filename)  # Extract client name from filename
  print(paste("Loading:", filename, "- Client:", client_name))
  data <- readRDS(file)
  data$client <- client_name
  data
}

# FIX: bind all frames in a single call instead of growing the accumulator
# with bind_rows inside the loop, which copies the tibble on every iteration
# (quadratic in total rows).
combined_data <- bind_rows(lapply(rds_files, load_client_rds))

print(paste("Total rows:", nrow(combined_data)))
print(paste("Columns:", paste(names(combined_data), collapse = ", ")))

# Write to CSV
write.csv(combined_data, output_csv, row.names = FALSE)
print(paste("✓ Saved to:", output_csv))

View file

@ -88,95 +88,116 @@ def load_field_boundaries(geojson_path, field_id):
return None, None return None, None
def find_overlapping_tiles(target_date, tiff_dir, field_boundary, days_window=60): def find_overlapping_tiles(target_date, tiff_dir, field_boundary, days_window=60, exclude_dates=None, debug=False):
""" """
Find all tile files for target_date (or closest date) that overlap with field_boundary. Find tile files with actual data (not cloud-masked) for target_date or nearest date.
Tile files are organized in subdirectories by date: 5x5/YYYY-MM-DD_HH/*.tif Searches by increasing distance from target date until finding tiles with data.
Avoids reusing dates in exclude_dates to ensure temporal diversity in grids.
Args: Args:
target_date (pd.Timestamp): Target date to find tiles near target_date (pd.Timestamp): Target date to find tiles near
tiff_dir (Path): Directory containing 5x5 date subdirectories tiff_dir (Path): Directory containing 5x5 date subdirectories
field_boundary (shapely.Polygon): Field boundary for overlap detection field_boundary (shapely.Polygon): Field boundary for overlap detection
days_window (int): Max days to search before/after target days_window (int): Max days to search before/after target
exclude_dates (list): List of dates to skip (avoid repetition)
debug (bool): Enable detailed debugging output
Returns: Returns:
tuple: (list of tile paths, actual_date, days_diff) tuple: (list of tile paths, actual_date, days_diff)
list: tile paths that overlap field
pd.Timestamp: actual date of tiles found
int: days difference from target to actual date found
""" """
target_date = pd.Timestamp(target_date) target_date = pd.Timestamp(target_date)
tiff_dir = Path(tiff_dir) tiff_dir = Path(tiff_dir)
exclude_dates = exclude_dates or []
exclude_dates = [pd.Timestamp(d) for d in exclude_dates]
if not tiff_dir.exists(): if not tiff_dir.exists():
if debug:
print(f" [DEBUG] TIFF dir does not exist: {tiff_dir}")
return [], None, None return [], None, None
# Find all date subdirectories # Build map of all available dates
available_dates = {} # {date: ([tile file paths], actual_dir_name)} available_dates = {}
min_size_mb = 12.0 # Empty files are ~11.56 MB date_parse_errors = 0
for date_dir in tiff_dir.iterdir(): for date_dir in tiff_dir.iterdir():
if not date_dir.is_dir(): if not date_dir.is_dir():
continue continue
try: try:
# Parse date from directory name (YYYY-MM-DD or YYYY-MM-DD_HH)
dir_name = date_dir.name dir_name = date_dir.name
# Extract just the date part before underscore if it exists
date_str = dir_name.split('_')[0] date_str = dir_name.split('_')[0]
tile_date = pd.Timestamp(date_str) tile_date = pd.Timestamp(date_str)
days_diff = (tile_date - target_date).days
if abs(days_diff) > days_window:
continue
# Find all .tif files in this date directory
tile_files = [] tile_files = []
for tile_file in date_dir.glob('*.tif'): for tile_file in date_dir.glob('*.tif'):
# Skip obviously empty files # Include ALL tiles, regardless of size
file_size_mb = tile_file.stat().st_size / (1024 * 1024) # Some tiles may be small but still contain valid data for specific fields
if file_size_mb >= min_size_mb: tile_files.append(tile_file)
tile_files.append(tile_file)
if tile_files: if tile_files:
available_dates[tile_date] = (tile_files, dir_name) available_dates[tile_date] = (tile_files, dir_name)
except: except Exception as e:
pass date_parse_errors += 1
if debug:
print(f" [DEBUG] Failed to parse date from {date_dir.name}: {e}")
if debug:
print(f" [DEBUG] Found {len(available_dates)} dates with tile files, {date_parse_errors} parse errors")
print(f" [DEBUG] Date range: {min(available_dates.keys()).strftime('%Y-%m-%d') if available_dates else 'N/A'} to {max(available_dates.keys()).strftime('%Y-%m-%d') if available_dates else 'N/A'}")
if not available_dates: if not available_dates:
return [], None, None return [], None, None
# Find closest date # Search dates by increasing distance from target, looking for tiles with actual data
closest_date = min(available_dates.keys(), key=lambda d: abs((d - target_date).days)) sorted_dates = sorted(available_dates.keys(), key=lambda d: abs((d - target_date).days))
days_diff = (closest_date - target_date).days
tiles, _ = available_dates[closest_date]
# Filter tiles to only those that overlap field boundary for search_date in sorted_dates:
if rasterio is None or field_boundary is None: # Skip if this date was recently used (avoid temporal repetition)
# If rasterio not available, use all tiles (conservative approach) if search_date in exclude_dates:
return tiles, closest_date, days_diff continue
overlapping_tiles = [] tiles, dir_name = available_dates[search_date]
days_diff = (search_date - target_date).days
for tile_path in tiles: # Try to find overlapping tiles at this date
try: overlapping_tiles = []
with rasterio.open(tile_path) as src: tile_check_errors = 0
# Get tile bounds
tile_bounds = src.bounds # (left, bottom, right, top)
tile_geom = shgeom.box(*tile_bounds)
# Check if tile overlaps field for tile_path in tiles:
if tile_geom.intersects(field_boundary): try:
overlapping_tiles.append(tile_path) with rasterio.open(tile_path) as src:
except: tile_bounds = src.bounds
pass tile_geom = shgeom.box(*tile_bounds)
if not overlapping_tiles: # Debug first tile
# No overlapping tiles found, return all tiles for the closest date if debug and len(overlapping_tiles) == 0 and tile_check_errors == 0:
return tiles, closest_date, days_diff print(f" [DEBUG] First tile check for {tile_path.name}:")
print(f" Tile bounds: {tile_bounds}")
print(f" Tile CRS: {src.crs}")
print(f" Field bounds: {field_boundary.bounds}")
print(f" Field geom type: {field_boundary.geom_type}")
print(f" Intersects: {tile_geom.intersects(field_boundary)}")
return overlapping_tiles, closest_date, days_diff if tile_geom.intersects(field_boundary):
overlapping_tiles.append(tile_path)
except Exception as e:
tile_check_errors += 1
if debug:
print(f" [DEBUG] Error checking tile {tile_path.name}: {e}")
if debug:
print(f" [DEBUG] Date {search_date.strftime('%Y-%m-%d')}: {len(tiles)} tiles, {len(overlapping_tiles)} overlap field, {tile_check_errors} errors")
if overlapping_tiles:
# Found overlapping tiles, return them
print(f" [FIND_TILES] Target: {target_date.strftime('%Y-%m-%d')}, Using: {search_date.strftime('%Y-%m-%d')} ({days_diff:+d}d), Tiles: {[Path(t).name for t in overlapping_tiles]}")
return overlapping_tiles, search_date, days_diff
# No overlapping tiles found at all
if debug:
print(f" [DEBUG] No overlapping tiles found for {target_date.strftime('%Y-%m-%d')} within {len(sorted_dates)} searched dates")
return [], None, None
def load_and_clip_tiff_rgb(tiff_path, field_boundary, rgb_bands=(1, 2, 3)): def load_and_clip_tiff_rgb(tiff_path, field_boundary, rgb_bands=(1, 2, 3)):
@ -211,44 +232,44 @@ def load_and_clip_tiff_rgb(tiff_path, field_boundary, rgb_bands=(1, 2, 3)):
# For merged_final_tif: bands 1,2,3 are R,G,B # For merged_final_tif: bands 1,2,3 are R,G,B
bands_to_read = (1, 2, 3) bands_to_read = (1, 2, 3)
# Mask and read bands # Mask and read bands - extract ONLY the specific field polygon
geom = shgeom.mapping(field_boundary) geom = shgeom.mapping(field_boundary)
try: try:
masked_data, _ = mask(src, [geom], crop=True, indexes=list(bands_to_read)) masked_data, _ = mask(src, [geom], crop=True, indexes=list(bands_to_read))
# Stack RGB
rgb = np.stack([masked_data[i] for i in range(3)], axis=-1) rgb = np.stack([masked_data[i] for i in range(3)], axis=-1)
except (ValueError, RuntimeError) as e:
# Convert to float32 if not already # Mask failed - field doesn't overlap this tile or geometry issue
rgb = rgb.astype(np.float32) print(f" MASK ERROR on {Path(tiff_path).name}: {str(e)[:50]}")
# Normalize to 0-1 range
# Data appears to be 8-bit (0-255 range) stored as float32
# Check actual max value to determine normalization
max_val = np.nanmax(rgb)
if max_val > 0:
# If max is around 255 or less, assume 8-bit
if max_val <= 255:
rgb = rgb / 255.0
# If max is around 65535, assume 16-bit
elif max_val <= 65535:
rgb = rgb / 65535.0
# Otherwise divide by max to normalize
else:
rgb = rgb / max_val
rgb = np.clip(rgb, 0, 1)
# Check if result is all NaN
if np.all(np.isnan(rgb)):
return None
# Replace any remaining NaN with 0 (cloud/invalid pixels)
rgb = np.nan_to_num(rgb, nan=0.0)
return rgb
except ValueError:
return None return None
# Convert to float32 if not already
rgb = rgb.astype(np.float32)
# Normalize to 0-1 range
# Data appears to be 8-bit (0-255 range) stored as float32
# Check actual max value to determine normalization
max_val = np.nanmax(rgb)
if max_val > 0:
# If max is around 255 or less, assume 8-bit
if max_val <= 255:
rgb = rgb / 255.0
# If max is around 65535, assume 16-bit
elif max_val <= 65535:
rgb = rgb / 65535.0
# Otherwise divide by max to normalize
else:
rgb = rgb / max_val
rgb = np.clip(rgb, 0, 1)
# Check if result is all NaN
if np.all(np.isnan(rgb)):
return None
# Replace any remaining NaN with 0 (cloud/invalid pixels)
rgb = np.nan_to_num(rgb, nan=0.0)
return rgb
except Exception as e: except Exception as e:
return None return None
@ -284,9 +305,16 @@ def load_and_composite_tiles_rgb(tile_paths, field_boundary):
if len(rgb_arrays) == 1: if len(rgb_arrays) == 1:
composited = rgb_arrays[0] composited = rgb_arrays[0]
else: else:
# If multiple tiles, use max composite # If multiple tiles, need to handle different shapes
stacked = np.stack(rgb_arrays, axis=0) # Find common shape or use max/min approach that handles variable sizes
composited = np.max(stacked, axis=0) try:
# Try to stack if same shape
stacked = np.stack(rgb_arrays, axis=0)
composited = np.max(stacked, axis=0)
except ValueError:
# Different shapes - use the largest (most complete) tile
# This happens when tiles are masked to different field areas
composited = max(rgb_arrays, key=lambda x: x.size)
composited = composited.astype(np.float32) composited = composited.astype(np.float32)
@ -307,6 +335,26 @@ def load_and_composite_tiles_rgb(tile_paths, field_boundary):
return None return None
def has_valid_rgb_data(rgb_data, threshold=0.05):
    """
    Check whether an RGB image array contains usable data (not black/empty).

    Args:
        rgb_data: numpy array of normalized RGB values in [0, 1], or None.
        threshold: minimum maximum pixel value required to count as data.

    Returns:
        bool: True if the image's max value exceeds ``threshold`` AND there
        is more than 0.01 of variation between min and max. False for None,
        empty, all-black, flat (no variation), or all-NaN input.
    """
    if rgb_data is None:
        return False
    try:
        data_max = np.nanmax(rgb_data)
        data_min = np.nanmin(rgb_data)
        # All-NaN input yields nan here; nan comparisons evaluate False,
        # so an all-NaN image is correctly reported as having no data.
        return bool(data_max > threshold and (data_max - data_min) > 0.01)
    except (ValueError, TypeError):
        # ValueError: zero-size array; TypeError: non-numeric input.
        # Narrowed from a bare `except:` so real bugs and interrupts propagate.
        return False
def create_temporal_rgb_grid(harvest_date, field_data, field_id, tiff_dir, field_boundary, def create_temporal_rgb_grid(harvest_date, field_data, field_id, tiff_dir, field_boundary,
title, output_dir, harvest_type='registered', model_name=None, harvest_index=None): title, output_dir, harvest_type='registered', model_name=None, harvest_index=None):
""" """
@ -334,91 +382,263 @@ def create_temporal_rgb_grid(harvest_date, field_data, field_id, tiff_dir, field
""" """
harvest_date = pd.Timestamp(harvest_date) harvest_date = pd.Timestamp(harvest_date)
# Target dates: 15 images at 7-day intervals (8 pre, 1 near, 6 post) # Pre-allocate 15 image slots
target_dates = [ rgb_images = [None] * 15
harvest_date - timedelta(days=56), # T-56d days_offsets = [None] * 15
harvest_date - timedelta(days=49), # T-49d actual_dates = [None] * 15
harvest_date - timedelta(days=42), # T-42d used_dates = set() # Use set for efficient lookups
harvest_date - timedelta(days=35), # T-35d
harvest_date - timedelta(days=28), # T-28d
harvest_date - timedelta(days=21), # T-21d
harvest_date - timedelta(days=14), # T-14d
harvest_date - timedelta(days=7), # T-7d
harvest_date, # T~0d (near harvest)
harvest_date + timedelta(days=7), # T+7d
harvest_date + timedelta(days=14), # T+14d
harvest_date + timedelta(days=21), # T+21d
harvest_date + timedelta(days=28), # T+28d
harvest_date + timedelta(days=35), # T+35d
harvest_date + timedelta(days=42), # T+42d
harvest_date + timedelta(days=56), # T+56d (Note: non-standard to fill 5th col in row 3)
]
# Find TIFFs for each date # STEP 0: Debug - List all available dates
rgb_images = [] print(f" [STEP 0] Checking available TIFF dates in {tiff_dir}...")
days_offsets = [] available_dates = []
actual_dates = [] # Store actual dates of TIFFs found if tiff_dir.exists():
for date_folder in sorted(tiff_dir.iterdir()):
if date_folder.is_dir():
try:
date_obj = datetime.strptime(date_folder.name, '%Y-%m-%d').date()
available_dates.append(date_obj)
except:
pass
print(f" Found {len(available_dates)} dates with data: {available_dates[:5]}... (showing first 5)")
for target in target_dates: # STEP 1: Find anchor image (closest to predicted harvest date) FIRST
tile_paths, actual_date, days_diff = find_overlapping_tiles(target, tiff_dir, field_boundary, days_window=60) # Search within ±14 days of predicted harvest date first, then expand if needed
print(f" [STEP 1] Finding anchor (closest to harvest {harvest_date.strftime('%Y-%m-%d')}, searching ±14 days)...")
anchor_tile_paths, anchor_date, anchor_days_diff = find_overlapping_tiles(
harvest_date, tiff_dir, field_boundary, days_window=14, exclude_dates=[], debug=False
)
if not tile_paths or actual_date is None: anchor_rgb = None
rgb_images.append(None) anchor_idx = 8 # Position 8 is the center (T~0d / harvest date position)
days_offsets.append(None) failed_anchor_dates = [] # Track dates that failed validation
actual_dates.append(None)
print(f" ⚠ No tiles found within 60 days of {target.strftime('%Y-%m-%d')} with sufficient data")
continue
rgb = load_and_composite_tiles_rgb(tile_paths, field_boundary) if anchor_tile_paths and anchor_date:
rgb_images.append(rgb) anchor_rgb = load_and_composite_tiles_rgb(anchor_tile_paths, field_boundary)
days_offsets.append(days_diff) if anchor_rgb is not None and has_valid_rgb_data(anchor_rgb):
actual_dates.append(actual_date) rgb_images[anchor_idx] = anchor_rgb
days_offsets[anchor_idx] = 0 # Anchor is reference point
if rgb is not None: actual_dates[anchor_idx] = anchor_date
print(f" ✓ Loaded {len(tile_paths)} tile(s) for {actual_date.strftime('%Y-%m-%d')} ({days_diff:+d}d from target)") used_dates.add(anchor_date)
print(f" ✓ ANCHOR FOUND (±14d): {anchor_date.strftime('%Y-%m-%d')} ({anchor_days_diff:+d}d from predicted harvest)")
else: else:
print(f" ⚠ Loaded {len(tile_paths)} tile(s) but RGB data is None") failed_anchor_dates.append(anchor_date)
print(f" ⚠ Found date {anchor_date.strftime('%Y-%m-%d')} within ±14d, but image has no valid data")
print(f" [RETRY] Expanding anchor search to ±60 days (excluding failed dates)...")
anchor_tile_paths, anchor_date, anchor_days_diff = find_overlapping_tiles(
harvest_date, tiff_dir, field_boundary, days_window=60, exclude_dates=set(failed_anchor_dates), debug=False
)
if anchor_tile_paths and anchor_date:
anchor_rgb = load_and_composite_tiles_rgb(anchor_tile_paths, field_boundary)
if anchor_rgb is not None and has_valid_rgb_data(anchor_rgb):
rgb_images[anchor_idx] = anchor_rgb
days_offsets[anchor_idx] = 0 # Anchor is reference point
actual_dates[anchor_idx] = anchor_date
used_dates.add(anchor_date)
print(f" ✓ ANCHOR FOUND (±60d): {anchor_date.strftime('%Y-%m-%d')} ({anchor_days_diff:+d}d from predicted harvest)")
else:
failed_anchor_dates.append(anchor_date)
print(f" ✗ No valid anchor found even within ±60 days")
else:
print(f" ✗ No tiles found for any date within ±60 days")
else:
print(f" ⚠ No tiles found within ±14 days, expanding search...")
anchor_tile_paths, anchor_date, anchor_days_diff = find_overlapping_tiles(
harvest_date, tiff_dir, field_boundary, days_window=60, exclude_dates=[], debug=False
)
if anchor_tile_paths and anchor_date:
anchor_rgb = load_and_composite_tiles_rgb(anchor_tile_paths, field_boundary)
if anchor_rgb is not None and has_valid_rgb_data(anchor_rgb):
rgb_images[anchor_idx] = anchor_rgb
days_offsets[anchor_idx] = 0 # Anchor is reference point
actual_dates[anchor_idx] = anchor_date
used_dates.add(anchor_date)
print(f" ✓ ANCHOR FOUND (±60d): {anchor_date.strftime('%Y-%m-%d')} ({anchor_days_diff:+d}d from predicted harvest)")
else:
print(f" ✗ No valid anchor found even within ±60 days")
else:
print(f" ✗ No tiles found for any date within ±60 days")
# STEP 2: Dynamically collect images BEFORE anchor date
# Strategy: Go backwards from anchor with progressively larger search windows
# Start at 7 days, then try 10, 15, 20, 30+ days apart
print(f" [STEP 2] Collecting images BEFORE anchor (going backwards, flexible spacing)...")
before_positions = [7, 6, 5, 4, 3, 2, 1, 0] # Will fill in reverse order (7→0)
before_images = [] # (position, date, rgb, offset)
pos_idx = 0 # Index into before_positions
last_found_date = anchor_date
# Progressive search offsets: try these day offsets in order
search_offsets = [7, 10, 15, 20, 30, 40, 60, 90, 120] # Days before last found image
while pos_idx < len(before_positions) and last_found_date.year >= 2024:
found_this_iteration = False
# Try each offset until we find a valid image
for days_offset in search_offsets:
search_target_date = last_found_date - timedelta(days=days_offset)
tile_paths, actual_date, days_diff = find_overlapping_tiles(
search_target_date, tiff_dir, field_boundary, days_window=60, exclude_dates=used_dates, debug=False
)
if tile_paths and actual_date:
rgb = load_and_composite_tiles_rgb(tile_paths, field_boundary)
if rgb is not None and has_valid_rgb_data(rgb):
# Found valid image!
overall_max = np.nanmax(rgb)
overall_min = np.nanmin(rgb)
offset_from_anchor = (actual_date - anchor_date).days
before_images.append((before_positions[pos_idx], actual_date, rgb, offset_from_anchor))
used_dates.add(actual_date)
last_found_date = actual_date # Move backwards from this date
print(f" ✓ Before[{pos_idx}]: {actual_date.strftime('%Y-%m-%d')} ({offset_from_anchor:+d}d from anchor) - RGB: {overall_min:.4f}-{overall_max:.4f}")
pos_idx += 1
found_this_iteration = True
break # Found one, stop trying larger offsets
# If nothing found with any offset, we're done collecting before images
if not found_this_iteration:
break
# Store collected before images
for pos, actual_date, rgb, offset in before_images:
rgb_images[pos] = rgb
actual_dates[pos] = actual_date
days_offsets[pos] = offset
# STEP 3: Dynamically collect images AFTER anchor date
# Strategy: Go forwards from anchor with progressively larger search windows
# Start at 7 days, then try 10, 15, 20, 30+ days apart
print(f" [STEP 3] Collecting images AFTER anchor (going forwards, flexible spacing)...")
after_positions = [9, 10, 11, 12, 13, 14] # Will fill in order (9→14)
after_images = [] # (position, date, rgb, offset)
pos_idx = 0 # Index into after_positions
last_found_date = anchor_date
max_search_date = anchor_date + timedelta(days=200) # Don't search beyond 200 days forward
# Progressive search offsets: try these day offsets in order
search_offsets = [7, 10, 15, 20, 30, 40, 60, 90, 120] # Days after last found image
while pos_idx < len(after_positions) and last_found_date < max_search_date:
found_this_iteration = False
# Try each offset until we find a valid image
for days_offset in search_offsets:
search_target_date = last_found_date + timedelta(days=days_offset)
# Don't search beyond max date
if search_target_date > max_search_date:
break
tile_paths, actual_date, days_diff = find_overlapping_tiles(
search_target_date, tiff_dir, field_boundary, days_window=60, exclude_dates=used_dates, debug=False
)
if tile_paths and actual_date:
rgb = load_and_composite_tiles_rgb(tile_paths, field_boundary)
if rgb is not None and has_valid_rgb_data(rgb):
# Found valid image!
overall_max = np.nanmax(rgb)
overall_min = np.nanmin(rgb)
offset_from_anchor = (actual_date - anchor_date).days
after_images.append((after_positions[pos_idx], actual_date, rgb, offset_from_anchor))
used_dates.add(actual_date)
last_found_date = actual_date # Move forwards from this date
print(f" ✓ After[{pos_idx}]: {actual_date.strftime('%Y-%m-%d')} ({offset_from_anchor:+d}d from anchor) - RGB: {overall_min:.4f}-{overall_max:.4f}")
pos_idx += 1
found_this_iteration = True
break # Found one, stop trying larger offsets
# If nothing found with any offset, we're done collecting after images
if not found_this_iteration:
break
# Store collected after images
for pos, actual_date, rgb, offset in after_images:
rgb_images[pos] = rgb
actual_dates[pos] = actual_date
days_offsets[pos] = offset
# Create 5x3 grid plot (15 images) # Create 5x3 grid plot (15 images)
fig, axes = plt.subplots(3, 5, figsize=(25, 15)) fig, axes = plt.subplots(3, 5, figsize=(25, 15))
fig.suptitle(f'{title}\nField {field_id} - {harvest_type.upper()} Harvest: {harvest_date.strftime("%Y-%m-%d")}',
# Build title with anchor offset information
anchor_offset_from_harvest = (actual_dates[8] - harvest_date).days if actual_dates[8] is not None else None
if anchor_offset_from_harvest is not None and anchor_offset_from_harvest != 0:
anchor_info = f"(Anchor: {actual_dates[8].strftime('%Y-%m-%d')}, {anchor_offset_from_harvest:+d}d from predicted harvest)"
else:
anchor_info = f"(Exact match with anchor: {actual_dates[8].strftime('%Y-%m-%d')})" if actual_dates[8] is not None else ""
fig.suptitle(f'{title}\nField {field_id} - {harvest_type.upper()} Harvest: {harvest_date.strftime("%Y-%m-%d")} {anchor_info}',
fontsize=16, fontweight='bold') fontsize=16, fontweight='bold')
# Grid positions (5 columns, 3 rows = 15 images) # Grid positions (5 columns, 3 rows = 15 images)
positions = [ positions = [
('T-56d', 0, 0), ('T-49d', 0, 1), ('T-42d', 0, 2), ('T-35d', 0, 3), ('T-28d', 0, 4), ('T-56d', 0, 0), ('T-49d', 0, 1), ('T-42d', 0, 2), ('T-35d', 0, 3), ('T-28d', 0, 4),
('T-21d', 1, 0), ('T-14d', 1, 1), ('T-7d', 1, 2), ('T~0d', 1, 3), ('T+7d', 1, 4), ('T-21d', 1, 0), ('T-14d', 1, 1), ('T-7d', 1, 2), ('HARVEST', 1, 3), ('T+7d', 1, 4),
('T+14d', 2, 0), ('T+21d', 2, 1), ('T+28d', 2, 2), ('T+35d', 2, 3), ('T+42d', 2, 4), ('T+14d', 2, 0), ('T+21d', 2, 1), ('T+28d', 2, 2), ('T+35d', 2, 3), ('T+42d', 2, 4),
] ]
for idx, (label, row, col) in enumerate(positions): # All 15 images for idx, (label, row, col) in enumerate(positions):
ax = axes[row, col] ax = axes[row, col]
if idx < len(rgb_images) and rgb_images[idx] is not None: if idx < len(rgb_images) and rgb_images[idx] is not None:
rgb_data = rgb_images[idx] rgb_data = rgb_images[idx]
# Debug: check data range # Debug: check data range for ALL bands
data_min, data_max = np.nanmin(rgb_data), np.nanmax(rgb_data) data_min = np.nanmin(rgb_data)
print(f" DEBUG: {label} RGB range: {data_min:.4f} - {data_max:.4f}, shape: {rgb_data.shape}") data_max = np.nanmax(rgb_data)
data_mean = np.nanmean(rgb_data)
data_std = np.nanstd(rgb_data)
# Check per-band stats
r_min, r_max, r_mean = np.nanmin(rgb_data[:,:,0]), np.nanmax(rgb_data[:,:,0]), np.nanmean(rgb_data[:,:,0])
g_min, g_max, g_mean = np.nanmin(rgb_data[:,:,1]), np.nanmax(rgb_data[:,:,1]), np.nanmean(rgb_data[:,:,1])
b_min, b_max, b_mean = np.nanmin(rgb_data[:,:,2]), np.nanmax(rgb_data[:,:,2]), np.nanmean(rgb_data[:,:,2])
print(f" DEBUG VALID {label} ({actual_dates[idx].strftime('%Y-%m-%d')}): RGB overall {data_min:.4f}-{data_max:.4f} (mean={data_mean:.4f}, std={data_std:.4f})")
print(f" R: {r_min:.4f}-{r_max:.4f} (μ={r_mean:.4f}), G: {g_min:.4f}-{g_max:.4f} (μ={g_mean:.4f}), B: {b_min:.4f}-{b_max:.4f} (μ={b_mean:.4f})")
# Display with explicit vmin/vmax to handle normalized 0-1 data # Display with explicit vmin/vmax to handle normalized 0-1 data
ax.imshow(rgb_data, vmin=0, vmax=1) ax.imshow(rgb_data, vmin=0, vmax=1)
# Build title: label + offset + actual date # Build title: show BOTH anchor offset AND harvest offset
offset_str = f"{days_offsets[idx]:+d}d" if days_offsets[idx] is not None else "?" if days_offsets[idx] is not None:
date_str = actual_dates[idx].strftime('%Y-%m-%d') if actual_dates[idx] is not None else "No Date" offset_from_anchor = days_offsets[idx]
ax.set_title(f'{label}\n{offset_str}\n{date_str}', fontsize=10, fontweight='bold') offset_from_harvest = (actual_dates[idx] - harvest_date).days
# Add red box around harvest date (T~0d at row=1, col=3) if idx == 8: # ANCHOR/HARVEST position
if label == 'T~0d': if offset_from_harvest == 0:
offset_str = f"HARVEST\n(Image: {actual_dates[idx].strftime('%Y-%m-%d')})"
else:
offset_str = f"HARVEST\n(Image: {actual_dates[idx].strftime('%Y-%m-%d')}, {offset_from_harvest:+d}d from predicted)"
else:
# Show both offsets: from anchor and from harvest
offset_str = f"{offset_from_anchor:+d}d from anchor\n{offset_from_harvest:+d}d from harvest\n{actual_dates[idx].strftime('%Y-%m-%d')}"
else:
offset_str = "No Data"
ax.set_title(offset_str, fontsize=9, fontweight='bold')
# Add red box around the ANCHOR IMAGE (position 8 is harvest/anchor)
if idx == 8: # Position 8 is the anchor
for spine in ax.spines.values(): for spine in ax.spines.values():
spine.set_edgecolor('red') spine.set_edgecolor('red')
spine.set_linewidth(4) spine.set_linewidth(4)
else: else:
ax.text(0.5, 0.5, 'No Data', ha='center', va='center', fontsize=12, color='gray') ax.text(0.5, 0.5, 'No Data', ha='center', va='center', fontsize=12, color='gray')
ax.set_title(label, fontsize=10) ax.set_title('No Data', fontsize=10)
print(f" DEBUG EMPTY {label}: No image data collected")
# Add red box for T~0d even if no data # Add red box for anchor position even if no data
if label == 'T~0d': if idx == 8: # Position 8 is the anchor
for spine in ax.spines.values(): for spine in ax.spines.values():
spine.set_edgecolor('red') spine.set_edgecolor('red')
spine.set_linewidth(4) spine.set_linewidth(4)

View file

@ -6,6 +6,8 @@
#' 2. Create master 5×5 grid covering all TIFFs #' 2. Create master 5×5 grid covering all TIFFs
#' 3. Split each daily TIFF into 25 tiles using the master grid #' 3. Split each daily TIFF into 25 tiles using the master grid
#' 4. Save tiles in date-specific folders: daily_tiles/[DATE]/[DATE]_[TILE_ID].tif #' 4. Save tiles in date-specific folders: daily_tiles/[DATE]/[DATE]_[TILE_ID].tif
#' & "C:\Program Files\R\R-4.4.3\bin\x64\Rscript.exe" r_app/10_create_master_grid_and_split_tiffs.R 2026-01-13 2026-01-18
library(terra) library(terra)
library(sf) library(sf)
@ -32,7 +34,7 @@ if (length(args) >= 2) {
} }
PROJECT <- "angata" PROJECT <- "angata"
TIFF_FOLDER <- file.path("..", "laravel_app", "storage", "app", PROJECT, "merged_tif_8b") TIFF_FOLDER <- file.path("laravel_app", "storage", "app", PROJECT, "merged_tif_8b")
# GRID SIZE CONFIGURATION - Change this to use different grid sizes # GRID SIZE CONFIGURATION - Change this to use different grid sizes
# Options: 5x5 (25 tiles), 10x10 (100 tiles), etc. # Options: 5x5 (25 tiles), 10x10 (100 tiles), etc.
@ -42,10 +44,10 @@ GRID_NCOLS <- 5
# Construct grid-specific subfolder path # Construct grid-specific subfolder path
GRID_SIZE_LABEL <- paste0(GRID_NCOLS, "x", GRID_NROWS) GRID_SIZE_LABEL <- paste0(GRID_NCOLS, "x", GRID_NROWS)
OUTPUT_FOLDER <- file.path("..", "laravel_app", "storage", "app", PROJECT, "daily_tiles_split", GRID_SIZE_LABEL) OUTPUT_FOLDER <- file.path("laravel_app", "storage", "app", PROJECT, "daily_tiles_split", GRID_SIZE_LABEL)
# Load field boundaries for overlap checking # Load field boundaries for overlap checking
GEOJSON_PATH <- file.path("..", "laravel_app", "storage", "app", PROJECT, "Data", "pivot.geojson") GEOJSON_PATH <- file.path("laravel_app", "storage", "app", PROJECT, "Data", "pivot.geojson")
cat("Combined: Create Master Grid (", GRID_SIZE_LABEL, ") and Split TIFFs into Tiles\n", sep = "") cat("Combined: Create Master Grid (", GRID_SIZE_LABEL, ") and Split TIFFs into Tiles\n", sep = "")
cat("Grid subfolder: daily_tiles_split/", GRID_SIZE_LABEL, "/\n", sep = "") cat("Grid subfolder: daily_tiles_split/", GRID_SIZE_LABEL, "/\n", sep = "")

View file

@ -13,13 +13,13 @@
# #
# Examples: # Examples:
# # Angata 8-band data (with UDM cloud masking) # # Angata 8-band data (with UDM cloud masking)
# & 'C:\Program Files\R\R-4.4.3\bin\x64\Rscript' r_app/02_ci_extraction.R 2026-01-02 7 angata merged_tif_8b # & 'C:\Program Files\R\R-4.4.3\bin\x64\Rscript' r_app/20_ci_extraction.R 2026-01-02 7 angata merged_tif_8b
# #
# # Aura 4-band data # # Aura 4-band data
# Rscript 02_ci_extraction.R 2025-11-26 7 aura merged_tif # Rscript 20_ci_extraction.R 2025-11-26 7 aura merged_tif
# #
# # Auto-detects and uses tiles if available: # # Auto-detects and uses tiles if available:
# Rscript 02_ci_extraction.R 2026-01-02 7 angata (uses tiles if daily_tiles_split/ exists) # Rscript 20_ci_extraction.R 2026-01-02 7 angata (uses tiles if daily_tiles_split/ exists)
# 1. Load required packages # 1. Load required packages
# ----------------------- # -----------------------

View file

@ -8,7 +8,7 @@
# #
# Usage: Rscript interpolate_growth_model.R [project_dir] # Usage: Rscript interpolate_growth_model.R [project_dir]
# - project_dir: Project directory name (e.g., "chemba") # - project_dir: Project directory name (e.g., "chemba")
# & 'C:\Program Files\R\R-4.4.3\bin\x64\Rscript' r_app/03_interpolate_growth_model.R angata # & 'C:\Program Files\R\R-4.4.3\bin\x64\Rscript' r_app/30_interpolate_growth_model.R angata
# 1. Load required packages # 1. Load required packages
# ----------------------- # -----------------------

View file

@ -186,8 +186,15 @@ main <- function() {
end_date <- if (length(args) >= 1 && !is.na(args[1])) { end_date <- if (length(args) >= 1 && !is.na(args[1])) {
as.Date(args[1]) as.Date(args[1])
} else if (exists("end_date", envir = .GlobalEnv)) { } else if (exists("end_date", envir = .GlobalEnv)) {
# For recursive calls, use the end_date that was set in the global environment global_date <- get("end_date", envir = .GlobalEnv)
get("end_date", envir = .GlobalEnv) # Check if it's a valid Date with length > 0
if (is.Date(global_date) && length(global_date) > 0 && !is.na(global_date)) {
global_date
} else if (exists("end_date_str", envir = .GlobalEnv)) {
as.Date(get("end_date_str", envir = .GlobalEnv))
} else {
Sys.Date()
}
} else if (exists("end_date_str", envir = .GlobalEnv)) { } else if (exists("end_date_str", envir = .GlobalEnv)) {
as.Date(get("end_date_str", envir = .GlobalEnv)) as.Date(get("end_date_str", envir = .GlobalEnv))
} else { } else {
@ -210,10 +217,15 @@ main <- function() {
7 7
} }
# Validate end_date is a proper Date object
if (is.null(end_date) || length(end_date) == 0 || !inherits(end_date, "Date")) {
stop("ERROR: end_date is not valid. Got: ", class(end_date), " with length ", length(end_date))
}
assign("project_dir", project_dir, envir = .GlobalEnv) assign("project_dir", project_dir, envir = .GlobalEnv)
assign("end_date_str", format(end_date, "%Y-%m-%d"), envir = .GlobalEnv) assign("end_date_str", format(end_date, "%Y-%m-%d"), envir = .GlobalEnv)
message("\n" %+% strrep("=", 70)) message("\n", strrep("=", 70))
message("80_CALCULATE_KPIs.R - CONSOLIDATED KPI CALCULATION") message("80_CALCULATE_KPIs.R - CONSOLIDATED KPI CALCULATION")
message(strrep("=", 70)) message(strrep("=", 70))
message("Date:", format(end_date, "%Y-%m-%d")) message("Date:", format(end_date, "%Y-%m-%d"))
@ -238,7 +250,7 @@ main <- function() {
# ========== PER-FIELD ANALYSIS (SC-64) ========== # ========== PER-FIELD ANALYSIS (SC-64) ==========
message("\n" %+% strrep("-", 70)) message("\n", strrep("-", 70))
message("PHASE 1: PER-FIELD WEEKLY ANALYSIS (SC-64 ENHANCEMENTS)") message("PHASE 1: PER-FIELD WEEKLY ANALYSIS (SC-64 ENHANCEMENTS)")
message(strrep("-", 70)) message(strrep("-", 70))
@ -694,9 +706,9 @@ main <- function() {
# ========== FINAL SUMMARY ========== # ========== FINAL SUMMARY ==========
cat("\n" %+% strrep("=", 70) %+% "\n") cat("\n", strrep("=", 70), "\n")
cat("80_CALCULATE_KPIs.R - COMPLETION SUMMARY\n") cat("80_CALCULATE_KPIs.R - COMPLETION SUMMARY\n")
cat(strrep("=", 70) %+% "\n") cat(strrep("=", 70), "\n")
cat("Per-field analysis fields analyzed:", nrow(field_analysis_df), "\n") cat("Per-field analysis fields analyzed:", nrow(field_analysis_df), "\n")
cat("Excel export:", export_paths$excel, "\n") cat("Excel export:", export_paths$excel, "\n")
cat("RDS export:", export_paths$rds, "\n") cat("RDS export:", export_paths$rds, "\n")

View file

@ -1,31 +1,40 @@
# ============================================================================== # ==============================================================================
# FULL PIPELINE RUNNER # FULL PIPELINE RUNNER
# ============================================================================== # ==============================================================================
# Runs scripts 02, 03, 04, 09 (KPIs), 09 (Weekly), and 10 (CI Report Simple) # Mixed Python/R pipeline:
# 1. Python: Download Planet images
# 2. R 10: Create master grid and split TIFFs
# 3. R 20: CI Extraction
# 4. R 21: Convert CI RDS to CSV
# 5. R 30: Interpolate growth model
# 6. Python 31: Harvest imminent weekly
# 7. R 40: Mosaic creation
# 8. R 80: Calculate KPIs
# #
# ============================================================================== # ==============================================================================
# HOW TO RUN THIS SCRIPT # HOW TO RUN THIS SCRIPT
# ============================================================================== # ==============================================================================
# #
# In PowerShell or Command Prompt: # Run from the smartcane/ directory:
# #
# Option 1 (Recommended - shows real-time output): # Option 1 (Recommended - shows real-time output):
# Rscript run_full_pipeline.R # Rscript r_app/run_full_pipeline.R
# #
# Option 2 (Full path to Rscript - use & in PowerShell for paths with spaces): # Option 2 (Full path to Rscript - use & in PowerShell for paths with spaces):
# & "C:\Program Files\R\R-4.4.3\bin\x64\Rscript.exe" run_full_pipeline.R # & "C:\Program Files\R\R-4.4.3\bin\x64\Rscript.exe" r_app/run_full_pipeline.R
# #
# Option 3 (Batch mode - output saved to .Rout file): # Option 3 (Batch mode - output saved to .Rout file):
# R CMD BATCH --vanilla run_full_pipeline.R # R CMD BATCH --vanilla r_app/run_full_pipeline.R
# #
# ============================================================================== # ==============================================================================
# ==============================================================================
# *** EDIT THESE VARIABLES ***
# Run configuration: end_date is the last day of the processing window and
# offset days before it define the window start.
end_date <- as.Date("2026-01-27")  # or specify: as.Date("2026-01-27"), Sys.Date()
offset <- 7                        # days to look back
project_dir <- "angata"            # project name: "esa", "aura", "angata", "chemba"
# Angata uses the 8-band merged TIFFs; every other project uses the standard set.
data_source <- if (project_dir == "angata") "merged_tif_8b" else "merged_tif"
force_rerun <- FALSE               # Set to TRUE to force all scripts to run even if outputs exist
# ***************************

# Format dates (end_date is already a Date, so no extra as.Date() round-trip needed)
end_date_str <- format(end_date, "%Y-%m-%d")

# Flipped to FALSE by any stage's error handler; later stages check it before running.
pipeline_success <- TRUE
# ==============================================================================
# INTELLIGENT CHECKING: What has already been completed?
# ==============================================================================
cat("\n========== CHECKING EXISTING OUTPUTS ==========\n")

# List a directory's matching entries, or an empty vector when the directory
# does not exist yet (first pipeline run).
.existing_outputs <- function(path, pattern = NULL, subdirs = FALSE) {
  if (!dir.exists(path)) {
    return(c())
  }
  if (subdirs) {
    list.dirs(path, full.names = FALSE, recursive = FALSE)
  } else {
    list.files(path, pattern = pattern)
  }
}

storage_root <- file.path("laravel_app", "storage", "app", project_dir)

# Script 10 outputs: one sub-directory per tiled date
tiles_dir <- file.path(storage_root, "daily_tiles_split", "5x5")
tiles_dates <- .existing_outputs(tiles_dir, subdirs = TRUE)
cat(sprintf("Script 10: %d dates already tiled\n", length(tiles_dates)))

# Script 20 outputs: daily CI extraction RDS files
ci_daily_dir <- file.path(storage_root, "Data", "extracted_ci", "daily_vals")
ci_files <- .existing_outputs(ci_daily_dir, pattern = "\\.rds$")
cat(sprintf("Script 20: %d CI daily RDS files exist\n", length(ci_files)))

# Script 21's CSV is overwritten on every run, so its presence is not a useful
# skip indicator; it simply follows Script 20.
cat("Script 21: CSV file exists but gets overwritten - will run if Script 20 runs\n")

# Script 40 outputs: weekly max mosaics
mosaic_dir <- file.path(storage_root, "weekly_tile_max", "5x5")
mosaic_files <- .existing_outputs(mosaic_dir, pattern = "\\.tif$")
cat(sprintf("Script 40: %d mosaic files exist\n", length(mosaic_files)))

# Script 80 outputs: per-field KPI stats
kpi_dir <- file.path(storage_root, "reports", "kpis", "field_stats")
kpi_files <- .existing_outputs(kpi_dir, pattern = "\\.csv$|\\.json$")
cat(sprintf("Script 80: %d KPI files exist\n", length(kpi_files)))

# Decide which stages can be skipped (force_rerun overrides everything)
skip_10 <- length(tiles_dates) > 0 && !force_rerun
skip_20 <- length(ci_files) > 0 && !force_rerun
skip_21 <- length(ci_files) > 0 && !force_rerun  # Skip 21 if 20 is skipped
skip_40 <- length(mosaic_files) > 0 && !force_rerun
skip_80 <- FALSE  # Always run Script 80 - it calculates KPIs for the current week (end_date), not historical weeks

cat("\nSkipping decisions:\n")
cat(sprintf("  Script 10: %s\n", if (skip_10) "SKIP (tiles exist)" else "RUN"))
cat(sprintf("  Script 20: %s\n", if (skip_20) "SKIP (CI exists)" else "RUN"))
cat(sprintf("  Script 21: %s\n", if (skip_21) "SKIP (CI exists)" else "RUN"))
cat(sprintf("  Script 40: %s\n", if (skip_40) "SKIP (mosaics exist)" else "RUN"))
cat(sprintf("  Script 80: %s\n", if (skip_80) "SKIP (KPIs exist)" else "RUN"))
# ==============================================================================
# PYTHON: DOWNLOAD PLANET IMAGES (MISSING DATES ONLY)
# ==============================================================================
cat("\n========== DOWNLOADING PLANET IMAGES (MISSING DATES ONLY) ==========\n")
# Remember the working directory up front so `finally` can always restore it,
# even if an error fires while we are still inside python_app/.
original_dir <- getwd()
tryCatch({
  # Setup paths
  base_path <- file.path("laravel_app", "storage", "app", project_dir)
  merged_tifs_dir <- file.path(base_path, data_source)

  # Dates already present as raw merged TIFFs.
  # NOTE(review): currently informational only - tiling (below) is treated as
  # the authoritative completion indicator.
  existing_tiff_files <- list.files(merged_tifs_dir, pattern = "^\\d{4}-\\d{2}-\\d{2}\\.tif$")
  existing_tiff_dates <- sub("\\.tif$", "", existing_tiff_files)

  # Dates already tiled by Script 10 (better indicator of completion)
  existing_tile_dates <- tiles_dates

  # Build the target date window [end_date - offset, end_date]
  start_date <- end_date - offset
  date_seq <- seq(start_date, end_date, by = "day")
  target_dates <- format(date_seq, "%Y-%m-%d")

  # Only download if tiles don't exist yet (more reliable than checking raw TIFFs)
  missing_dates <- target_dates[!(target_dates %in% existing_tile_dates)]

  cat(sprintf("  Existing tiled dates: %d\n", length(existing_tile_dates)))
  cat(sprintf("  Missing dates in window: %d\n", length(missing_dates)))

  # Download each missing date via the Python downloader
  download_count <- 0
  download_failed <- 0
  if (length(missing_dates) > 0) {
    # Change to python_app/ so the downloader's relative paths resolve
    setwd("python_app")
    for (date_str in missing_dates) {
      cmd <- sprintf('python 00_download_8band_pu_optimized.py "%s" --date "%s" --resolution 3 --cleanup', project_dir, date_str)
      result <- system(cmd, ignore.stdout = FALSE, ignore.stderr = FALSE)
      if (result == 0) {
        download_count <- download_count + 1
      } else {
        download_failed <- download_failed + 1
      }
    }
    setwd(original_dir)
  }

  cat(sprintf("✓ Downloaded %d dates, %d failed\n", download_count, download_failed))
  if (download_failed > 0) {
    cat("⚠ Some downloads failed, but continuing pipeline\n")
  }

  # Force Script 10 to run ONLY if downloads actually succeeded (not just attempted)
  if (download_count > 0) {
    skip_10 <- FALSE
  }
}, error = function(e) {
  cat("✗ Error in planet download:", e$message, "\n")
  pipeline_success <<- FALSE
}, finally = {
  # FIX: guarantee the working directory is restored even when the download
  # stage aborted while inside python_app/ (original code left cwd changed on error).
  setwd(original_dir)
})
# ==============================================================================
# SCRIPT 10: CREATE MASTER GRID AND SPLIT TIFFs
# ==============================================================================
if (pipeline_success && !skip_10) {
  cat("\n========== RUNNING SCRIPT 10: CREATE MASTER GRID AND SPLIT TIFFs ==========\n")
  tryCatch({
    # Set environment variables for the script (Script 10 uses these for filtering)
    assign("PROJECT", project_dir, envir = .GlobalEnv)
    # Suppress verbose per-date output, show only summary
    sink(nullfile())
    source("r_app/10_create_master_grid_and_split_tiffs.R")
    sink()
    # Verify output
    tiles_dir <- file.path("laravel_app", "storage", "app", project_dir, "daily_tiles_split", "5x5")
    if (dir.exists(tiles_dir)) {
      subdirs <- list.dirs(tiles_dir, full.names = FALSE, recursive = FALSE)
      cat(sprintf("✓ Script 10 completed - created tiles for %d dates\n", length(subdirs)))
    } else {
      cat("✓ Script 10 completed\n")
    }
  }, error = function(e) {
    # FIX: only pop sinks that are actually open. A bare sink() here fails when
    # the error happened after the diversion was already closed (verification
    # phase), and would otherwise mask the real error message.
    while (sink.number() > 0) sink()
    cat("✗ Error in Script 10:", e$message, "\n")
    pipeline_success <<- FALSE
  })
} else if (skip_10) {
  cat("\n========== SKIPPING SCRIPT 10 (tiles already exist) ==========\n")
}
# ==============================================================================
# SCRIPT 20: CI EXTRACTION
# ==============================================================================
if (pipeline_success && !skip_20) {
  cat("\n========== RUNNING SCRIPT 20: CI EXTRACTION ==========\n")
  tryCatch({
    # Export pipeline settings for the sourced script's main()
    assign("end_date", end_date, envir = .GlobalEnv)
    assign("offset", offset, envir = .GlobalEnv)
    assign("project_dir", project_dir, envir = .GlobalEnv)
    assign("data_source", data_source, envir = .GlobalEnv)
    source("r_app/20_ci_extraction.R")
    main()  # execute with the exported settings
    # Confirm the daily CI RDS files were written
    ci_daily_dir <- file.path("laravel_app", "storage", "app", project_dir, "Data", "extracted_ci", "daily_vals")
    if (dir.exists(ci_daily_dir)) {
      rds_out <- list.files(ci_daily_dir, pattern = "\\.rds$")
      cat(sprintf("✓ Script 20 completed - generated %d CI files\n", length(rds_out)))
    } else {
      cat("✓ Script 20 completed\n")
    }
  }, error = function(e) {
    cat("✗ Error in Script 20:", e$message, "\n")
    pipeline_success <<- FALSE
  })
} else if (skip_20) {
  cat("\n========== SKIPPING SCRIPT 20 (CI already extracted) ==========\n")
}
# ==============================================================================
# SCRIPT 21: CONVERT CI RDS TO CSV
# ==============================================================================
if (pipeline_success && !skip_21) {
  cat("\n========== RUNNING SCRIPT 21: CONVERT CI RDS TO CSV ==========\n")
  tryCatch({
    # Export pipeline settings for the sourced script's main()
    assign("end_date", end_date, envir = .GlobalEnv)
    assign("offset", offset, envir = .GlobalEnv)
    assign("project_dir", project_dir, envir = .GlobalEnv)
    source("r_app/21_convert_ci_rds_to_csv.R")
    main()  # execute with the exported settings
    # Confirm the CSV output directory was populated
    ci_csv_path <- file.path("laravel_app", "storage", "app", project_dir, "ci_extracted")
    if (dir.exists(ci_csv_path)) {
      csv_out <- list.files(ci_csv_path, pattern = "\\.csv$")
      cat(sprintf("✓ Script 21 completed - converted to %d CSV files\n", length(csv_out)))
    } else {
      cat("✓ Script 21 completed\n")
    }
  }, error = function(e) {
    cat("✗ Error in Script 21:", e$message, "\n")
    pipeline_success <<- FALSE
  })
} else if (skip_21) {
  cat("\n========== SKIPPING SCRIPT 21 (CSV already created) ==========\n")
}
# ==============================================================================
# SCRIPT 30: INTERPOLATE GROWTH MODEL
# ==============================================================================
if (pipeline_success) {
  cat("\n========== RUNNING SCRIPT 30: INTERPOLATE GROWTH MODEL ==========\n")
  tryCatch({
    # Export pipeline settings for the sourced script's main()
    assign("end_date", end_date, envir = .GlobalEnv)
    assign("offset", offset, envir = .GlobalEnv)
    assign("project_dir", project_dir, envir = .GlobalEnv)
    assign("data_source", data_source, envir = .GlobalEnv)
    source("r_app/30_interpolate_growth_model.R")
    main()  # execute with the exported settings
    # Confirm interpolated growth-model files exist
    growth_dir <- file.path("laravel_app", "storage", "app", project_dir, "growth_model_interpolated")
    if (dir.exists(growth_dir)) {
      growth_out <- list.files(growth_dir, pattern = "\\.rds$|\\.csv$")
      cat(sprintf("✓ Script 30 completed - generated %d growth model files\n", length(growth_out)))
    } else {
      cat("✓ Script 30 completed\n")
    }
  }, error = function(e) {
    cat("✗ Error in Script 30:", e$message, "\n")
    pipeline_success <<- FALSE
  })
}
# ==============================================================================
# PYTHON 31: HARVEST IMMINENT WEEKLY
# ==============================================================================
if (pipeline_success) {
  cat("\n========== RUNNING PYTHON 31: HARVEST IMMINENT WEEKLY ==========\n")
  tryCatch({
    # Run Python script in pytorch_gpu conda environment
    # Script expects positional project name (not --project flag)
    # Run from smartcane root so conda can find the environment
    cmd <- sprintf('conda run -n pytorch_gpu python python_app/31_harvest_imminent_weekly.py %s', project_dir)
    cat("DEBUG: Running command:", cmd, "\n")
    result <- system(cmd)
    if (result == 0) {
      # Verify harvest output - check for THIS WEEK's specific file
      # NOTE(review): %V is the ISO-8601 week number, which pairs with the ISO
      # week-year %G, not %Y - around New Year these can disagree. Confirm
      # against the Python script's week/year naming before changing.
      current_week <- as.numeric(format(end_date, "%V"))
      current_year <- as.numeric(format(end_date, "%Y"))
      expected_file <- file.path("laravel_app", "storage", "app", project_dir, "reports", "kpis", "field_stats",
                                 sprintf("%s_harvest_imminent_week_%02d_%d.csv", project_dir, current_week, current_year))
      if (file.exists(expected_file)) {
        cat(sprintf("✓ Script 31 completed - generated harvest imminent file for week %02d\n", current_week))
      } else {
        cat("✓ Script 31 completed (check if harvest.xlsx is available)\n")
      }
    } else {
      cat("⚠ Script 31 completed with errors (check harvest.xlsx availability)\n")
    }
  }, error = function(e) {
    # FIX: removed stray setwd(original_dir) - this stage never changes the
    # working directory, and original_dir may be undefined here, which would
    # raise a second error inside the handler and hide the real one.
    cat("⚠ Script 31 error:", e$message, "\n")
  })
}
# ==============================================================================
# SCRIPT 40: MOSAIC CREATION
# ==============================================================================
if (pipeline_success && !skip_40) {
  cat("\n========== RUNNING SCRIPT 40: MOSAIC CREATION ==========\n")
  tryCatch({
    # Export pipeline settings for the sourced script's main()
    assign("end_date", end_date, envir = .GlobalEnv)
    assign("offset", offset, envir = .GlobalEnv)
    assign("project_dir", project_dir, envir = .GlobalEnv)
    assign("data_source", data_source, envir = .GlobalEnv)
    source("r_app/40_mosaic_creation.R")
    main()  # execute with the exported settings
    # Confirm the weekly mosaics were written
    mosaic_dir <- file.path("laravel_app", "storage", "app", project_dir, "weekly_tile_max", "5x5")
    if (dir.exists(mosaic_dir)) {
      mosaic_out <- list.files(mosaic_dir, pattern = "\\.tif$")
      cat(sprintf("✓ Script 40 completed - generated %d mosaic files\n", length(mosaic_out)))
    } else {
      cat("✓ Script 40 completed\n")
    }
  }, error = function(e) {
    cat("✗ Error in Script 40:", e$message, "\n")
    pipeline_success <<- FALSE
  })
} else if (skip_40) {
  cat("\n========== SKIPPING SCRIPT 40 (mosaics already created) ==========\n")
}
# ==============================================================================
# SCRIPT 80: CALCULATE KPIs
# ==============================================================================
if (pipeline_success) { # Always run Script 80 - it calculates KPIs for the current week
  cat("\n========== RUNNING SCRIPT 80: CALCULATE KPIs ==========\n")
  tryCatch({
    # Export pipeline settings (Script 80's main() uses these as fallbacks)
    # NOTE: end_date is already a Date, just assign directly without as.Date()
    assign("end_date", end_date, envir = .GlobalEnv)
    assign("end_date_str", end_date_str, envir = .GlobalEnv)
    assign("offset", offset, envir = .GlobalEnv)
    assign("project_dir", project_dir, envir = .GlobalEnv)
    assign("data_source", data_source, envir = .GlobalEnv)
    source("r_app/80_calculate_kpis.R")
    main()  # execute with the exported settings
    # Confirm the KPI output files exist
    kpi_dir <- file.path("laravel_app", "storage", "app", project_dir, "reports", "kpis", "field_stats")
    if (dir.exists(kpi_dir)) {
      kpi_out <- list.files(kpi_dir, pattern = "\\.csv$|\\.json$")
      cat(sprintf("✓ Script 80 completed - generated %d KPI files\n", length(kpi_out)))
    } else {
      cat("✓ Script 80 completed\n")
    }
  }, error = function(e) {
    cat("✗ Error in Script 80:", e$message, "\n")
    cat("Full error:\n")
    print(e)
    pipeline_success <<- FALSE
  })
}
# ==============================================================================
# SUMMARY
# ==============================================================================
cat("\n========== PIPELINE COMPLETE ==========\n")
cat(sprintf("Project: %s\n", project_dir))
cat(sprintf("End Date: %s\n", end_date_str))
cat(sprintf("Offset: %d days\n", offset))
# Report the overall status flag set by the stage error handlers above
if (pipeline_success) {
  cat("Status: ✓ All scripts completed successfully\n")
} else {
  cat("Status: ✗ Pipeline failed - check errors above\n")
}
cat("Pipeline sequence: Python Download → R 10 → R 20 → R 21 → R 30 → Python 31 → R 40 → R 80\n")