From 9a55d2fcf80946dfb873b8fa601391d665e99f5f Mon Sep 17 00:00:00 2001 From: Timon Date: Tue, 27 Jan 2026 08:58:06 +0100 Subject: [PATCH] Refactor full pipeline script to include intelligent checking of existing outputs and dynamic execution of scripts. Added new Python scripts for RGB validation and evaluation template creation. Enhanced error handling and logging throughout the pipeline. --- python_app/31_harvest_imminent_weekly.py | 12 +- ... => batch_rgb_validation_top_fields_v3.py} | 112 +++- python_app/create_rgb_evaluation_template.py | 193 +++++++ python_app/debug_all_tiles_for_date.py | 107 ---- python_app/debug_field_mask.py | 102 ---- python_app/debug_tiff_inspect.py | 47 -- python_app/download_planet_missing_dates.py | 2 +- python_app/merge_ci_data.R | 29 - python_app/rgb_visualization.py | 504 +++++++++++++----- r_app/10_create_master_grid_and_split_tiffs.R | 8 +- r_app/20_ci_extraction.R | 6 +- r_app/30_interpolate_growth_model.R | 2 +- r_app/80_calculate_kpis.R | 24 +- r_app/run_full_pipeline.R | 455 ++++++++++++---- 14 files changed, 1035 insertions(+), 568 deletions(-) rename python_app/{batch_rgb_validation_top_fields.py => batch_rgb_validation_top_fields_v3.py} (69%) create mode 100644 python_app/create_rgb_evaluation_template.py delete mode 100644 python_app/debug_all_tiles_for_date.py delete mode 100644 python_app/debug_field_mask.py delete mode 100644 python_app/debug_tiff_inspect.py delete mode 100644 python_app/merge_ci_data.R diff --git a/python_app/31_harvest_imminent_weekly.py b/python_app/31_harvest_imminent_weekly.py index 8722eda..39d3eb9 100644 --- a/python_app/31_harvest_imminent_weekly.py +++ b/python_app/31_harvest_imminent_weekly.py @@ -1,4 +1,4 @@ -""" +r""" Script: 02_harvest_imminent_weekly.py Purpose: WEEKLY MONITORING - Run WEEKLY/DAILY to get real-time harvest status for all fields @@ -38,12 +38,12 @@ Use Cases: - Feed into 09b script for weekly dashboard reports Usage: - python 02_harvest_imminent_weekly.py [project_name] + 
python python_app/31_harvest_imminent_weekly.py angata Examples: - python 02_harvest_imminent_weekly.py angata - python 02_harvest_imminent_weekly.py esa - python 02_harvest_imminent_weekly.py chemba + python python_app/31_harvest_imminent_weekly.py angata + python python_app/31_harvest_imminent_weekly.py esa + python python_app/31_harvest_imminent_weekly.py chemba If no project specified, defaults to 'angata' """ @@ -264,7 +264,7 @@ def main(): # [3] Load model (from python_app directory) print("\n[3/5] Loading Model 307...") - model_dir = Path(".") # Current directory is python_app/, contains model.pt, config.json, scalers.pkl + model_dir = Path("python_app") # Model files located in python_app/ directory model, config, scalers = load_model_and_config(model_dir) device = torch.device("cuda" if torch.cuda.is_available() else "cpu") print(f" Device: {device}") diff --git a/python_app/batch_rgb_validation_top_fields.py b/python_app/batch_rgb_validation_top_fields_v3.py similarity index 69% rename from python_app/batch_rgb_validation_top_fields.py rename to python_app/batch_rgb_validation_top_fields_v3.py index cccb9ae..259a081 100644 --- a/python_app/batch_rgb_validation_top_fields.py +++ b/python_app/batch_rgb_validation_top_fields_v3.py @@ -1,23 +1,25 @@ #!/usr/bin/env python """ -Batch RGB Validation for Top 50 Largest Fields +Batch RGB Validation for Top 100 Largest Fields - V3 -Generates 5x3 RGB temporal grids for the latest complete harvest season of the 50 largest fields. +Same as v1 but with dynamic image selection (checks for actual data, skips empty/black images). + +Generates 5x3 RGB temporal grids for the latest complete harvest season of the 100 largest fields. Uses actual season_end dates from harvest.xlsx for visual validation of field conditions at harvest. 
Configuration: - GeoJSON: pivot.geojson (defines field boundaries and sizes) - Harvest data: harvest.xlsx (season_end dates for completed harvests) -- CI data: ci_data_for_python.csv - Output: RGB directory with field_name_YYYYMMDD_harvest_rgb.png Usage: - python batch_rgb_validation_top_fields.py + python batch_rgb_validation_top_fields_v3.py --field 1 + python batch_rgb_validation_top_fields_v3.py Output: - Saves 5x3 RGB grids to: laravel_app/storage/app/angata/RGB/ - Filenames: field___harvest_rgb.png - - Each grid shows 15 images at 7-day intervals around the season_end date + - Each grid shows 15 images around the harvest date (dynamic date selection, skips empty images) """ import json @@ -26,6 +28,7 @@ import pandas as pd from pathlib import Path from datetime import datetime, timedelta import sys +import argparse # Add parent directory to path for imports sys.path.insert(0, str(Path(__file__).parent)) @@ -189,16 +192,26 @@ def load_harvest_dates_from_xlsx(harvest_xlsx_path, top_50_fields_df): def main(): + parser = argparse.ArgumentParser(description='RGB validation of harvest dates using satellite imagery (v3 - dynamic)') + parser.add_argument('--field', type=str, default=None, help='Specific field ID to validate (e.g., "1" or "10022")') + parser.add_argument('--project', type=str, default='angata', help='Project name (default: angata)') + + args = parser.parse_args() + print("="*90) - print("BATCH RGB VALIDATION - TOP 50 LARGEST FIELDS") - print("Visual inspection of latest harvest dates from harvest.xlsx using RGB imagery") + if args.field: + print(f"RGB VALIDATION V3 - SINGLE FIELD: {args.field}") + else: + print("RGB VALIDATION V3 - TOP 50 LARGEST FIELDS") + print("Visual inspection of harvest dates from harvest.xlsx using RGB imagery (dynamic selection)") print("="*90) # Configuration - geojson_path = Path("laravel_app/storage/app/angata/Data/pivot.geojson") - harvest_xlsx = Path("laravel_app/storage/app/angata/Data/harvest.xlsx") - output_dir = 
Path("laravel_app/storage/app/angata/RGB") - tiff_dir = Path("laravel_app/storage/app/angata/merged_final_tif/5x5") + project = args.project + geojson_path = Path(f"laravel_app/storage/app/{project}/Data/pivot.geojson") + harvest_xlsx = Path(f"laravel_app/storage/app/{project}/Data/harvest.xlsx") + output_dir = Path(f"laravel_app/storage/app/{project}/RGB") + tiff_dir = Path(f"laravel_app/storage/app/{project}/merged_final_tif/5x5") # Verify paths if not geojson_path.exists(): @@ -213,18 +226,83 @@ def main(): output_dir.mkdir(parents=True, exist_ok=True) - # Step 1: Load GeoJSON and get top 50 largest fields - print("\n[1/4] Loading GeoJSON and identifying top 50 largest fields...") + # Handle single field mode + if args.field: + print(f"\n[1/3] Loading harvest data for field {args.field}...") + + harvest_df = pd.read_excel(harvest_xlsx) + harvest_df['season_end'] = pd.to_datetime(harvest_df['season_end'], errors='coerce') + harvest_df['field'] = harvest_df['field'].astype(str).str.strip() + + field_records = harvest_df[harvest_df['field'] == args.field] + field_records = field_records[field_records['season_end'].notna()] + + if len(field_records) == 0: + print(f"✗ No harvest data found for field {args.field}") + return + + # Get latest harvest for this field + latest_idx = field_records['season_end'].idxmax() + latest_row = field_records.loc[latest_idx] + harvest_date = latest_row['season_end'] + + print(f" ✓ Found harvest: {harvest_date.strftime('%Y-%m-%d')}") + + # Load field name from GeoJSON + print(f"\n[2/3] Loading field name from GeoJSON...") + with open(geojson_path) as f: + geojson_data = json.load(f) + + field_name = f"field_{args.field}" + for feature in geojson_data.get('features', []): + props = feature.get('properties', {}) + if str(props.get('field', '')) == args.field: + field_name = props.get('name', field_name) + break + + print(f" ✓ Field name: {field_name}") + + # Generate RGB grid + print(f"\n[3/3] Generating RGB validation grid (v3 
dynamic)...") + results = generate_rgb_grids( + field_data=None, + field_id=args.field, + registered_harvest_dates=[], + predicted_harvest_dates=[ + { + 'harvest_date': harvest_date, + 'model_name': 'harvest_xlsx' + } + ], + output_dir=str(output_dir), + tiff_dir=str(tiff_dir), + geojson_path=str(geojson_path) + ) + + print("\n" + "="*90) + if results['predicted']: + print(f"✓ RGB grid generated successfully!") + print(f" Field: {field_name} (ID: {args.field})") + print(f" Harvest date: {harvest_date.strftime('%Y-%m-%d')}") + print(f" Output: {output_dir}") + else: + print(f"⚠ No RGB grid generated (no imagery available)") + print("="*90) + return + + # Batch mode for top 100 fields + print(f"\n[1/4] Loading GeoJSON and identifying top 100 largest fields...") + fields_df = load_geojson_and_calculate_areas(geojson_path) if fields_df is None: return - top_50_fields = fields_df.head(50) - print(f" ✓ Selected {len(top_50_fields)} largest fields for processing") + top_100_fields = fields_df.head(100) + print(f" ✓ Selected {len(top_100_fields)} largest fields for processing") # Step 2: Load harvest dates from Excel print("\n[2/4] Loading harvest dates from Excel (latest complete seasons)...") - harvest_dates = load_harvest_dates_from_xlsx(harvest_xlsx, top_50_fields) + harvest_dates = load_harvest_dates_from_xlsx(harvest_xlsx, top_100_fields) if len(harvest_dates) == 0: print("✗ No harvest dates found in Excel file") @@ -237,7 +315,7 @@ def main(): print(f" ... 
and {len(harvest_dates) - 5} more") # Step 3: Generate RGB grids for each field - print("\n[3/4] Generating RGB validation grids...") + print("\n[3/4] Generating RGB validation grids (v3 dynamic)...") rgb_count = 0 for idx, (field_id, harvest_info) in enumerate(harvest_dates.items(), 1): diff --git a/python_app/create_rgb_evaluation_template.py b/python_app/create_rgb_evaluation_template.py new file mode 100644 index 0000000..dea094c --- /dev/null +++ b/python_app/create_rgb_evaluation_template.py @@ -0,0 +1,193 @@ +""" +Create an Excel evaluation template for RGB harvest date predictions. +Parses field names and dates directly from RGB image filenames. +""" +import os +import glob +import pandas as pd +from openpyxl import Workbook +from openpyxl.styles import Font, PatternFill, Alignment, Border, Side +from openpyxl.utils import get_column_letter +import re +from datetime import datetime + +# Configuration +BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) +RGB_DIR = os.path.join(BASE_DIR, "laravel_app", "storage", "app", "angata", "RGB") +OUTPUT_PATH = os.path.join(BASE_DIR, "laravel_app", "storage", "app", "angata", "RGB_Evaluation_Template.xlsx") + +# Evaluators +EVALUATORS = ["Joey", "Daniel", "Nik", "Dimitra", "Timon"] + +def parse_rgb_filenames(): + """ + Parse field names and harvest dates from RGB image filenames. 
+ Expected format: field_{field_id or name}_{YYYYMMDD}_harvest_xlsx_harvest_rgb.png + """ + fields_data = [] + + # Find all RGB images + rgb_files = glob.glob(os.path.join(RGB_DIR, "field_*.png")) + + for filepath in sorted(rgb_files): + filename = os.path.basename(filepath) + # Pattern: field_{field_id}_{YYYYMMDD}_harvest_xlsx_harvest_rgb.png + match = re.match(r"field_(.+?)_(\d{8})_harvest_xlsx_harvest_rgb\.png", filename) + + if match: + field_id = match.group(1) # e.g., "1000" or "91&92" + date_str = match.group(2) # e.g., "20250814" + + # Format date as YYYY-MM-DD + try: + harvest_date = datetime.strptime(date_str, "%Y%m%d").strftime("%Y-%m-%d") + except ValueError: + harvest_date = date_str + + fields_data.append({ + "field_id": field_id, + "harvest_date": harvest_date, + "filename": filename + }) + + # Sort by field_id (treating numeric parts as integers where possible) + fields_data = sorted(fields_data, key=lambda x: (x["field_id"].replace("&92", ""), )) + + return fields_data + +def create_evaluation_template(): + """Create the Excel evaluation template.""" + print("Loading field data from RGB images...") + fields_data = parse_rgb_filenames() + + if not fields_data: + print("ERROR: No RGB images found in", RGB_DIR) + return + + print(f"Found {len(fields_data)} RGB images") + + # Create workbook + wb = Workbook() + + # === SHEET 1: Evaluation Form === + ws_eval = wb.active + ws_eval.title = "Evaluation" + + # Define styles + header_fill = PatternFill(start_color="4472C4", end_color="4472C4", fill_type="solid") + header_font = Font(bold=True, color="FFFFFF", size=11) + border = Border( + left=Side(style='thin'), + right=Side(style='thin'), + top=Side(style='thin'), + bottom=Side(style='thin') + ) + center_align = Alignment(horizontal="center", vertical="center", wrap_text=True) + left_align = Alignment(horizontal="left", vertical="center", wrap_text=True) + + # Column headers + headers = ["Field ID", "Predicted Harvest Date"] + EVALUATORS + for col_idx, 
header in enumerate(headers, start=1): + cell = ws_eval.cell(row=1, column=col_idx, value=header) + cell.fill = header_fill + cell.font = header_font + cell.alignment = center_align + cell.border = border + + # Set column widths + ws_eval.column_dimensions['A'].width = 15 + ws_eval.column_dimensions['B'].width = 20 + for col_idx in range(3, 3 + len(EVALUATORS)): + ws_eval.column_dimensions[get_column_letter(col_idx)].width = 12 + + # Add data rows + for row_idx, field in enumerate(fields_data, start=2): + ws_eval.cell(row=row_idx, column=1, value=field["field_id"]) + ws_eval.cell(row=row_idx, column=2, value=field["harvest_date"]) + + # Add empty cells for evaluator responses + for col_idx in range(3, 3 + len(EVALUATORS)): + cell = ws_eval.cell(row=row_idx, column=col_idx) + cell.alignment = center_align + cell.border = border + + # Light alternating row color + if row_idx % 2 == 0: + for col_idx in range(1, 3 + len(EVALUATORS)): + ws_eval.cell(row=row_idx, column=col_idx).fill = PatternFill( + start_color="D9E8F5", end_color="D9E8F5", fill_type="solid" + ) + + # Apply borders to all data cells + for col_idx in range(1, 3 + len(EVALUATORS)): + ws_eval.cell(row=row_idx, column=col_idx).border = border + if col_idx == 1 or col_idx == 2: + ws_eval.cell(row=row_idx, column=col_idx).alignment = left_align + + # Freeze panes + ws_eval.freeze_panes = "C2" + + # === SHEET 2: Instructions === + ws_instr = wb.create_sheet("Instructions") + + instr_content = [ + ["RGB Evaluation Instructions", ""], + ["", ""], + ["Overview:", ""], + ["The generated RGB images visualize the predicted harvest dates for each field.", ""], + ["The images are 3x3 grids showing satellite imagery from different dates", ""], + ["centered on the predicted harvest date (the center/red-box image).", ""], + ["", ""], + ["What to Evaluate:", ""], + ["For each field, determine if the predicted harvest date is CORRECT:", ""], + ["", ""], + ["Instructions for Reviewing:", ""], + ["1. 
Look at the CENTER image (red box) - this is the predicted harvest date", ""], + ["2. Compare to surrounding dates (before and after)", ""], + ["3. Look for change in field color/status:", ""], + [" • BEFORE: Field appears GREEN (growing/healthy crop)", ""], + [" • AT PREDICTED DATE: Field shows BROWN/YELLOW (soil visible, ripe for harvest)", ""], + [" • AFTER: Field continues BROWN (post-harvest or dormant)", ""], + ["", ""], + ["How to Enter Your Assessment:", ""], + ["Enter one of the following in your evaluator column for each field:", ""], + [" • YES = Predicted date is CORRECT (brown/harvest-ready at center date)", ""], + [" • NO = Predicted date is INCORRECT (not ready or already harvested)", ""], + [" • ? or MAYBE = Uncertain (cloudy images, unclear field status)", ""], + ["", ""], + ["Workflow Options:", ""], + ["Option A (Divide Work): Assign 2-3 fields per evaluator (rows divided by column)", ""], + ["Option B (Full Review): Each evaluator reviews all fields (everyone fills all rows)", ""], + ["Option C (Spot Check): Each evaluator samples 5-10 random fields", ""], + ["", ""], + ["Image Location:", ""], + ["All RGB images are in: /laravel_app/storage/app/angata/RGB/", ""], + ["Format: field_{FIELD_ID}_{YYYY-MM-DD}_harvest_xlsx_harvest_rgb.png", ""], + ["", ""], + ["Notes:", ""], + ["• Cloud cover may obscure ground truth - use best judgment", ""], + ["• Fields with multiple bands or irregular shapes: focus on dominant area", ""], + ["• Use context from previous/next dates to validate your assessment", ""], + ] + + # Add instructions text + for row_idx, row_data in enumerate(instr_content, start=1): + for col_idx, value in enumerate(row_data, start=1): + cell = ws_instr.cell(row=row_idx, column=col_idx, value=value) + if row_idx == 1: + cell.font = Font(bold=True, size=14) + elif any(keyword in str(value) for keyword in ["Overview:", "Instructions", "Workflow", "Image Location", "Notes"]): + cell.font = Font(bold=True, size=11) + cell.alignment = 
Alignment(horizontal="left", vertical="top", wrap_text=True) + + ws_instr.column_dimensions['A'].width = 50 + ws_instr.column_dimensions['B'].width = 80 + + # Save workbook + wb.save(OUTPUT_PATH) + print(f"✓ Evaluation template created: {OUTPUT_PATH}") + print(f"✓ {len(fields_data)} fields added to evaluation form") + print(f"✓ Evaluators: {', '.join(EVALUATORS)}") + +if __name__ == "__main__": + create_evaluation_template() diff --git a/python_app/debug_all_tiles_for_date.py b/python_app/debug_all_tiles_for_date.py deleted file mode 100644 index ba07fe7..0000000 --- a/python_app/debug_all_tiles_for_date.py +++ /dev/null @@ -1,107 +0,0 @@ -#!/usr/bin/env python -""" -Debug script to find all tiles for a date and check which overlap with field boundary -""" - -import json -import rasterio -from rasterio.mask import mask -from pathlib import Path -import numpy as np -import shapely.geometry as shgeom -import pandas as pd - -# Load field 79 boundary -geojson_path = Path("laravel_app/storage/app/angata/Data/pivot.geojson") -field_id = "79" - -print(f"Loading field {field_id} from GeoJSON...") -with open(geojson_path) as f: - geojson_data = json.load(f) - -field_boundary = None -for feature in geojson_data.get('features', []): - props = feature.get('properties', {}) - if str(props.get('field', '')) == str(field_id): - geometry = feature.get('geometry') - if geometry: - geom_type = geometry.get('type', '') - coordinates = geometry.get('coordinates', []) - - if geom_type == 'MultiPolygon': - if coordinates and len(coordinates) > 0: - coords = coordinates[0][0] - field_boundary = shgeom.Polygon(coords) - elif geom_type == 'Polygon': - if coordinates and len(coordinates) > 0: - coords = coordinates[0] - field_boundary = shgeom.Polygon(coords) - break - -if field_boundary is None: - print(f"Field {field_id} not found") - exit(1) - -print(f"Field boundary bounds: {field_boundary.bounds}") -print(f"Field boundary area: {field_boundary.area}") - -# Find a specific date 
directory -tiff_dir = Path("laravel_app/storage/app/angata/merged_final_tif/5x5") -target_date = pd.Timestamp("2026-01-15") # Use a recent date that exists - -# Find tiles for that date -date_dirs = [] -for date_dir in tiff_dir.iterdir(): - if date_dir.is_dir(): - try: - dir_name = date_dir.name - date_str = dir_name.split('_')[0] - tile_date = pd.Timestamp(date_str) - if tile_date == target_date: - date_dirs.append(date_dir) - except: - pass - -if not date_dirs: - print(f"No tiles found for {target_date}") - exit(1) - -print(f"\nFound {len(date_dirs)} date directory(ies) for {target_date}") - -for date_dir in date_dirs: - print(f"\n=== Checking date directory: {date_dir.name} ===") - - tiles = list(date_dir.glob("*.tif")) - print(f"Found {len(tiles)} tiles in this directory") - - for tile_path in sorted(tiles): - try: - with rasterio.open(tile_path) as src: - tile_bounds = src.bounds - tile_geom = shgeom.box(*tile_bounds) - - intersects = field_boundary.intersects(tile_geom) - intersection = field_boundary.intersection(tile_geom) if intersects else None - intersection_area = intersection.area if intersection else 0 - - print(f"\n{tile_path.name}") - print(f" Tile bounds: {tile_bounds}") - print(f" Intersects field: {intersects}") - if intersects: - print(f" Intersection area: {intersection_area:.8f}") - - # Try to mask this tile - geom = shgeom.mapping(field_boundary) - try: - masked_data, _ = mask(src, [geom], crop=True, indexes=[1, 2, 3]) - print(f" ✓ Successfully masked! 
Shape: {masked_data.shape}") - - # Check the data in each band - for i, band_idx in enumerate([1, 2, 3]): - band_data = masked_data[i] - non_zero = (band_data != 0).sum() - print(f" Band {band_idx}: {non_zero} non-zero pixels out of {band_data.size}") - except ValueError as e: - print(f" ✗ Masking failed: {e}") - except Exception as e: - print(f" Error reading tile: {e}") diff --git a/python_app/debug_field_mask.py b/python_app/debug_field_mask.py deleted file mode 100644 index ce96700..0000000 --- a/python_app/debug_field_mask.py +++ /dev/null @@ -1,102 +0,0 @@ -#!/usr/bin/env python -""" -Debug script to diagnose why field boundary masking produces no data -""" - -import json -import rasterio -from rasterio.mask import mask -from pathlib import Path -import numpy as np -import shapely.geometry as shgeom - -# Load a sample field boundary -geojson_path = Path("laravel_app/storage/app/angata/Data/pivot.geojson") -field_id = "79" # A field that had issues - -print(f"Loading field {field_id} from GeoJSON...") -with open(geojson_path) as f: - geojson_data = json.load(f) - -field_boundary = None -for feature in geojson_data.get('features', []): - props = feature.get('properties', {}) - if str(props.get('field', '')) == str(field_id): - geometry = feature.get('geometry') - if geometry: - geom_type = geometry.get('type', '') - coordinates = geometry.get('coordinates', []) - - if geom_type == 'MultiPolygon': - if coordinates and len(coordinates) > 0: - coords = coordinates[0][0] - field_boundary = shgeom.Polygon(coords) - elif geom_type == 'Polygon': - if coordinates and len(coordinates) > 0: - coords = coordinates[0] - field_boundary = shgeom.Polygon(coords) - break - -if field_boundary is None: - print(f"Field {field_id} not found") - exit(1) - -print(f"Field boundary bounds: {field_boundary.bounds}") -print(f"Field boundary area: {field_boundary.area}") - -# Load a sample TIFF tile -tiff_dir = Path("laravel_app/storage/app/angata/merged_final_tif/5x5") -tile_file = None 
-for date_dir in sorted(tiff_dir.iterdir()): - if date_dir.is_dir(): - for tif in date_dir.glob("*.tif"): - if tif.stat().st_size > 12e6: - tile_file = tif - break - if tile_file: - break - -if not tile_file: - print("No suitable TIFF found") - exit(1) - -print(f"\nTesting with TIFF: {tile_file.name}") - -with rasterio.open(tile_file) as src: - print(f"TIFF Bounds: {src.bounds}") - print(f"TIFF CRS: {src.crs}") - - # Check if field boundary is within tile bounds - tile_box = shgeom.box(*src.bounds) - intersects = field_boundary.intersects(tile_box) - print(f"Field boundary intersects tile: {intersects}") - - if intersects: - intersection = field_boundary.intersection(tile_box) - print(f"Intersection area: {intersection.area}") - print(f"Intersection bounds: {intersection.bounds}") - - # Try to mask and see what we get - print("\nAttempting to mask...") - geom = shgeom.mapping(field_boundary) - try: - masked_data, _ = mask(src, [geom], crop=True, indexes=[1, 2, 3]) - print(f"Masked data shape: {masked_data.shape}") - print(f"Masked data dtype: {masked_data.dtype}") - - # Check the data - for i, band_idx in enumerate([1, 2, 3]): - band_data = masked_data[i] - print(f"\nBand {band_idx}:") - print(f" min: {np.nanmin(band_data):.6f}") - print(f" max: {np.nanmax(band_data):.6f}") - print(f" mean: {np.nanmean(band_data):.6f}") - print(f" % valid (non-zero): {(band_data != 0).sum() / band_data.size * 100:.2f}%") - print(f" % NaN: {np.isnan(band_data).sum() / band_data.size * 100:.2f}%") - - # Show sample values - valid_pixels = band_data[band_data != 0] - if len(valid_pixels) > 0: - print(f" Sample valid values: {valid_pixels[:10]}") - except ValueError as e: - print(f"Error during masking: {e}") diff --git a/python_app/debug_tiff_inspect.py b/python_app/debug_tiff_inspect.py deleted file mode 100644 index be51e5e..0000000 --- a/python_app/debug_tiff_inspect.py +++ /dev/null @@ -1,47 +0,0 @@ -#!/usr/bin/env python -""" -Debug script to inspect TIFF file structure and data 
-""" - -import rasterio -from pathlib import Path -import numpy as np - -# Pick a tile file to inspect -tiff_dir = Path("laravel_app/storage/app/angata/merged_final_tif/5x5") - -# Find first available tile -tile_file = None -for date_dir in sorted(tiff_dir.iterdir()): - if date_dir.is_dir(): - for tif in date_dir.glob("*.tif"): - if tif.stat().st_size > 12e6: # Skip empty files - tile_file = tif - break - if tile_file: - break - -if not tile_file: - print("No suitable TIFF files found") - exit(1) - -print(f"Inspecting: {tile_file.name}") -print("=" * 80) - -with rasterio.open(tile_file) as src: - print(f"Band count: {src.count}") - print(f"Data type: {src.dtypes[0]}") - print(f"Shape: {src.height} x {src.width}") - print(f"CRS: {src.crs}") - print(f"Bounds: {src.bounds}") - print() - - # Read each band - for band_idx in range(1, min(6, src.count + 1)): - data = src.read(band_idx) - print(f"Band {band_idx}:") - print(f" dtype: {data.dtype}") - print(f" range: {data.min():.6f} - {data.max():.6f}") - print(f" mean: {data.mean():.6f}") - print(f" % valid (non-zero): {(data != 0).sum() / data.size * 100:.1f}%") - print() diff --git a/python_app/download_planet_missing_dates.py b/python_app/download_planet_missing_dates.py index b6723b7..9e15d25 100644 --- a/python_app/download_planet_missing_dates.py +++ b/python_app/download_planet_missing_dates.py @@ -4,7 +4,7 @@ Purpose: Download Planet satellite data for missing dates only (skip existing fi Can be called from batch scripts or other Python scripts. 
Usage: - python download_planet_missing_dates.py --start 2022-01-01 --end 2025-12-15 --project angata + python download_planet_missing_dates.py --start 2026-01-17 --end 2026-12-20 --project angata python download_planet_missing_dates.py --start 2023-06-01 --end 2023-06-30 --project angata --dry-run Environment variables (alternative to CLI args): diff --git a/python_app/merge_ci_data.R b/python_app/merge_ci_data.R deleted file mode 100644 index 8c179bb..0000000 --- a/python_app/merge_ci_data.R +++ /dev/null @@ -1,29 +0,0 @@ -# Merge all CI RDS files into a single CSV -library(tidyverse) - -# Paths -ci_data_dir <- "r_app/experiments/ci_graph_exploration/CI_data" -output_csv <- "python_app/lstm_ci_data_combined.csv" - -# Find all RDS files -rds_files <- list.files(ci_data_dir, pattern = "\\.rds$", full.names = TRUE) -print(paste("Found", length(rds_files), "RDS files")) - -# Load and combine all files -combined_data <- tibble() - -for (file in rds_files) { - filename <- basename(file) - client_name <- sub("\\.rds$", "", filename) # Extract client name from filename - print(paste("Loading:", filename, "- Client:", client_name)) - data <- readRDS(file) - data$client <- client_name - combined_data <- bind_rows(combined_data, data) -} - -print(paste("Total rows:", nrow(combined_data))) -print(paste("Columns:", paste(names(combined_data), collapse = ", "))) - -# Write to CSV -write.csv(combined_data, output_csv, row.names = FALSE) -print(paste("✓ Saved to:", output_csv)) diff --git a/python_app/rgb_visualization.py b/python_app/rgb_visualization.py index c1fd2e0..ccb83f6 100644 --- a/python_app/rgb_visualization.py +++ b/python_app/rgb_visualization.py @@ -88,95 +88,116 @@ def load_field_boundaries(geojson_path, field_id): return None, None -def find_overlapping_tiles(target_date, tiff_dir, field_boundary, days_window=60): +def find_overlapping_tiles(target_date, tiff_dir, field_boundary, days_window=60, exclude_dates=None, debug=False): """ - Find all tile files for 
target_date (or closest date) that overlap with field_boundary. + Find tile files with actual data (not cloud-masked) for target_date or nearest date. - Tile files are organized in subdirectories by date: 5x5/YYYY-MM-DD_HH/*.tif + Searches by increasing distance from target date until finding tiles with data. + Avoids reusing dates in exclude_dates to ensure temporal diversity in grids. Args: target_date (pd.Timestamp): Target date to find tiles near tiff_dir (Path): Directory containing 5x5 date subdirectories field_boundary (shapely.Polygon): Field boundary for overlap detection days_window (int): Max days to search before/after target + exclude_dates (list): List of dates to skip (avoid repetition) + debug (bool): Enable detailed debugging output Returns: tuple: (list of tile paths, actual_date, days_diff) - list: tile paths that overlap field - pd.Timestamp: actual date of tiles found - int: days difference from target to actual date found """ target_date = pd.Timestamp(target_date) tiff_dir = Path(tiff_dir) + exclude_dates = exclude_dates or [] + exclude_dates = [pd.Timestamp(d) for d in exclude_dates] if not tiff_dir.exists(): + if debug: + print(f" [DEBUG] TIFF dir does not exist: {tiff_dir}") return [], None, None - # Find all date subdirectories - available_dates = {} # {date: ([tile file paths], actual_dir_name)} - min_size_mb = 12.0 # Empty files are ~11.56 MB + # Build map of all available dates + available_dates = {} + date_parse_errors = 0 for date_dir in tiff_dir.iterdir(): if not date_dir.is_dir(): continue - try: - # Parse date from directory name (YYYY-MM-DD or YYYY-MM-DD_HH) dir_name = date_dir.name - # Extract just the date part before underscore if it exists date_str = dir_name.split('_')[0] tile_date = pd.Timestamp(date_str) - days_diff = (tile_date - target_date).days - if abs(days_diff) > days_window: - continue - - # Find all .tif files in this date directory tile_files = [] for tile_file in date_dir.glob('*.tif'): - # Skip obviously empty 
files - file_size_mb = tile_file.stat().st_size / (1024 * 1024) - if file_size_mb >= min_size_mb: - tile_files.append(tile_file) + # Include ALL tiles, regardless of size + # Some tiles may be small but still contain valid data for specific fields + tile_files.append(tile_file) if tile_files: available_dates[tile_date] = (tile_files, dir_name) - except: - pass + except Exception as e: + date_parse_errors += 1 + if debug: + print(f" [DEBUG] Failed to parse date from {date_dir.name}: {e}") + + if debug: + print(f" [DEBUG] Found {len(available_dates)} dates with tile files, {date_parse_errors} parse errors") + print(f" [DEBUG] Date range: {min(available_dates.keys()).strftime('%Y-%m-%d') if available_dates else 'N/A'} to {max(available_dates.keys()).strftime('%Y-%m-%d') if available_dates else 'N/A'}") if not available_dates: return [], None, None - # Find closest date - closest_date = min(available_dates.keys(), key=lambda d: abs((d - target_date).days)) - days_diff = (closest_date - target_date).days - tiles, _ = available_dates[closest_date] + # Search dates by increasing distance from target, looking for tiles with actual data + sorted_dates = sorted(available_dates.keys(), key=lambda d: abs((d - target_date).days)) - # Filter tiles to only those that overlap field boundary - if rasterio is None or field_boundary is None: - # If rasterio not available, use all tiles (conservative approach) - return tiles, closest_date, days_diff + for search_date in sorted_dates: + # Skip if this date was recently used (avoid temporal repetition) + if search_date in exclude_dates: + continue + + tiles, dir_name = available_dates[search_date] + days_diff = (search_date - target_date).days + + # Try to find overlapping tiles at this date + overlapping_tiles = [] + tile_check_errors = 0 + + for tile_path in tiles: + try: + with rasterio.open(tile_path) as src: + tile_bounds = src.bounds + tile_geom = shgeom.box(*tile_bounds) + + # Debug first tile + if debug and 
len(overlapping_tiles) == 0 and tile_check_errors == 0: + print(f" [DEBUG] First tile check for {tile_path.name}:") + print(f" Tile bounds: {tile_bounds}") + print(f" Tile CRS: {src.crs}") + print(f" Field bounds: {field_boundary.bounds}") + print(f" Field geom type: {field_boundary.geom_type}") + print(f" Intersects: {tile_geom.intersects(field_boundary)}") + + if tile_geom.intersects(field_boundary): + overlapping_tiles.append(tile_path) + except Exception as e: + tile_check_errors += 1 + if debug: + print(f" [DEBUG] Error checking tile {tile_path.name}: {e}") + + if debug: + print(f" [DEBUG] Date {search_date.strftime('%Y-%m-%d')}: {len(tiles)} tiles, {len(overlapping_tiles)} overlap field, {tile_check_errors} errors") + + if overlapping_tiles: + # Found overlapping tiles, return them + print(f" [FIND_TILES] Target: {target_date.strftime('%Y-%m-%d')}, Using: {search_date.strftime('%Y-%m-%d')} ({days_diff:+d}d), Tiles: {[Path(t).name for t in overlapping_tiles]}") + return overlapping_tiles, search_date, days_diff - overlapping_tiles = [] + # No overlapping tiles found at all + if debug: + print(f" [DEBUG] No overlapping tiles found for {target_date.strftime('%Y-%m-%d')} within {len(sorted_dates)} searched dates") - for tile_path in tiles: - try: - with rasterio.open(tile_path) as src: - # Get tile bounds - tile_bounds = src.bounds # (left, bottom, right, top) - tile_geom = shgeom.box(*tile_bounds) - - # Check if tile overlaps field - if tile_geom.intersects(field_boundary): - overlapping_tiles.append(tile_path) - except: - pass - - if not overlapping_tiles: - # No overlapping tiles found, return all tiles for the closest date - return tiles, closest_date, days_diff - - return overlapping_tiles, closest_date, days_diff + return [], None, None def load_and_clip_tiff_rgb(tiff_path, field_boundary, rgb_bands=(1, 2, 3)): @@ -211,44 +232,44 @@ def load_and_clip_tiff_rgb(tiff_path, field_boundary, rgb_bands=(1, 2, 3)): # For merged_final_tif: bands 1,2,3 are R,G,B 
bands_to_read = (1, 2, 3) - # Mask and read bands + # Mask and read bands - extract ONLY the specific field polygon geom = shgeom.mapping(field_boundary) try: masked_data, _ = mask(src, [geom], crop=True, indexes=list(bands_to_read)) - - # Stack RGB rgb = np.stack([masked_data[i] for i in range(3)], axis=-1) - - # Convert to float32 if not already - rgb = rgb.astype(np.float32) - - # Normalize to 0-1 range - # Data appears to be 8-bit (0-255 range) stored as float32 - # Check actual max value to determine normalization - max_val = np.nanmax(rgb) - if max_val > 0: - # If max is around 255 or less, assume 8-bit - if max_val <= 255: - rgb = rgb / 255.0 - # If max is around 65535, assume 16-bit - elif max_val <= 65535: - rgb = rgb / 65535.0 - # Otherwise divide by max to normalize - else: - rgb = rgb / max_val - - rgb = np.clip(rgb, 0, 1) - - # Check if result is all NaN - if np.all(np.isnan(rgb)): - return None - - # Replace any remaining NaN with 0 (cloud/invalid pixels) - rgb = np.nan_to_num(rgb, nan=0.0) - - return rgb - except ValueError: + except (ValueError, RuntimeError) as e: + # Mask failed - field doesn't overlap this tile or geometry issue + print(f" MASK ERROR on {Path(tiff_path).name}: {str(e)[:50]}") return None + + # Convert to float32 if not already + rgb = rgb.astype(np.float32) + + # Normalize to 0-1 range + # Data appears to be 8-bit (0-255 range) stored as float32 + # Check actual max value to determine normalization + max_val = np.nanmax(rgb) + if max_val > 0: + # If max is around 255 or less, assume 8-bit + if max_val <= 255: + rgb = rgb / 255.0 + # If max is around 65535, assume 16-bit + elif max_val <= 65535: + rgb = rgb / 65535.0 + # Otherwise divide by max to normalize + else: + rgb = rgb / max_val + + rgb = np.clip(rgb, 0, 1) + + # Check if result is all NaN + if np.all(np.isnan(rgb)): + return None + + # Replace any remaining NaN with 0 (cloud/invalid pixels) + rgb = np.nan_to_num(rgb, nan=0.0) + + return rgb except Exception as e: return 
None @@ -284,9 +305,16 @@ def load_and_composite_tiles_rgb(tile_paths, field_boundary): if len(rgb_arrays) == 1: composited = rgb_arrays[0] else: - # If multiple tiles, use max composite - stacked = np.stack(rgb_arrays, axis=0) - composited = np.max(stacked, axis=0) + # If multiple tiles, need to handle different shapes + # Find common shape or use max/min approach that handles variable sizes + try: + # Try to stack if same shape + stacked = np.stack(rgb_arrays, axis=0) + composited = np.max(stacked, axis=0) + except ValueError: + # Different shapes - use the largest (most complete) tile + # This happens when tiles are masked to different field areas + composited = max(rgb_arrays, key=lambda x: x.size) composited = composited.astype(np.float32) @@ -307,6 +335,26 @@ def load_and_composite_tiles_rgb(tile_paths, field_boundary): return None +def has_valid_rgb_data(rgb_data, threshold=0.05): + """ + Check if RGB image has actual data (not black/empty). + Returns True if max value > threshold (not all zeros/black). 
+ """ + if rgb_data is None: + return False + + try: + # Check if there's any variation in the RGB data + data_max = np.nanmax(rgb_data) + data_min = np.nanmin(rgb_data) + + # Image is valid if max > threshold AND there's variation + has_data = data_max > threshold and (data_max - data_min) > 0.01 + return has_data + except: + return False + + def create_temporal_rgb_grid(harvest_date, field_data, field_id, tiff_dir, field_boundary, title, output_dir, harvest_type='registered', model_name=None, harvest_index=None): """ @@ -334,91 +382,263 @@ def create_temporal_rgb_grid(harvest_date, field_data, field_id, tiff_dir, field """ harvest_date = pd.Timestamp(harvest_date) - # Target dates: 15 images at 7-day intervals (8 pre, 1 near, 6 post) - target_dates = [ - harvest_date - timedelta(days=56), # T-56d - harvest_date - timedelta(days=49), # T-49d - harvest_date - timedelta(days=42), # T-42d - harvest_date - timedelta(days=35), # T-35d - harvest_date - timedelta(days=28), # T-28d - harvest_date - timedelta(days=21), # T-21d - harvest_date - timedelta(days=14), # T-14d - harvest_date - timedelta(days=7), # T-7d - harvest_date, # T~0d (near harvest) - harvest_date + timedelta(days=7), # T+7d - harvest_date + timedelta(days=14), # T+14d - harvest_date + timedelta(days=21), # T+21d - harvest_date + timedelta(days=28), # T+28d - harvest_date + timedelta(days=35), # T+35d - harvest_date + timedelta(days=42), # T+42d - harvest_date + timedelta(days=56), # T+56d (Note: non-standard to fill 5th col in row 3) - ] + # Pre-allocate 15 image slots + rgb_images = [None] * 15 + days_offsets = [None] * 15 + actual_dates = [None] * 15 + used_dates = set() # Use set for efficient lookups - # Find TIFFs for each date - rgb_images = [] - days_offsets = [] - actual_dates = [] # Store actual dates of TIFFs found + # STEP 0: Debug - List all available dates + print(f" [STEP 0] Checking available TIFF dates in {tiff_dir}...") + available_dates = [] + if tiff_dir.exists(): + for date_folder in 
sorted(tiff_dir.iterdir()): + if date_folder.is_dir(): + try: + date_obj = datetime.strptime(date_folder.name, '%Y-%m-%d').date() + available_dates.append(date_obj) + except: + pass + print(f" Found {len(available_dates)} dates with data: {available_dates[:5]}... (showing first 5)") - for target in target_dates: - tile_paths, actual_date, days_diff = find_overlapping_tiles(target, tiff_dir, field_boundary, days_window=60) - - if not tile_paths or actual_date is None: - rgb_images.append(None) - days_offsets.append(None) - actual_dates.append(None) - print(f" ⚠ No tiles found within 60 days of {target.strftime('%Y-%m-%d')} with sufficient data") - continue - - rgb = load_and_composite_tiles_rgb(tile_paths, field_boundary) - rgb_images.append(rgb) - days_offsets.append(days_diff) - actual_dates.append(actual_date) - - if rgb is not None: - print(f" ✓ Loaded {len(tile_paths)} tile(s) for {actual_date.strftime('%Y-%m-%d')} ({days_diff:+d}d from target)") + # STEP 1: Find anchor image (closest to predicted harvest date) FIRST + # Search within ±14 days of predicted harvest date first, then expand if needed + print(f" [STEP 1] Finding anchor (closest to harvest {harvest_date.strftime('%Y-%m-%d')}, searching ±14 days)...") + anchor_tile_paths, anchor_date, anchor_days_diff = find_overlapping_tiles( + harvest_date, tiff_dir, field_boundary, days_window=14, exclude_dates=[], debug=False + ) + + anchor_rgb = None + anchor_idx = 8 # Position 8 is the center (T~0d / harvest date position) + failed_anchor_dates = [] # Track dates that failed validation + + if anchor_tile_paths and anchor_date: + anchor_rgb = load_and_composite_tiles_rgb(anchor_tile_paths, field_boundary) + if anchor_rgb is not None and has_valid_rgb_data(anchor_rgb): + rgb_images[anchor_idx] = anchor_rgb + days_offsets[anchor_idx] = 0 # Anchor is reference point + actual_dates[anchor_idx] = anchor_date + used_dates.add(anchor_date) + print(f" ✓ ANCHOR FOUND (±14d): {anchor_date.strftime('%Y-%m-%d')} 
({anchor_days_diff:+d}d from predicted harvest)") else: - print(f" ⚠ Loaded {len(tile_paths)} tile(s) but RGB data is None") + failed_anchor_dates.append(anchor_date) + print(f" ⚠ Found date {anchor_date.strftime('%Y-%m-%d')} within ±14d, but image has no valid data") + print(f" [RETRY] Expanding anchor search to ±60 days (excluding failed dates)...") + anchor_tile_paths, anchor_date, anchor_days_diff = find_overlapping_tiles( + harvest_date, tiff_dir, field_boundary, days_window=60, exclude_dates=set(failed_anchor_dates), debug=False + ) + if anchor_tile_paths and anchor_date: + anchor_rgb = load_and_composite_tiles_rgb(anchor_tile_paths, field_boundary) + if anchor_rgb is not None and has_valid_rgb_data(anchor_rgb): + rgb_images[anchor_idx] = anchor_rgb + days_offsets[anchor_idx] = 0 # Anchor is reference point + actual_dates[anchor_idx] = anchor_date + used_dates.add(anchor_date) + print(f" ✓ ANCHOR FOUND (±60d): {anchor_date.strftime('%Y-%m-%d')} ({anchor_days_diff:+d}d from predicted harvest)") + else: + failed_anchor_dates.append(anchor_date) + print(f" ✗ No valid anchor found even within ±60 days") + else: + print(f" ✗ No tiles found for any date within ±60 days") + else: + print(f" ⚠ No tiles found within ±14 days, expanding search...") + anchor_tile_paths, anchor_date, anchor_days_diff = find_overlapping_tiles( + harvest_date, tiff_dir, field_boundary, days_window=60, exclude_dates=[], debug=False + ) + if anchor_tile_paths and anchor_date: + anchor_rgb = load_and_composite_tiles_rgb(anchor_tile_paths, field_boundary) + if anchor_rgb is not None and has_valid_rgb_data(anchor_rgb): + rgb_images[anchor_idx] = anchor_rgb + days_offsets[anchor_idx] = 0 # Anchor is reference point + actual_dates[anchor_idx] = anchor_date + used_dates.add(anchor_date) + print(f" ✓ ANCHOR FOUND (±60d): {anchor_date.strftime('%Y-%m-%d')} ({anchor_days_diff:+d}d from predicted harvest)") + else: + print(f" ✗ No valid anchor found even within ±60 days") + else: + print(f" ✗ No tiles 
found for any date within ±60 days") + + # STEP 2: Dynamically collect images BEFORE anchor date + # Strategy: Go backwards from anchor with progressively larger search windows + # Start at 7 days, then try 10, 15, 20, 30+ days apart + print(f" [STEP 2] Collecting images BEFORE anchor (going backwards, flexible spacing)...") + before_positions = [7, 6, 5, 4, 3, 2, 1, 0] # Will fill in reverse order (7→0) + before_images = [] # (position, date, rgb, offset) + + pos_idx = 0 # Index into before_positions + last_found_date = anchor_date + + # Progressive search offsets: try these day offsets in order + search_offsets = [7, 10, 15, 20, 30, 40, 60, 90, 120] # Days before last found image + + while pos_idx < len(before_positions) and last_found_date.year >= 2024: + found_this_iteration = False + + # Try each offset until we find a valid image + for days_offset in search_offsets: + search_target_date = last_found_date - timedelta(days=days_offset) + + tile_paths, actual_date, days_diff = find_overlapping_tiles( + search_target_date, tiff_dir, field_boundary, days_window=60, exclude_dates=used_dates, debug=False + ) + + if tile_paths and actual_date: + rgb = load_and_composite_tiles_rgb(tile_paths, field_boundary) + if rgb is not None and has_valid_rgb_data(rgb): + # Found valid image! 
+ overall_max = np.nanmax(rgb) + overall_min = np.nanmin(rgb) + + offset_from_anchor = (actual_date - anchor_date).days + before_images.append((before_positions[pos_idx], actual_date, rgb, offset_from_anchor)) + used_dates.add(actual_date) + last_found_date = actual_date # Move backwards from this date + + print(f" ✓ Before[{pos_idx}]: {actual_date.strftime('%Y-%m-%d')} ({offset_from_anchor:+d}d from anchor) - RGB: {overall_min:.4f}-{overall_max:.4f}") + + pos_idx += 1 + found_this_iteration = True + break # Found one, stop trying larger offsets + + # If nothing found with any offset, we're done collecting before images + if not found_this_iteration: + break + + # Store collected before images + for pos, actual_date, rgb, offset in before_images: + rgb_images[pos] = rgb + actual_dates[pos] = actual_date + days_offsets[pos] = offset + + # STEP 3: Dynamically collect images AFTER anchor date + # Strategy: Go forwards from anchor with progressively larger search windows + # Start at 7 days, then try 10, 15, 20, 30+ days apart + print(f" [STEP 3] Collecting images AFTER anchor (going forwards, flexible spacing)...") + after_positions = [9, 10, 11, 12, 13, 14] # Will fill in order (9→14) + after_images = [] # (position, date, rgb, offset) + + pos_idx = 0 # Index into after_positions + last_found_date = anchor_date + max_search_date = anchor_date + timedelta(days=200) # Don't search beyond 200 days forward + + # Progressive search offsets: try these day offsets in order + search_offsets = [7, 10, 15, 20, 30, 40, 60, 90, 120] # Days after last found image + + while pos_idx < len(after_positions) and last_found_date < max_search_date: + found_this_iteration = False + + # Try each offset until we find a valid image + for days_offset in search_offsets: + search_target_date = last_found_date + timedelta(days=days_offset) + + # Don't search beyond max date + if search_target_date > max_search_date: + break + + tile_paths, actual_date, days_diff = find_overlapping_tiles( + 
search_target_date, tiff_dir, field_boundary, days_window=60, exclude_dates=used_dates, debug=False + ) + + if tile_paths and actual_date: + rgb = load_and_composite_tiles_rgb(tile_paths, field_boundary) + if rgb is not None and has_valid_rgb_data(rgb): + # Found valid image! + overall_max = np.nanmax(rgb) + overall_min = np.nanmin(rgb) + + offset_from_anchor = (actual_date - anchor_date).days + after_images.append((after_positions[pos_idx], actual_date, rgb, offset_from_anchor)) + used_dates.add(actual_date) + last_found_date = actual_date # Move forwards from this date + + print(f" ✓ After[{pos_idx}]: {actual_date.strftime('%Y-%m-%d')} ({offset_from_anchor:+d}d from anchor) - RGB: {overall_min:.4f}-{overall_max:.4f}") + + pos_idx += 1 + found_this_iteration = True + break # Found one, stop trying larger offsets + + # If nothing found with any offset, we're done collecting after images + if not found_this_iteration: + break + + # Store collected after images + for pos, actual_date, rgb, offset in after_images: + rgb_images[pos] = rgb + actual_dates[pos] = actual_date + days_offsets[pos] = offset # Create 5x3 grid plot (15 images) fig, axes = plt.subplots(3, 5, figsize=(25, 15)) - fig.suptitle(f'{title}\nField {field_id} - {harvest_type.upper()} Harvest: {harvest_date.strftime("%Y-%m-%d")}', + + # Build title with anchor offset information + anchor_offset_from_harvest = (actual_dates[8] - harvest_date).days if actual_dates[8] is not None else None + if anchor_offset_from_harvest is not None and anchor_offset_from_harvest != 0: + anchor_info = f"(Anchor: {actual_dates[8].strftime('%Y-%m-%d')}, {anchor_offset_from_harvest:+d}d from predicted harvest)" + else: + anchor_info = f"(Exact match with anchor: {actual_dates[8].strftime('%Y-%m-%d')})" if actual_dates[8] is not None else "" + + fig.suptitle(f'{title}\nField {field_id} - {harvest_type.upper()} Harvest: {harvest_date.strftime("%Y-%m-%d")} {anchor_info}', fontsize=16, fontweight='bold') # Grid positions (5 
columns, 3 rows = 15 images) positions = [ ('T-56d', 0, 0), ('T-49d', 0, 1), ('T-42d', 0, 2), ('T-35d', 0, 3), ('T-28d', 0, 4), - ('T-21d', 1, 0), ('T-14d', 1, 1), ('T-7d', 1, 2), ('T~0d', 1, 3), ('T+7d', 1, 4), + ('T-21d', 1, 0), ('T-14d', 1, 1), ('T-7d', 1, 2), ('HARVEST', 1, 3), ('T+7d', 1, 4), ('T+14d', 2, 0), ('T+21d', 2, 1), ('T+28d', 2, 2), ('T+35d', 2, 3), ('T+42d', 2, 4), ] - for idx, (label, row, col) in enumerate(positions): # All 15 images + for idx, (label, row, col) in enumerate(positions): ax = axes[row, col] if idx < len(rgb_images) and rgb_images[idx] is not None: rgb_data = rgb_images[idx] - # Debug: check data range - data_min, data_max = np.nanmin(rgb_data), np.nanmax(rgb_data) - print(f" DEBUG: {label} RGB range: {data_min:.4f} - {data_max:.4f}, shape: {rgb_data.shape}") + # Debug: check data range for ALL bands + data_min = np.nanmin(rgb_data) + data_max = np.nanmax(rgb_data) + data_mean = np.nanmean(rgb_data) + data_std = np.nanstd(rgb_data) + + # Check per-band stats + r_min, r_max, r_mean = np.nanmin(rgb_data[:,:,0]), np.nanmax(rgb_data[:,:,0]), np.nanmean(rgb_data[:,:,0]) + g_min, g_max, g_mean = np.nanmin(rgb_data[:,:,1]), np.nanmax(rgb_data[:,:,1]), np.nanmean(rgb_data[:,:,1]) + b_min, b_max, b_mean = np.nanmin(rgb_data[:,:,2]), np.nanmax(rgb_data[:,:,2]), np.nanmean(rgb_data[:,:,2]) + + print(f" DEBUG VALID {label} ({actual_dates[idx].strftime('%Y-%m-%d')}): RGB overall {data_min:.4f}-{data_max:.4f} (mean={data_mean:.4f}, std={data_std:.4f})") + print(f" R: {r_min:.4f}-{r_max:.4f} (μ={r_mean:.4f}), G: {g_min:.4f}-{g_max:.4f} (μ={g_mean:.4f}), B: {b_min:.4f}-{b_max:.4f} (μ={b_mean:.4f})") # Display with explicit vmin/vmax to handle normalized 0-1 data ax.imshow(rgb_data, vmin=0, vmax=1) - # Build title: label + offset + actual date - offset_str = f"{days_offsets[idx]:+d}d" if days_offsets[idx] is not None else "?" 
- date_str = actual_dates[idx].strftime('%Y-%m-%d') if actual_dates[idx] is not None else "No Date" - ax.set_title(f'{label}\n{offset_str}\n{date_str}', fontsize=10, fontweight='bold') + # Build title: show BOTH anchor offset AND harvest offset + if days_offsets[idx] is not None: + offset_from_anchor = days_offsets[idx] + offset_from_harvest = (actual_dates[idx] - harvest_date).days + + if idx == 8: # ANCHOR/HARVEST position + if offset_from_harvest == 0: + offset_str = f"HARVEST\n(Image: {actual_dates[idx].strftime('%Y-%m-%d')})" + else: + offset_str = f"HARVEST\n(Image: {actual_dates[idx].strftime('%Y-%m-%d')}, {offset_from_harvest:+d}d from predicted)" + else: + # Show both offsets: from anchor and from harvest + offset_str = f"{offset_from_anchor:+d}d from anchor\n{offset_from_harvest:+d}d from harvest\n{actual_dates[idx].strftime('%Y-%m-%d')}" + else: + offset_str = "No Data" - # Add red box around harvest date (T~0d at row=1, col=3) - if label == 'T~0d': + ax.set_title(offset_str, fontsize=9, fontweight='bold') + + # Add red box around the ANCHOR IMAGE (position 8 is harvest/anchor) + if idx == 8: # Position 8 is the anchor for spine in ax.spines.values(): spine.set_edgecolor('red') spine.set_linewidth(4) else: ax.text(0.5, 0.5, 'No Data', ha='center', va='center', fontsize=12, color='gray') - ax.set_title(label, fontsize=10) + ax.set_title('No Data', fontsize=10) + print(f" DEBUG EMPTY {label}: No image data collected") - # Add red box for T~0d even if no data - if label == 'T~0d': + # Add red box for anchor position even if no data + if idx == 8: # Position 8 is the anchor for spine in ax.spines.values(): spine.set_edgecolor('red') spine.set_linewidth(4) diff --git a/r_app/10_create_master_grid_and_split_tiffs.R b/r_app/10_create_master_grid_and_split_tiffs.R index 6279fb2..fdab65c 100644 --- a/r_app/10_create_master_grid_and_split_tiffs.R +++ b/r_app/10_create_master_grid_and_split_tiffs.R @@ -6,6 +6,8 @@ #' 2. 
Create master 5×5 grid covering all TIFFs #' 3. Split each daily TIFF into 25 tiles using the master grid #' 4. Save tiles in date-specific folders: daily_tiles/[DATE]/[DATE]_[TILE_ID].tif +#' & "C:\Program Files\R\R-4.4.3\bin\x64\Rscript.exe" r_app/10_create_master_grid_and_split_tiffs.R 2026-01-13 2026-01-18 + library(terra) library(sf) @@ -32,7 +34,7 @@ if (length(args) >= 2) { } PROJECT <- "angata" -TIFF_FOLDER <- file.path("..", "laravel_app", "storage", "app", PROJECT, "merged_tif_8b") +TIFF_FOLDER <- file.path("laravel_app", "storage", "app", PROJECT, "merged_tif_8b") # GRID SIZE CONFIGURATION - Change this to use different grid sizes # Options: 5x5 (25 tiles), 10x10 (100 tiles), etc. @@ -42,10 +44,10 @@ GRID_NCOLS <- 5 # Construct grid-specific subfolder path GRID_SIZE_LABEL <- paste0(GRID_NCOLS, "x", GRID_NROWS) -OUTPUT_FOLDER <- file.path("..", "laravel_app", "storage", "app", PROJECT, "daily_tiles_split", GRID_SIZE_LABEL) +OUTPUT_FOLDER <- file.path("laravel_app", "storage", "app", PROJECT, "daily_tiles_split", GRID_SIZE_LABEL) # Load field boundaries for overlap checking -GEOJSON_PATH <- file.path("..", "laravel_app", "storage", "app", PROJECT, "Data", "pivot.geojson") +GEOJSON_PATH <- file.path("laravel_app", "storage", "app", PROJECT, "Data", "pivot.geojson") cat("Combined: Create Master Grid (", GRID_SIZE_LABEL, ") and Split TIFFs into Tiles\n", sep = "") cat("Grid subfolder: daily_tiles_split/", GRID_SIZE_LABEL, "/\n", sep = "") diff --git a/r_app/20_ci_extraction.R b/r_app/20_ci_extraction.R index d9c3d37..ab82188 100644 --- a/r_app/20_ci_extraction.R +++ b/r_app/20_ci_extraction.R @@ -13,13 +13,13 @@ # # Examples: # # Angata 8-band data (with UDM cloud masking) -# & 'C:\Program Files\R\R-4.4.3\bin\x64\Rscript' r_app/02_ci_extraction.R 2026-01-02 7 angata merged_tif_8b +# & 'C:\Program Files\R\R-4.4.3\bin\x64\Rscript' r_app/20_ci_extraction.R 2026-01-02 7 angata merged_tif_8b # # # Aura 4-band data -# Rscript 02_ci_extraction.R 2025-11-26 7 aura 
merged_tif +# Rscript 20_ci_extraction.R 2025-11-26 7 aura merged_tif # # # Auto-detects and uses tiles if available: -# Rscript 02_ci_extraction.R 2026-01-02 7 angata (uses tiles if daily_tiles_split/ exists) +# Rscript 20_ci_extraction.R 2026-01-02 7 angata (uses tiles if daily_tiles_split/ exists) # 1. Load required packages # ----------------------- diff --git a/r_app/30_interpolate_growth_model.R b/r_app/30_interpolate_growth_model.R index 6707190..ed310e5 100644 --- a/r_app/30_interpolate_growth_model.R +++ b/r_app/30_interpolate_growth_model.R @@ -8,7 +8,7 @@ # # Usage: Rscript interpolate_growth_model.R [project_dir] # - project_dir: Project directory name (e.g., "chemba") -# & 'C:\Program Files\R\R-4.4.3\bin\x64\Rscript' r_app/03_interpolate_growth_model.R angata +# & 'C:\Program Files\R\R-4.4.3\bin\x64\Rscript' r_app/30_interpolate_growth_model.R angata # 1. Load required packages # ----------------------- diff --git a/r_app/80_calculate_kpis.R b/r_app/80_calculate_kpis.R index bef0764..ed2c330 100644 --- a/r_app/80_calculate_kpis.R +++ b/r_app/80_calculate_kpis.R @@ -186,8 +186,15 @@ main <- function() { end_date <- if (length(args) >= 1 && !is.na(args[1])) { as.Date(args[1]) } else if (exists("end_date", envir = .GlobalEnv)) { - # For recursive calls, use the end_date that was set in the global environment - get("end_date", envir = .GlobalEnv) + global_date <- get("end_date", envir = .GlobalEnv) + # Check if it's a valid Date with length > 0 + if (is.Date(global_date) && length(global_date) > 0 && !is.na(global_date)) { + global_date + } else if (exists("end_date_str", envir = .GlobalEnv)) { + as.Date(get("end_date_str", envir = .GlobalEnv)) + } else { + Sys.Date() + } } else if (exists("end_date_str", envir = .GlobalEnv)) { as.Date(get("end_date_str", envir = .GlobalEnv)) } else { @@ -210,10 +217,15 @@ main <- function() { 7 } + # Validate end_date is a proper Date object + if (is.null(end_date) || length(end_date) == 0 || !inherits(end_date, "Date")) 
{ + stop("ERROR: end_date is not valid. Got: ", class(end_date), " with length ", length(end_date)) + } + assign("project_dir", project_dir, envir = .GlobalEnv) assign("end_date_str", format(end_date, "%Y-%m-%d"), envir = .GlobalEnv) - message("\n" %+% strrep("=", 70)) + message("\n", strrep("=", 70)) message("80_CALCULATE_KPIs.R - CONSOLIDATED KPI CALCULATION") message(strrep("=", 70)) message("Date:", format(end_date, "%Y-%m-%d")) @@ -238,7 +250,7 @@ main <- function() { # ========== PER-FIELD ANALYSIS (SC-64) ========== - message("\n" %+% strrep("-", 70)) + message("\n", strrep("-", 70)) message("PHASE 1: PER-FIELD WEEKLY ANALYSIS (SC-64 ENHANCEMENTS)") message(strrep("-", 70)) @@ -694,9 +706,9 @@ main <- function() { # ========== FINAL SUMMARY ========== - cat("\n" %+% strrep("=", 70) %+% "\n") + cat("\n", strrep("=", 70), "\n") cat("80_CALCULATE_KPIs.R - COMPLETION SUMMARY\n") - cat(strrep("=", 70) %+% "\n") + cat(strrep("=", 70), "\n") cat("Per-field analysis fields analyzed:", nrow(field_analysis_df), "\n") cat("Excel export:", export_paths$excel, "\n") cat("RDS export:", export_paths$rds, "\n") diff --git a/r_app/run_full_pipeline.R b/r_app/run_full_pipeline.R index 657fec1..ae6ff14 100644 --- a/r_app/run_full_pipeline.R +++ b/r_app/run_full_pipeline.R @@ -1,31 +1,40 @@ # ============================================================================== # FULL PIPELINE RUNNER # ============================================================================== -# Runs scripts 02, 03, 04, 09 (KPIs), 09 (Weekly), and 10 (CI Report Simple) +# Mixed Python/R pipeline: +# 1. Python: Download Planet images +# 2. R 10: Create master grid and split TIFFs +# 3. R 20: CI Extraction +# 4. R 21: Convert CI RDS to CSV +# 5. R 30: Interpolate growth model +# 6. Python 31: Harvest imminent weekly +# 7. R 40: Mosaic creation +# 8. 
R 80: Calculate KPIs # # ============================================================================== # HOW TO RUN THIS SCRIPT # ============================================================================== # -# In PowerShell or Command Prompt: +# Run from the smartcane/ directory: # # Option 1 (Recommended - shows real-time output): -# Rscript run_full_pipeline.R +# Rscript r_app/run_full_pipeline.R # # Option 2 (Full path to Rscript - use & in PowerShell for paths with spaces): -# & "C:\Program Files\R\R-4.4.3\bin\x64\Rscript.exe" run_full_pipeline.R +# & "C:\Program Files\R\R-4.4.3\bin\x64\Rscript.exe" r_app/run_full_pipeline.R # # Option 3 (Batch mode - output saved to .Rout file): -# R CMD BATCH --vanilla run_full_pipeline.R +# R CMD BATCH --vanilla r_app/run_full_pipeline.R # # ============================================================================== # ============================================================================== # *** EDIT THESE VARIABLES *** -end_date <- "2025-12-24" # or specify: "2025-12-02", Sys.Date() +end_date <- as.Date("2026-01-27") # or specify: as.Date("2026-01-27") , Sys.Date() offset <- 7 # days to look back project_dir <- "angata" # project name: "esa", "aura", "angata", "chemba" data_source <- if (project_dir == "angata") "merged_tif_8b" else "merged_tif" +force_rerun <- FALSE # Set to TRUE to force all scripts to run even if outputs exist # *************************** # Format dates @@ -35,117 +44,350 @@ end_date_str <- format(as.Date(end_date), "%Y-%m-%d") pipeline_success <- TRUE # ============================================================================== -# SCRIPT 02: CI EXTRACTION +# INTELLIGENT CHECKING: What has already been completed? 
# ============================================================================== -cat("\n========== RUNNING SCRIPT 02: CI EXTRACTION ==========\n") -tryCatch({ - source("r_app/02_ci_extraction.R") - main() # Call the main function - cat("✓ Script 02 completed\n") -}, error = function(e) { - cat("✗ Error in Script 02:", e$message, "\n") - pipeline_success <<- FALSE -}) +cat("\n========== CHECKING EXISTING OUTPUTS ==========\n") -# ============================================================================== -# SCRIPT 03: INTERPOLATE GROWTH MODEL -# ============================================================================== -cat("\n========== RUNNING SCRIPT 03: INTERPOLATE GROWTH MODEL ==========\n") -tryCatch({ - source("r_app/03_interpolate_growth_model.R") - main() # Call the main function - cat("✓ Script 03 completed\n") -}, error = function(e) { - cat("✗ Error in Script 03:", e$message, "\n") - pipeline_success <<- FALSE -}) - -# ============================================================================== -# SCRIPT 04: MOSAIC CREATION -# ============================================================================== -cat("\n========== RUNNING SCRIPT 04: MOSAIC CREATION ==========\n") -tryCatch({ - source("r_app/04_mosaic_creation.R") - main() # Call the main function - cat("✓ Script 04 completed\n") -}, error = function(e) { - cat("✗ Error in Script 04:", e$message, "\n") - pipeline_success <<- FALSE -}) - -# ============================================================================== -# SCRIPT 09: CALCULATE KPIs -# ============================================================================== -cat("\n========== RUNNING SCRIPT 09: CALCULATE KPIs ==========\n") -tryCatch({ - source("r_app/09_calculate_kpis.R") - main() # Call the main function - cat("✓ Script 09 (KPIs) completed\n") -}, error = function(e) { - cat("✗ Error in Script 09 (KPIs):", e$message, "\n") - pipeline_success <<- FALSE -}) - -# 
============================================================================== -# SCRIPT 09: FIELD ANALYSIS WEEKLY -# ============================================================================== -# Only run field analysis weekly for angata project -if (project_dir == "angata") { - cat("\n========== RUNNING SCRIPT 09: FIELD ANALYSIS WEEKLY ==========\n") - tryCatch({ - source("r_app/09_field_analysis_weekly.R") - main() # Call the main function - cat("✓ Script 09 (Weekly) completed\n") - }, error = function(e) { - cat("✗ Error in Script 09 (Weekly):", e$message, "\n") - pipeline_success <<- FALSE - }) +# Check Script 10 outputs (tiled splits) +tiles_dir <- file.path("laravel_app", "storage", "app", project_dir, "daily_tiles_split", "5x5") +tiles_dates <- if (dir.exists(tiles_dir)) { + list.dirs(tiles_dir, full.names = FALSE, recursive = FALSE) } else { - cat("\n========== SKIPPING SCRIPT 09: FIELD ANALYSIS WEEKLY (only runs for angata) ==========\n") + c() } +cat(sprintf("Script 10: %d dates already tiled\n", length(tiles_dates))) + +# Check Script 20 outputs (CI extraction) - daily RDS files +ci_daily_dir <- file.path("laravel_app", "storage", "app", project_dir, "Data", "extracted_ci", "daily_vals") +ci_files <- if (dir.exists(ci_daily_dir)) { + list.files(ci_daily_dir, pattern = "\\.rds$") +} else { + c() +} +cat(sprintf("Script 20: %d CI daily RDS files exist\n", length(ci_files))) + +# Check Script 21 outputs (CSV conversion) - note: this gets overwritten each time, so we don't skip based on this +# Instead, check if CI RDS files exist - if they do, 21 should also run +# For now, just note that CSV is time-dependent, not a good skip indicator +cat("Script 21: CSV file exists but gets overwritten - will run if Script 20 runs\n") + +# Check Script 40 outputs (mosaics in weekly_tile_max/5x5) +mosaic_dir <- file.path("laravel_app", "storage", "app", project_dir, "weekly_tile_max", "5x5") +mosaic_files <- if (dir.exists(mosaic_dir)) { + list.files(mosaic_dir, 
pattern = "\\.tif$") +} else { + c() +} +cat(sprintf("Script 40: %d mosaic files exist\n", length(mosaic_files))) + +# Check Script 80 outputs (KPIs in reports/kpis/field_stats) +kpi_dir <- file.path("laravel_app", "storage", "app", project_dir, "reports", "kpis", "field_stats") +kpi_files <- if (dir.exists(kpi_dir)) { + list.files(kpi_dir, pattern = "\\.csv$|\\.json$") +} else { + c() +} +cat(sprintf("Script 80: %d KPI files exist\n", length(kpi_files))) + +# Determine if scripts should run based on outputs +skip_10 <- length(tiles_dates) > 0 && !force_rerun +skip_20 <- length(ci_files) > 0 && !force_rerun +skip_21 <- length(ci_files) > 0 && !force_rerun # Skip 21 if 20 is skipped +skip_40 <- length(mosaic_files) > 0 && !force_rerun +skip_80 <- FALSE # Always run Script 80 - it calculates KPIs for the current week (end_date), not historical weeks + +cat("\nSkipping decisions:\n") +cat(sprintf(" Script 10: %s\n", if(skip_10) "SKIP (tiles exist)" else "RUN")) +cat(sprintf(" Script 20: %s\n", if(skip_20) "SKIP (CI exists)" else "RUN")) +cat(sprintf(" Script 21: %s\n", if(skip_21) "SKIP (CI exists)" else "RUN")) +cat(sprintf(" Script 40: %s\n", if(skip_40) "SKIP (mosaics exist)" else "RUN")) +cat(sprintf(" Script 80: %s\n", if(skip_80) "SKIP (KPIs exist)" else "RUN")) # ============================================================================== -# SCRIPT 91: CI REPORT ANGATA (only for angata) +# PYTHON: DOWNLOAD PLANET IMAGES (MISSING DATES ONLY) # ============================================================================== -if (project_dir == "angata") { - cat("\n========== RUNNING SCRIPT 91: CI REPORT ANGATA ==========\n") - if (pipeline_success) { - tryCatch({ - rmarkdown::render("r_app/91_CI_report_with_kpis_Angata.Rmd", - output_format = "word_document", - params = list(data_dir = project_dir, report_date = end_date_str)) - cat("✓ Script 91 (Report) completed\n") - }, error = function(e) { - cat("✗ Error in Script 91 (Report):", e$message, "\n") - }) - } 
else { - cat("✗ Skipping Script 91: Previous pipeline scripts failed\n") +cat("\n========== DOWNLOADING PLANET IMAGES (MISSING DATES ONLY) ==========\n") +tryCatch({ + # Setup paths + base_path <- file.path("laravel_app", "storage", "app", project_dir) + merged_tifs_dir <- file.path(base_path, data_source) + + # Get existing dates from raw TIFFs + existing_tiff_files <- list.files(merged_tifs_dir, pattern = "^\\d{4}-\\d{2}-\\d{2}\\.tif$") + existing_tiff_dates <- sub("\\.tif$", "", existing_tiff_files) + + # Get existing dates from tiles (better indicator of completion) + existing_tile_dates <- tiles_dates + + # Find missing dates in the window + start_date <- end_date - offset + date_seq <- seq(start_date, end_date, by = "day") + target_dates <- format(date_seq, "%Y-%m-%d") + + # Only download if tiles don't exist yet (more reliable than checking raw TIFFs) + missing_dates <- target_dates[!(target_dates %in% existing_tile_dates)] + + cat(sprintf(" Existing tiled dates: %d\n", length(existing_tile_dates))) + cat(sprintf(" Missing dates in window: %d\n", length(missing_dates))) + + # Download each missing date + download_count <- 0 + download_failed <- 0 + + if (length(missing_dates) > 0) { + # Save current directory + original_dir <- getwd() + + # Change to python_app directory so relative paths work correctly + setwd("python_app") + + for (date_str in missing_dates) { + cmd <- sprintf('python 00_download_8band_pu_optimized.py "%s" --date "%s" --resolution 3 --cleanup', project_dir, date_str) + result <- system(cmd, ignore.stdout = FALSE, ignore.stderr = FALSE) + if (result == 0) { + download_count <- download_count + 1 + } else { + download_failed <- download_failed + 1 + } } + + # Change back to original directory + setwd(original_dir) + } + + cat(sprintf("✓ Downloaded %d dates, %d failed\n", download_count, download_failed)) + if (download_failed > 0) { + cat("⚠ Some downloads failed, but continuing pipeline\n") + } + + # Force Script 10 to run ONLY if downloads 
actually succeeded (not just attempted) + if (download_count > 0) { + skip_10 <- FALSE + } + +}, error = function(e) { + cat("✗ Error in planet download:", e$message, "\n") + pipeline_success <<- FALSE +}) + +# ============================================================================== +# SCRIPT 10: CREATE MASTER GRID AND SPLIT TIFFs +# ============================================================================== +if (pipeline_success && !skip_10) { + cat("\n========== RUNNING SCRIPT 10: CREATE MASTER GRID AND SPLIT TIFFs ==========\n") + tryCatch({ + # Set environment variables for the script (Script 10 uses these for filtering) + assign("PROJECT", project_dir, envir = .GlobalEnv) + + # Suppress verbose per-date output, show only summary + sink(nullfile()) + source("r_app/10_create_master_grid_and_split_tiffs.R") + sink() + + # Verify output + tiles_dir <- file.path("laravel_app", "storage", "app", project_dir, "daily_tiles_split", "5x5") + if (dir.exists(tiles_dir)) { + subdirs <- list.dirs(tiles_dir, full.names = FALSE, recursive = FALSE) + cat(sprintf("✓ Script 10 completed - created tiles for %d dates\n", length(subdirs))) + } else { + cat("✓ Script 10 completed\n") + } + }, error = function(e) { + sink() + cat("✗ Error in Script 10:", e$message, "\n") + pipeline_success <<- FALSE + }) +} else if (skip_10) { + cat("\n========== SKIPPING SCRIPT 10 (tiles already exist) ==========\n") } # ============================================================================== -# SCRIPT 10: CI REPORT (SIMPLE) +# SCRIPT 20: CI EXTRACTION # ============================================================================== -# Only run CI report for non-angata projects +if (pipeline_success && !skip_20) { + cat("\n========== RUNNING SCRIPT 20: CI EXTRACTION ==========\n") + tryCatch({ + # Set environment variables for the script + assign("end_date", end_date, envir = .GlobalEnv) + assign("offset", offset, envir = .GlobalEnv) + assign("project_dir", project_dir, envir = 
.GlobalEnv) + assign("data_source", data_source, envir = .GlobalEnv) + + source("r_app/20_ci_extraction.R") + main() # Call main() to execute the script with the environment variables + + # Verify CI output was created + ci_daily_dir <- file.path("laravel_app", "storage", "app", project_dir, "Data", "extracted_ci", "daily_vals") + if (dir.exists(ci_daily_dir)) { + files <- list.files(ci_daily_dir, pattern = "\\.rds$") + cat(sprintf("✓ Script 20 completed - generated %d CI files\n", length(files))) + } else { + cat("✓ Script 20 completed\n") + } + }, error = function(e) { + cat("✗ Error in Script 20:", e$message, "\n") + pipeline_success <<- FALSE + }) +} else if (skip_20) { + cat("\n========== SKIPPING SCRIPT 20 (CI already extracted) ==========\n") +} +# ============================================================================== +# SCRIPT 21: CONVERT CI RDS TO CSV +# ============================================================================== +if (pipeline_success && !skip_21) { + cat("\n========== RUNNING SCRIPT 21: CONVERT CI RDS TO CSV ==========\n") + tryCatch({ + # Set environment variables for the script + assign("end_date", end_date, envir = .GlobalEnv) + assign("offset", offset, envir = .GlobalEnv) + assign("project_dir", project_dir, envir = .GlobalEnv) + + source("r_app/21_convert_ci_rds_to_csv.R") + main() # Call main() to execute the script with the environment variables + + # Verify CSV output was created + ci_csv_path <- file.path("laravel_app", "storage", "app", project_dir, "ci_extracted") + if (dir.exists(ci_csv_path)) { + csv_files <- list.files(ci_csv_path, pattern = "\\.csv$") + cat(sprintf("✓ Script 21 completed - converted to %d CSV files\n", length(csv_files))) + } else { + cat("✓ Script 21 completed\n") + } + }, error = function(e) { + cat("✗ Error in Script 21:", e$message, "\n") + pipeline_success <<- FALSE + }) +} else if (skip_21) { + cat("\n========== SKIPPING SCRIPT 21 (CSV already created) ==========\n") +} -if (project_dir != 
"angata") { - cat("\n========== RUNNING SCRIPT 10: CI REPORT SIMPLE ==========\n") - if (pipeline_success) { - tryCatch({ - rmarkdown::render("r_app/10_CI_report_with_kpis_simple.Rmd", - output_format = "word_document", - params = list(data_dir = project_dir, report_date = end_date_str)) - cat("✓ Script 10 (Report) completed\n") - }, error = function(e) { - cat("✗ Error in Script 10 (Report):", e$message, "\n") - }) - } else { - cat("✗ Skipping Script 10: Previous pipeline scripts failed\n") - } - } else { - cat("\n========== SKIPPING SCRIPT 10: CI REPORT SIMPLE (not applicable for angata) ==========\n") - } +# ============================================================================== +# SCRIPT 30: INTERPOLATE GROWTH MODEL +# ============================================================================== +if (pipeline_success) { + cat("\n========== RUNNING SCRIPT 30: INTERPOLATE GROWTH MODEL ==========\n") + tryCatch({ + # Set environment variables for the script + assign("end_date", end_date, envir = .GlobalEnv) + assign("offset", offset, envir = .GlobalEnv) + assign("project_dir", project_dir, envir = .GlobalEnv) + assign("data_source", data_source, envir = .GlobalEnv) + + source("r_app/30_interpolate_growth_model.R") + main() # Call main() to execute the script with the environment variables + + # Verify interpolated output + growth_dir <- file.path("laravel_app", "storage", "app", project_dir, "growth_model_interpolated") + if (dir.exists(growth_dir)) { + files <- list.files(growth_dir, pattern = "\\.rds$|\\.csv$") + cat(sprintf("✓ Script 30 completed - generated %d growth model files\n", length(files))) + } else { + cat("✓ Script 30 completed\n") + } + }, error = function(e) { + cat("✗ Error in Script 30:", e$message, "\n") + pipeline_success <<- FALSE + }) +} + +# ============================================================================== +# PYTHON 31: HARVEST IMMINENT WEEKLY +# 
==============================================================================
+if (pipeline_success) {
+  cat("\n========== RUNNING PYTHON 31: HARVEST IMMINENT WEEKLY ==========\n")
+  tryCatch({
+    # Run Python script in pytorch_gpu conda environment
+    # Script expects positional project name (not --project flag)
+    # Run from smartcane root so conda can find the environment
+    cmd <- sprintf('conda run -n pytorch_gpu python python_app/31_harvest_imminent_weekly.py %s', project_dir)
+    cat("DEBUG: Running command:", cmd, "\n")
+    result <- system(cmd)
+
+    if (result == 0) {
+      # Verify harvest output - check for THIS WEEK's specific file
+      current_week <- as.numeric(format(end_date, "%V"))
+      current_year <- as.numeric(format(end_date, "%Y"))
+      expected_file <- file.path("laravel_app", "storage", "app", project_dir, "reports", "kpis", "field_stats",
+                                 sprintf("%s_harvest_imminent_week_%02d_%d.csv", project_dir, current_week, current_year))
+
+      if (file.exists(expected_file)) {
+        cat(sprintf("✓ Script 31 completed - generated harvest imminent file for week %02d\n", current_week))
+      } else {
+        cat("✓ Script 31 completed (check if harvest.xlsx is available)\n")
+      }
+    } else {
+      cat("⚠ Script 31 completed with errors (check harvest.xlsx availability)\n")
+    }
+  }, error = function(e) {
+    # NOTE(review): setwd(original_dir) removed - this block never changes the working directory and original_dir may be undefined here, which would mask the real error
+    cat("⚠ Script 31 error:", e$message, "\n")
+  })
+}
+
+# ==============================================================================
+# SCRIPT 40: MOSAIC CREATION
+# ==============================================================================
+if (pipeline_success && !skip_40) {
+  cat("\n========== RUNNING SCRIPT 40: MOSAIC CREATION ==========\n")
+  tryCatch({
+    # Set environment variables for the script
+    assign("end_date", end_date, envir = .GlobalEnv)
+    assign("offset", offset, envir = .GlobalEnv)
+    assign("project_dir", project_dir, envir = .GlobalEnv)
+    assign("data_source", data_source, envir = .GlobalEnv)
+
+    source("r_app/40_mosaic_creation.R")
+    main() # Call 
main() to execute the script with the environment variables + + # Verify mosaic output + mosaic_dir <- file.path("laravel_app", "storage", "app", project_dir, "weekly_tile_max", "5x5") + if (dir.exists(mosaic_dir)) { + files <- list.files(mosaic_dir, pattern = "\\.tif$") + cat(sprintf("✓ Script 40 completed - generated %d mosaic files\n", length(files))) + } else { + cat("✓ Script 40 completed\n") + } + }, error = function(e) { + cat("✗ Error in Script 40:", e$message, "\n") + pipeline_success <<- FALSE + }) +} else if (skip_40) { + cat("\n========== SKIPPING SCRIPT 40 (mosaics already created) ==========\n") +} + +# ============================================================================== +# SCRIPT 80: CALCULATE KPIs +# ============================================================================== +if (pipeline_success) { # Always run Script 80 - it calculates KPIs for the current week + cat("\n========== RUNNING SCRIPT 80: CALCULATE KPIs ==========\n") + tryCatch({ + # Set environment variables for the script (Script 80's main() uses these as fallbacks) + # NOTE: end_date is already a Date, just assign directly without as.Date() + assign("end_date", end_date, envir = .GlobalEnv) + assign("end_date_str", end_date_str, envir = .GlobalEnv) + assign("offset", offset, envir = .GlobalEnv) + assign("project_dir", project_dir, envir = .GlobalEnv) + assign("data_source", data_source, envir = .GlobalEnv) + + source("r_app/80_calculate_kpis.R") + main() # Call main() to execute the script with the environment variables + + # Verify KPI output + kpi_dir <- file.path("laravel_app", "storage", "app", project_dir, "reports", "kpis", "field_stats") + if (dir.exists(kpi_dir)) { + files <- list.files(kpi_dir, pattern = "\\.csv$|\\.json$") + cat(sprintf("✓ Script 80 completed - generated %d KPI files\n", length(files))) + } else { + cat("✓ Script 80 completed\n") + } + }, error = function(e) { + cat("✗ Error in Script 80:", e$message, "\n") + cat("Full error:\n") + print(e) + 
pipeline_success <<- FALSE + }) +} # ============================================================================== # SUMMARY @@ -154,4 +396,9 @@ cat("\n========== PIPELINE COMPLETE ==========\n") cat(sprintf("Project: %s\n", project_dir)) cat(sprintf("End Date: %s\n", end_date_str)) cat(sprintf("Offset: %d days\n", offset)) -cat("Scripts executed: 02, 03, 04, 09 (KPIs), 09 (Weekly), 10 (CI Report)\n") +if (pipeline_success) { + cat("Status: ✓ All scripts completed successfully\n") +} else { + cat("Status: ✗ Pipeline failed - check errors above\n") +} +cat("Pipeline sequence: Python Download → R 10 → R 20 → R 21 → R 30 → Python 31 → R 40 → R 80\n")