updated code to fit in workflow better
This commit is contained in:
parent
d1303dd218
commit
cb63cf00b8
|
|
@ -31,7 +31,7 @@ Examples:
|
||||||
python download_8band_pu_optimized.py chemba # Uses today's date
|
python download_8band_pu_optimized.py chemba # Uses today's date
|
||||||
python download_8band_pu_optimized.py xinavane --clear-singles --cleanup
|
python download_8band_pu_optimized.py xinavane --clear-singles --cleanup
|
||||||
python download_8band_pu_optimized.py angata --clear-all --resolution 5
|
python download_8band_pu_optimized.py angata --clear-all --resolution 5
|
||||||
|
|
||||||
Cost Model:
|
Cost Model:
|
||||||
- 4-band uint16 with cloud masking: ~50% lower cost than 9-band FLOAT32
|
- 4-band uint16 with cloud masking: ~50% lower cost than 9-band FLOAT32
|
||||||
- Reduced bbox sizes: ~10-20% lower cost due to smaller average tile size
|
- Reduced bbox sizes: ~10-20% lower cost due to smaller average tile size
|
||||||
|
|
@ -39,6 +39,18 @@ Cost Model:
|
||||||
- Requests: Slightly higher (~50-60 tiles) but within 700k budget
|
- Requests: Slightly higher (~50-60 tiles) but within 700k budget
|
||||||
|
|
||||||
Expected result: ~75% PU savings with dynamic geometry-fitted grid
|
Expected result: ~75% PU savings with dynamic geometry-fitted grid
|
||||||
|
|
||||||
|
Example running it in powershell:
|
||||||
|
$startDate = [DateTime]::ParseExact("2025-11-01", "yyyy-MM-dd", $null)
|
||||||
|
$endDate = [DateTime]::ParseExact("2025-12-24", "yyyy-MM-dd", $null)
|
||||||
|
|
||||||
|
$current = $startDate
|
||||||
|
while ($current -le $endDate) {
|
||||||
|
$dateStr = $current.ToString("yyyy-MM-dd")
|
||||||
|
Write-Host "Downloading $dateStr..."
|
||||||
|
python download_8band_pu_optimized.py angata --date $dateStr
|
||||||
|
$current = $current.AddDays(1)
|
||||||
|
}
|
||||||
"""
|
"""
|
||||||
|
|
||||||
import os
|
import os
|
||||||
111
python_app/01_harvest_baseline_prediction.py
Normal file
111
python_app/01_harvest_baseline_prediction.py
Normal file
|
|
@ -0,0 +1,111 @@
|
||||||
|
"""
|
||||||
|
Script: 01_harvest_baseline_prediction.py
|
||||||
|
Purpose: BASELINE PREDICTION - Run ONCE to establish harvest date baseline for all fields and seasons
|
||||||
|
|
||||||
|
This script processes COMPLETE historical CI data (all available dates) and uses Model 307
|
||||||
|
to predict ALL harvest dates across the entire dataset. This becomes your reference baseline
|
||||||
|
for monitoring and comparisons going forward.
|
||||||
|
|
||||||
|
RUN FREQUENCY: Once during initial setup
|
||||||
|
INPUT: ci_data_for_python.csv (complete historical CI data from 02b_convert_rds_to_csv.R)
|
||||||
|
Location: laravel_app/storage/app/{project}/Data/extracted_ci/ci_data_for_python/ci_data_for_python.csv
|
||||||
|
OUTPUT: harvest_production_export.xlsx (baseline harvest predictions for all fields/seasons)
|
||||||
|
|
||||||
|
Workflow:
|
||||||
|
1. Load ci_data_for_python.csv (daily interpolated, all historical dates)
|
||||||
|
2. Group data by field and season (Model 307 detects season boundaries internally)
|
||||||
|
3. Run two-step harvest detection (Phase 1: fast detection, Phase 2: ±40 day refinement)
|
||||||
|
4. Export harvest_production_export.xlsx with columns:
|
||||||
|
- field, sub_field, season, year, season_start_date, season_end_date, phase1_harvest_date
|
||||||
|
|
||||||
|
Two-Step Detection Algorithm:
|
||||||
|
Phase 1 (Growing Window): Expands daily, checks when detected_prob > 0.5 for 3 consecutive days
|
||||||
|
Phase 2 (Refinement): Extracts ±40 day window, finds peak harvest signal with argmax
|
||||||
|
|
||||||
|
This is your GROUND TRUTH - compare all future predictions against this baseline.
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
python 01_harvest_baseline_prediction.py [project_name]
|
||||||
|
|
||||||
|
Examples:
|
||||||
|
python 01_harvest_baseline_prediction.py angata
|
||||||
|
python 01_harvest_baseline_prediction.py esa
|
||||||
|
python 01_harvest_baseline_prediction.py chemba
|
||||||
|
|
||||||
|
If no project specified, defaults to 'angata'
|
||||||
|
"""
|
||||||
|
|
||||||
|
import pandas as pd
|
||||||
|
import numpy as np
|
||||||
|
import torch
|
||||||
|
import sys
|
||||||
|
from pathlib import Path
|
||||||
|
from harvest_date_pred_utils import (
|
||||||
|
load_model_and_config,
|
||||||
|
extract_features,
|
||||||
|
run_two_step_refinement,
|
||||||
|
build_production_harvest_table
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def main():
|
||||||
|
# Get project name from command line or use default
|
||||||
|
project_name = sys.argv[1] if len(sys.argv) > 1 else "angata"
|
||||||
|
|
||||||
|
# Construct paths
|
||||||
|
base_storage = Path("../laravel_app/storage/app") / project_name / "Data"
|
||||||
|
ci_data_dir = base_storage / "extracted_ci" / "ci_data_for_python"
|
||||||
|
CI_DATA_FILE = ci_data_dir / "ci_data_for_python.csv"
|
||||||
|
harvest_data_dir = base_storage / "HarvestData"
|
||||||
|
harvest_data_dir.mkdir(parents=True, exist_ok=True) # Create if doesn't exist
|
||||||
|
OUTPUT_XLSX = harvest_data_dir / "harvest_production_export.xlsx"
|
||||||
|
MODEL_DIR = Path(".") # Model files in python_app/
|
||||||
|
|
||||||
|
# Check if input exists
|
||||||
|
if not CI_DATA_FILE.exists():
|
||||||
|
print(f"ERROR: {CI_DATA_FILE} not found")
|
||||||
|
print(f" Expected at: {CI_DATA_FILE.resolve()}")
|
||||||
|
print(f"\n Run 02b_convert_rds_to_csv.R first to generate this file:")
|
||||||
|
print(f" Rscript r_app/02b_convert_ci_rds_to_csv.R {project_name}")
|
||||||
|
return
|
||||||
|
|
||||||
|
print("="*80)
|
||||||
|
print(f"HARVEST DATE PREDICTION - LSTM MODEL 307 ({project_name})")
|
||||||
|
print("="*80)
|
||||||
|
|
||||||
|
# [1/4] Load model
|
||||||
|
print("\n[1/4] Loading Model 307...")
|
||||||
|
model, config, scalers = load_model_and_config(MODEL_DIR)
|
||||||
|
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
||||||
|
print(f" Device: {device}")
|
||||||
|
|
||||||
|
# [2/4] Load and prepare CI data
|
||||||
|
print("\n[2/4] Loading CI data...")
|
||||||
|
print(f" From: {CI_DATA_FILE}")
|
||||||
|
ci_data = pd.read_csv(CI_DATA_FILE)
|
||||||
|
ci_data['Date'] = pd.to_datetime(ci_data['Date'])
|
||||||
|
print(f" Loaded {len(ci_data)} daily rows across {ci_data['field'].nunique()} fields")
|
||||||
|
print(f" Date range: {ci_data['Date'].min().date()} to {ci_data['Date'].max().date()}")
|
||||||
|
|
||||||
|
# [3/4] Run model predictions with two-step detection
|
||||||
|
print("\n[3/4] Running two-step harvest detection...")
|
||||||
|
refined_results = run_two_step_refinement(ci_data, model, config, scalers, device=device)
|
||||||
|
|
||||||
|
# Build and export
|
||||||
|
print("\nBuilding production harvest table...")
|
||||||
|
prod_table = build_production_harvest_table(refined_results)
|
||||||
|
|
||||||
|
prod_table.to_excel(OUTPUT_XLSX, index=False)
|
||||||
|
print(f"\n✓ Exported {len(prod_table)} predictions to {OUTPUT_XLSX}")
|
||||||
|
print(f"\nOutput location: {OUTPUT_XLSX.resolve()}")
|
||||||
|
print(f"\nStorage structure:")
|
||||||
|
print(f" Input: laravel_app/storage/app/{project_name}/Data/extracted_ci/ci_data_for_python/")
|
||||||
|
print(f" Output: laravel_app/storage/app/{project_name}/Data/HarvestData/")
|
||||||
|
print(f"\nColumn structure:")
|
||||||
|
print(f" field, sub_field, season, year, season_start_date, season_end_date, phase1_harvest_date")
|
||||||
|
print(f"\nNext steps:")
|
||||||
|
print(f" 1. Review baseline predictions in harvest_production_export.xlsx")
|
||||||
|
print(f" 2. Run weekly monitoring: python 02_harvest_imminent_weekly.py {project_name}")
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
348
python_app/02_harvest_imminent_weekly.py
Normal file
348
python_app/02_harvest_imminent_weekly.py
Normal file
|
|
@ -0,0 +1,348 @@
|
||||||
|
"""
|
||||||
|
Script: 02_harvest_imminent_weekly.py
|
||||||
|
Purpose: WEEKLY MONITORING - Run WEEKLY/DAILY to get real-time harvest status for all fields
|
||||||
|
|
||||||
|
This script runs on RECENT CI data (typically last 300 days) to predict whether each field
|
||||||
|
is approaching harvest. Use this for operational decision-making and real-time alerts.
|
||||||
|
|
||||||
|
RUN FREQUENCY: Weekly (or daily if required)
|
||||||
|
INPUT:
|
||||||
|
- ci_data_for_python.csv (recent CI data from 02b_convert_rds_to_csv.R)
|
||||||
|
Location: laravel_app/storage/app/{project}/Data/extracted_ci/ci_data_for_python/ci_data_for_python.csv
|
||||||
|
- harvest_production_export.xlsx (baseline from script 01 - optional, for reference)
|
||||||
|
OUTPUT:
|
||||||
|
- harvest_imminent_weekly.csv (weekly probabilities: field, imminent_prob, detected_prob, week, year)
|
||||||
|
|
||||||
|
Workflow:
|
||||||
|
1. Load harvest_production_export.xlsx (baseline dates - optional, for context)
|
||||||
|
2. Load ci_data_for_python.csv (recent CI data)
|
||||||
|
3. For each field, extract last 300 days of history
|
||||||
|
4. Run Model 307 inference on full sequence (last timestep probabilities)
|
||||||
|
5. Export harvest_imminent_weekly.csv with probabilities
|
||||||
|
|
||||||
|
Output Columns:
|
||||||
|
- field: Field ID
|
||||||
|
- sub_field: Sub-field identifier
|
||||||
|
- imminent_prob: Probability field will be harvestable in next 28 days (0.0-1.0)
|
||||||
|
- detected_prob: Probability field is currently being harvested (0.0-1.0)
|
||||||
|
- week: ISO week number
|
||||||
|
- year: Year
|
||||||
|
- as_of_date: Latest date in dataset
|
||||||
|
- num_days: Number of days of history used
|
||||||
|
|
||||||
|
Use Cases:
|
||||||
|
- Alert when imminent_prob > 0.7 (prepare harvest operations)
|
||||||
|
- Alert when detected_prob > 0.6 (field is being harvested)
|
||||||
|
- Track trends over weeks to validate baseline predictions
|
||||||
|
- Feed into 09b script for weekly dashboard reports
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
python 02_harvest_imminent_weekly.py [project_name]
|
||||||
|
|
||||||
|
Examples:
|
||||||
|
python 02_harvest_imminent_weekly.py angata
|
||||||
|
python 02_harvest_imminent_weekly.py esa
|
||||||
|
python 02_harvest_imminent_weekly.py chemba
|
||||||
|
|
||||||
|
If no project specified, defaults to 'angata'
|
||||||
|
"""
|
||||||
|
|
||||||
|
import pandas as pd
|
||||||
|
import numpy as np
|
||||||
|
import torch
|
||||||
|
import subprocess
|
||||||
|
import sys
|
||||||
|
from pathlib import Path
|
||||||
|
from datetime import datetime, timedelta
|
||||||
|
from harvest_date_pred_utils import (
|
||||||
|
load_model_and_config,
|
||||||
|
extract_features,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def load_harvest_dates(harvest_file):
|
||||||
|
"""Load latest harvest end dates from Excel file (from harvest_production_export.xlsx)."""
|
||||||
|
print("[1/5] Loading harvest dates...")
|
||||||
|
|
||||||
|
if not Path(harvest_file).exists():
|
||||||
|
print(f" ERROR: {harvest_file} not found")
|
||||||
|
print(" Using 180-day lookback as default")
|
||||||
|
return None
|
||||||
|
|
||||||
|
try:
|
||||||
|
harvest_df = pd.read_excel(harvest_file)
|
||||||
|
print(f" Loaded {len(harvest_df)} field-season records")
|
||||||
|
|
||||||
|
# Use season_end_date column (output from harvest prediction script)
|
||||||
|
harvest_df['season_end_date'] = pd.to_datetime(harvest_df['season_end_date'])
|
||||||
|
|
||||||
|
# Group by field and get the latest season_end_date
|
||||||
|
harvest_dates = {}
|
||||||
|
for field_id, group in harvest_df.groupby('field'):
|
||||||
|
latest_end = group['season_end_date'].max()
|
||||||
|
harvest_dates[str(field_id).strip()] = latest_end
|
||||||
|
|
||||||
|
print(f" Successfully mapped {len(harvest_dates)} fields")
|
||||||
|
print(f" Harvest end dates range: {min(harvest_dates.values()).date()} to {max(harvest_dates.values()).date()}")
|
||||||
|
return harvest_dates
|
||||||
|
except Exception as e:
|
||||||
|
print(f" ERROR loading harvest file: {e}")
|
||||||
|
print(f" Using 180-day lookback instead")
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def run_rds_to_csv_conversion():
|
||||||
|
"""Run R script to convert RDS to CSV."""
|
||||||
|
print("\n[2/5] Converting RDS to CSV (daily interpolation)...")
|
||||||
|
r_script = Path("02b_convert_rds_to_csv.R")
|
||||||
|
|
||||||
|
if not r_script.exists():
|
||||||
|
print(f" ERROR: {r_script} not found")
|
||||||
|
return False
|
||||||
|
|
||||||
|
# Use full path to Rscript on Windows
|
||||||
|
rscript_exe = r"C:\Program Files\R\R-4.4.3\bin\x64\Rscript.exe"
|
||||||
|
|
||||||
|
try:
|
||||||
|
result = subprocess.run(
|
||||||
|
[rscript_exe, str(r_script)],
|
||||||
|
capture_output=True,
|
||||||
|
text=True,
|
||||||
|
timeout=300
|
||||||
|
)
|
||||||
|
|
||||||
|
if result.returncode != 0:
|
||||||
|
print(f" ERROR running R script:\n{result.stderr}")
|
||||||
|
return False
|
||||||
|
|
||||||
|
# Show last few lines of output
|
||||||
|
lines = result.stdout.strip().split('\n')
|
||||||
|
for line in lines[-5:]:
|
||||||
|
if line.strip():
|
||||||
|
print(f" {line}")
|
||||||
|
|
||||||
|
return True
|
||||||
|
except Exception as e:
|
||||||
|
print(f" ERROR: {e}")
|
||||||
|
return False
|
||||||
|
|
||||||
|
|
||||||
|
def load_ci_data(csv_file):
|
||||||
|
"""Load CI data."""
|
||||||
|
print("\n[3/5] Loading CI data...")
|
||||||
|
|
||||||
|
if not Path(csv_file).exists():
|
||||||
|
print(f" ERROR: {csv_file} not found")
|
||||||
|
return None
|
||||||
|
|
||||||
|
ci_data = pd.read_csv(csv_file)
|
||||||
|
ci_data['Date'] = pd.to_datetime(ci_data['Date'])
|
||||||
|
|
||||||
|
print(f" Loaded {len(ci_data)} daily rows for {ci_data['field'].nunique()} fields")
|
||||||
|
print(f" Date range: {ci_data['Date'].min().date()} to {ci_data['Date'].max().date()}")
|
||||||
|
|
||||||
|
return ci_data
|
||||||
|
|
||||||
|
|
||||||
|
def extract_seasonal_data(field_id, harvest_date, ci_data):
|
||||||
|
"""
|
||||||
|
Extract CI data from harvest date to latest for a specific field.
|
||||||
|
Returns dataframe sorted by date, or None if insufficient data.
|
||||||
|
"""
|
||||||
|
# field_id is int, ci_data['field'] is also int
|
||||||
|
field_data = ci_data[ci_data['field'] == field_id].copy()
|
||||||
|
|
||||||
|
if len(field_data) == 0:
|
||||||
|
return None
|
||||||
|
|
||||||
|
# Filter from harvest date onwards
|
||||||
|
field_data = field_data[field_data['Date'] >= harvest_date].sort_values('Date')
|
||||||
|
|
||||||
|
# Need at least 30 days of data for meaningful inference
|
||||||
|
if len(field_data) < 30:
|
||||||
|
return None
|
||||||
|
|
||||||
|
return field_data
|
||||||
|
|
||||||
|
|
||||||
|
def run_inference_on_season(field_data, model, config, scalers, device, ci_column='FitData'):
|
||||||
|
"""
|
||||||
|
Run Model 307 inference on recent field CI history.
|
||||||
|
Predicts probability that field will be ready to harvest in next 28 days.
|
||||||
|
Uses last timestep from the provided data sequence.
|
||||||
|
Returns (imminent_prob, detected_prob) for prediction.
|
||||||
|
"""
|
||||||
|
try:
|
||||||
|
# Use last 300 days of data for inference (enough history for meaningful patterns,
|
||||||
|
# avoids training data seasonality mismatch)
|
||||||
|
if len(field_data) > 300:
|
||||||
|
field_data = field_data.iloc[-300:]
|
||||||
|
|
||||||
|
# Extract features
|
||||||
|
features_array = extract_features(field_data, config['features'], ci_column)
|
||||||
|
|
||||||
|
if features_array.shape[0] < 10:
|
||||||
|
return None, None
|
||||||
|
|
||||||
|
# Scale features using per-feature scalers (CRITICAL: same as Phase 1 in harvest_date_pred_utils.py)
|
||||||
|
# Scalers is a list of StandardScaler objects, one per feature
|
||||||
|
if scalers and isinstance(scalers, list):
|
||||||
|
for fi, scaler in enumerate(scalers):
|
||||||
|
try:
|
||||||
|
features_array[:, fi] = scaler.transform(features_array[:, fi].reshape(-1, 1)).flatten()
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
|
||||||
|
# Run inference
|
||||||
|
with torch.no_grad():
|
||||||
|
x_tensor = torch.tensor(features_array, dtype=torch.float32).unsqueeze(0).to(device)
|
||||||
|
out_imm, out_det = model(x_tensor)
|
||||||
|
|
||||||
|
# Get last timestep probabilities
|
||||||
|
imminent_prob = out_imm.squeeze(0)[-1].cpu().item()
|
||||||
|
detected_prob = out_det.squeeze(0)[-1].cpu().item()
|
||||||
|
|
||||||
|
return round(imminent_prob, 4), round(detected_prob, 4)
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
return None, None
|
||||||
|
|
||||||
|
|
||||||
|
def main():
|
||||||
|
# Get project name from command line or use default
|
||||||
|
project_name = sys.argv[1] if len(sys.argv) > 1 else "angata"
|
||||||
|
|
||||||
|
# Construct paths
|
||||||
|
base_storage = Path("../laravel_app/storage/app") / project_name / "Data"
|
||||||
|
ci_data_dir = base_storage / "extracted_ci" / "ci_data_for_python"
|
||||||
|
CI_DATA_FILE = ci_data_dir / "ci_data_for_python.csv"
|
||||||
|
harvest_data_dir = base_storage / "HarvestData"
|
||||||
|
BASELINE_FILE = harvest_data_dir / "harvest_production_export.xlsx"
|
||||||
|
OUTPUT_CSV = harvest_data_dir / "harvest_imminent_weekly.csv"
|
||||||
|
harvest_data_dir.mkdir(parents=True, exist_ok=True) # Create if doesn't exist
|
||||||
|
|
||||||
|
print("="*80)
|
||||||
|
print(f"HARVEST IMMINENT PROBABILITY - WEEKLY MONITORING ({project_name})")
|
||||||
|
print("="*80)
|
||||||
|
|
||||||
|
# [1] Load harvest dates (optional - for projects with predictions)
|
||||||
|
harvest_dates = None
|
||||||
|
if BASELINE_FILE.exists():
|
||||||
|
harvest_dates = load_harvest_dates(BASELINE_FILE)
|
||||||
|
else:
|
||||||
|
print("[1/5] Loading harvest dates...")
|
||||||
|
print(f" INFO: {BASELINE_FILE} not found (optional for weekly monitoring)")
|
||||||
|
|
||||||
|
# [2] Load CI data
|
||||||
|
print(f"\n[2/5] Loading CI data...")
|
||||||
|
print(f" From: {CI_DATA_FILE}")
|
||||||
|
|
||||||
|
if not CI_DATA_FILE.exists():
|
||||||
|
print(f" ERROR: {CI_DATA_FILE} not found")
|
||||||
|
print(f" Expected at: {CI_DATA_FILE.resolve()}")
|
||||||
|
print(f"\n Run 02b_convert_rds_to_csv.R first to generate this file:")
|
||||||
|
print(f" Rscript r_app/02b_convert_ci_rds_to_csv.R {project_name}")
|
||||||
|
return
|
||||||
|
|
||||||
|
ci_data = load_ci_data(CI_DATA_FILE)
|
||||||
|
if ci_data is None:
|
||||||
|
print("ERROR: Could not load CI data")
|
||||||
|
return
|
||||||
|
|
||||||
|
# [3] Load model (from python_app directory)
|
||||||
|
print("\n[3/5] Loading Model 307...")
|
||||||
|
model_dir = Path(".") # Current directory is python_app/, contains model.pt, config.json, scalers.pkl
|
||||||
|
model, config, scalers = load_model_and_config(model_dir)
|
||||||
|
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
||||||
|
print(f" Device: {device}")
|
||||||
|
|
||||||
|
# [4] Run inference per field
|
||||||
|
print("\n[4/5] Running seasonal inference...")
|
||||||
|
|
||||||
|
results_list = []
|
||||||
|
ci_column = config['data']['ci_column']
|
||||||
|
|
||||||
|
# Get field metadata
|
||||||
|
field_meta = ci_data.groupby('field').agg({
|
||||||
|
'sub_field': 'first',
|
||||||
|
'Date': 'max'
|
||||||
|
}).reset_index()
|
||||||
|
field_meta.columns = ['field', 'sub_field', 'latest_date']
|
||||||
|
|
||||||
|
count = 0
|
||||||
|
for field_id in ci_data['field'].unique():
|
||||||
|
# Get metadata
|
||||||
|
meta = field_meta[field_meta['field'] == field_id]
|
||||||
|
if len(meta) == 0:
|
||||||
|
continue
|
||||||
|
|
||||||
|
sub_field = meta['sub_field'].iloc[0]
|
||||||
|
latest_date = meta['latest_date'].iloc[0]
|
||||||
|
|
||||||
|
# Use recent CI history (last 300 days from latest available data)
|
||||||
|
field_data = ci_data[ci_data['field'] == field_id].copy()
|
||||||
|
field_data = field_data.sort_values('Date')
|
||||||
|
|
||||||
|
# Keep last 300 days of history for inference
|
||||||
|
if len(field_data) > 300:
|
||||||
|
field_data = field_data.iloc[-300:]
|
||||||
|
|
||||||
|
if len(field_data) < 30:
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Run inference on recent history to predict next 28 days
|
||||||
|
imminent_prob, detected_prob = run_inference_on_season(
|
||||||
|
field_data, model, config, scalers, device, ci_column
|
||||||
|
)
|
||||||
|
|
||||||
|
if imminent_prob is None:
|
||||||
|
continue
|
||||||
|
|
||||||
|
week = int(latest_date.strftime('%V'))
|
||||||
|
year = int(latest_date.strftime('%Y'))
|
||||||
|
|
||||||
|
results_list.append({
|
||||||
|
'field': field_id,
|
||||||
|
'sub_field': sub_field,
|
||||||
|
'imminent_prob': imminent_prob,
|
||||||
|
'detected_prob': detected_prob,
|
||||||
|
'week': week,
|
||||||
|
'year': year,
|
||||||
|
'as_of_date': latest_date,
|
||||||
|
'num_days': len(field_data),
|
||||||
|
})
|
||||||
|
|
||||||
|
count += 1
|
||||||
|
|
||||||
|
print(f" Completed inference for {count} fields")
|
||||||
|
|
||||||
|
# Build output DataFrame
|
||||||
|
df = pd.DataFrame(results_list)
|
||||||
|
df.to_csv(OUTPUT_CSV, index=False)
|
||||||
|
|
||||||
|
print(f"\n[5/5] Exporting results...")
|
||||||
|
print(f"✓ Exported {len(df)} fields to {OUTPUT_CSV}")
|
||||||
|
print(f" Output location: {OUTPUT_CSV.resolve()}")
|
||||||
|
|
||||||
|
if len(df) > 0:
|
||||||
|
print(f"\nSample rows:")
|
||||||
|
print(df[['field', 'sub_field', 'imminent_prob', 'detected_prob', 'num_days', 'week', 'year']].head(15).to_string(index=False))
|
||||||
|
|
||||||
|
# Show alert summary
|
||||||
|
high_imminent = len(df[df['imminent_prob'] > 0.7])
|
||||||
|
high_detected = len(df[df['detected_prob'] > 0.6])
|
||||||
|
print(f"\n⚠ ALERTS:")
|
||||||
|
print(f" Fields with imminent_prob > 0.70: {high_imminent}")
|
||||||
|
print(f" Fields with detected_prob > 0.60: {high_detected}")
|
||||||
|
else:
|
||||||
|
print(f" WARNING: No results exported - check CI data availability")
|
||||||
|
|
||||||
|
print(f"\nStorage structure:")
|
||||||
|
print(f" Input CI: laravel_app/storage/app/{project_name}/Data/extracted_ci/ci_data_for_python/")
|
||||||
|
print(f" Input baseline: laravel_app/storage/app/{project_name}/Data/HarvestData/harvest_production_export.xlsx")
|
||||||
|
print(f" Output: laravel_app/storage/app/{project_name}/Data/HarvestData/")
|
||||||
|
print(f"\nReady to load into 09b field analysis report")
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
|
|
@ -18,6 +18,7 @@ import sys
|
||||||
import json
|
import json
|
||||||
import datetime
|
import datetime
|
||||||
import argparse
|
import argparse
|
||||||
|
import subprocess
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from osgeo import gdal
|
from osgeo import gdal
|
||||||
import time
|
import time
|
||||||
|
|
@ -441,6 +442,7 @@ def get_evalscript():
|
||||||
def main():
|
def main():
|
||||||
print("="*80)
|
print("="*80)
|
||||||
print("PLANET SATELLITE DATA DOWNLOADER - MISSING DATES ONLY")
|
print("PLANET SATELLITE DATA DOWNLOADER - MISSING DATES ONLY")
|
||||||
|
print("Wrapper for 00_download_8band_pu_optimized.py")
|
||||||
print("="*80)
|
print("="*80)
|
||||||
|
|
||||||
config_dict = get_config()
|
config_dict = get_config()
|
||||||
|
|
@ -495,47 +497,45 @@ def main():
|
||||||
print(f" - {date}")
|
print(f" - {date}")
|
||||||
|
|
||||||
if config_dict['dry_run']:
|
if config_dict['dry_run']:
|
||||||
print("\n[DRY-RUN] Would download and merge above dates")
|
print("\n[DRY-RUN] Would download above dates using 00_download_8band_pu_optimized.py")
|
||||||
return 0
|
return 0
|
||||||
|
|
||||||
# Setup BBox list
|
# Download each missing date using the optimized downloader
|
||||||
print(f"\nLoading field geometries...")
|
print(f"\n{'='*80}")
|
||||||
bbox_list = setup_bbox_list(paths['geojson'], resolution=config_dict['resolution'])
|
print(f"Downloading missing dates using optimized script...")
|
||||||
if bbox_list is None:
|
|
||||||
return 1
|
|
||||||
print(f" Created {len(bbox_list)} BBox tiles")
|
|
||||||
|
|
||||||
# Download and merge each missing date
|
|
||||||
print(f"\nDownloading missing dates...")
|
|
||||||
print(f"{'='*80}")
|
print(f"{'='*80}")
|
||||||
|
|
||||||
success_count = 0
|
success_count = 0
|
||||||
for i, slot in enumerate(missing_dates, 1):
|
for i, date_str in enumerate(missing_dates, 1):
|
||||||
print(f"\n[{i}/{len(missing_dates)}] Processing {slot}...")
|
print(f"\n[{i}/{len(missing_dates)}] Downloading {date_str}...")
|
||||||
|
|
||||||
# Check availability
|
# Call 00_download_8band_pu_optimized.py for this date
|
||||||
if not is_image_available(slot, bbox_list, collection_id):
|
cmd = [
|
||||||
print(f" Skipping {slot} - no imagery available")
|
sys.executable,
|
||||||
continue
|
"00_download_8band_pu_optimized.py",
|
||||||
|
config_dict['project'],
|
||||||
|
"--date", date_str,
|
||||||
|
"--resolution", str(config_dict['resolution']),
|
||||||
|
"--cleanup"
|
||||||
|
]
|
||||||
|
|
||||||
# Download for all bboxes
|
try:
|
||||||
print(f" Downloading {len(bbox_list)} tiles...")
|
result = subprocess.run(cmd, check=True, capture_output=False)
|
||||||
for bbox in bbox_list:
|
|
||||||
size = bbox_to_dimensions(bbox, resolution=config_dict['resolution'])
|
|
||||||
download_function(slot, bbox, size, paths['single_images'])
|
|
||||||
|
|
||||||
# Merge
|
|
||||||
print(f" Merging tiles...")
|
|
||||||
if merge_files(slot, paths['single_images'], paths['merged_tifs'], paths['virtual_raster']):
|
|
||||||
success_count += 1
|
success_count += 1
|
||||||
|
print(f" ✓ Successfully downloaded {date_str}")
|
||||||
|
except subprocess.CalledProcessError as e:
|
||||||
|
print(f" ✗ Failed to download {date_str}: {e}")
|
||||||
|
# Continue with next date instead of stopping
|
||||||
|
continue
|
||||||
|
|
||||||
# Summary
|
# Summary
|
||||||
print(f"\n{'='*80}")
|
print(f"\n{'='*80}")
|
||||||
print(f"SUMMARY:")
|
print(f"SUMMARY:")
|
||||||
print(f" Successfully processed: {success_count}/{len(missing_dates)} dates")
|
print(f" Successfully processed: {success_count}/{len(missing_dates)} dates")
|
||||||
print(f" Output folder: {paths['merged_tifs']}")
|
print(f" Output folder: {paths['merged_tifs']}")
|
||||||
|
print(f"{'='*80}")
|
||||||
|
|
||||||
return 0
|
return 0 if success_count == len(missing_dates) else 1
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
sys.exit(main())
|
sys.exit(main())
|
||||||
|
|
|
||||||
482
python_app/harvest_date_pred_utils.py
Normal file
482
python_app/harvest_date_pred_utils.py
Normal file
|
|
@ -0,0 +1,482 @@
|
||||||
|
"""
|
||||||
|
Self-contained utility module for two-step harvest date prediction and Excel export.
|
||||||
|
Includes model architecture, feature engineering, and core prediction logic.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import sys
|
||||||
|
import pandas as pd
|
||||||
|
import numpy as np
|
||||||
|
import torch
|
||||||
|
import torch.nn as nn
|
||||||
|
import pickle
|
||||||
|
import yaml
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Tuple, Dict, Any, List
|
||||||
|
from sklearn.preprocessing import StandardScaler
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# TORCH MODELS (from src/models.py, inlined for self-containment)
|
||||||
|
# ============================================================================
|
||||||
|
|
||||||
|
class HarvestDetectionLSTM(nn.Module):
|
||||||
|
"""Unidirectional LSTM for harvest detection with dual outputs."""
|
||||||
|
def __init__(self, input_size: int, hidden_size: int = 128,
|
||||||
|
num_layers: int = 1, dropout: float = 0.5):
|
||||||
|
super(HarvestDetectionLSTM, self).__init__()
|
||||||
|
self.input_size = input_size
|
||||||
|
self.hidden_size = hidden_size
|
||||||
|
self.num_layers = num_layers
|
||||||
|
|
||||||
|
self.lstm = nn.LSTM(
|
||||||
|
input_size=input_size,
|
||||||
|
hidden_size=hidden_size,
|
||||||
|
num_layers=num_layers,
|
||||||
|
dropout=dropout if num_layers > 1 else 0,
|
||||||
|
bidirectional=False,
|
||||||
|
batch_first=True
|
||||||
|
)
|
||||||
|
|
||||||
|
self.imminent_head = nn.Sequential(
|
||||||
|
nn.Linear(hidden_size, 16),
|
||||||
|
nn.ReLU(),
|
||||||
|
nn.Dropout(dropout),
|
||||||
|
nn.Linear(16, 1),
|
||||||
|
nn.Sigmoid()
|
||||||
|
)
|
||||||
|
|
||||||
|
self.detected_head = nn.Sequential(
|
||||||
|
nn.Linear(hidden_size, 16),
|
||||||
|
nn.ReLU(),
|
||||||
|
nn.Dropout(dropout),
|
||||||
|
nn.Linear(16, 1),
|
||||||
|
nn.Sigmoid()
|
||||||
|
)
|
||||||
|
|
||||||
|
def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
|
||||||
|
lstm_out, _ = self.lstm(x)
|
||||||
|
batch_size, seq_len, hidden_size = lstm_out.shape
|
||||||
|
lstm_flat = lstm_out.reshape(-1, hidden_size)
|
||||||
|
|
||||||
|
imminent_flat = self.imminent_head(lstm_flat).reshape(batch_size, seq_len)
|
||||||
|
detected_flat = self.detected_head(lstm_flat).reshape(batch_size, seq_len)
|
||||||
|
|
||||||
|
return imminent_flat, detected_flat
|
||||||
|
|
||||||
|
|
||||||
|
class HarvestDetectionGRU(nn.Module):
|
||||||
|
"""Unidirectional GRU for harvest detection with dual outputs."""
|
||||||
|
def __init__(self, input_size: int, hidden_size: int = 128,
|
||||||
|
num_layers: int = 1, dropout: float = 0.5):
|
||||||
|
super(HarvestDetectionGRU, self).__init__()
|
||||||
|
self.input_size = input_size
|
||||||
|
self.hidden_size = hidden_size
|
||||||
|
self.num_layers = num_layers
|
||||||
|
|
||||||
|
self.gru = nn.GRU(
|
||||||
|
input_size=input_size,
|
||||||
|
hidden_size=hidden_size,
|
||||||
|
num_layers=num_layers,
|
||||||
|
dropout=dropout if num_layers > 1 else 0,
|
||||||
|
bidirectional=False,
|
||||||
|
batch_first=True
|
||||||
|
)
|
||||||
|
|
||||||
|
self.imminent_head = nn.Sequential(
|
||||||
|
nn.Linear(hidden_size, 16),
|
||||||
|
nn.ReLU(),
|
||||||
|
nn.Dropout(dropout),
|
||||||
|
nn.Linear(16, 1),
|
||||||
|
nn.Sigmoid()
|
||||||
|
)
|
||||||
|
|
||||||
|
self.detected_head = nn.Sequential(
|
||||||
|
nn.Linear(hidden_size, 16),
|
||||||
|
nn.ReLU(),
|
||||||
|
nn.Dropout(dropout),
|
||||||
|
nn.Linear(16, 1),
|
||||||
|
nn.Sigmoid()
|
||||||
|
)
|
||||||
|
|
||||||
|
def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
|
||||||
|
gru_out, _ = self.gru(x)
|
||||||
|
batch_size, seq_len, hidden_size = gru_out.shape
|
||||||
|
gru_flat = gru_out.reshape(-1, hidden_size)
|
||||||
|
|
||||||
|
imminent_flat = self.imminent_head(gru_flat).reshape(batch_size, seq_len)
|
||||||
|
detected_flat = self.detected_head(gru_flat).reshape(batch_size, seq_len)
|
||||||
|
|
||||||
|
return imminent_flat, detected_flat
|
||||||
|
|
||||||
|
|
||||||
|
def create_model(model_type: str, input_size: int, hidden_size: int = 128,
|
||||||
|
num_layers: int = 1, dropout: float = 0.5, device = None) -> nn.Module:
|
||||||
|
"""Create a model from registry."""
|
||||||
|
registry = {'LSTM': HarvestDetectionLSTM, 'GRU': HarvestDetectionGRU}
|
||||||
|
if model_type not in registry:
|
||||||
|
raise ValueError(f"Unknown model type: {model_type}")
|
||||||
|
|
||||||
|
model = registry[model_type](
|
||||||
|
input_size=input_size,
|
||||||
|
hidden_size=hidden_size,
|
||||||
|
num_layers=num_layers,
|
||||||
|
dropout=dropout
|
||||||
|
)
|
||||||
|
|
||||||
|
if device:
|
||||||
|
model = model.to(device)
|
||||||
|
|
||||||
|
# Print model info
|
||||||
|
total_params = sum(p.numel() for p in model.parameters())
|
||||||
|
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
|
||||||
|
print(f"Model: {model_type}")
|
||||||
|
print(f" Input size: {input_size}")
|
||||||
|
print(f" Hidden size: {hidden_size}")
|
||||||
|
print(f" Num layers: {num_layers}")
|
||||||
|
print(f" Dropout: {dropout}")
|
||||||
|
print(f" Total parameters: {total_params:,}")
|
||||||
|
print(f" Trainable parameters: {trainable_params:,}")
|
||||||
|
print(f" Device: {device}")
|
||||||
|
|
||||||
|
return model
|
||||||
|
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# FEATURE ENGINEERING (from src/feature_engineering.py, simplified for inline)
|
||||||
|
# ============================================================================
|
||||||
|
|
||||||
|
def compute_ci_features(ci_series: pd.Series, doy_series: pd.Series = None) -> pd.DataFrame:
    """Compute all CI-based features (state, velocity, acceleration, min/max/range/std/CV)."""
    windows = (7, 14, 21)
    feats = pd.DataFrame(index=ci_series.index)

    # Pre-compute the rolling means once; several feature families reuse them.
    rolling_mean = {w: ci_series.rolling(window=w, min_periods=1).mean() for w in windows}

    # State: raw CI plus trailing moving averages.
    feats['CI_raw'] = ci_series
    for w in windows:
        feats[f'{w}d_MA'] = rolling_mean[w]

    # Velocity: first difference of each moving average (daily cadence assumed).
    for w in windows:
        feats[f'{w}d_velocity'] = rolling_mean[w].diff()

    # Acceleration: second difference of each moving average.
    for w in windows:
        feats[f'{w}d_acceleration'] = rolling_mean[w].diff().diff()

    # Rolling extremes and their spread.
    for w in windows:
        lo = ci_series.rolling(window=w, min_periods=1).min()
        hi = ci_series.rolling(window=w, min_periods=1).max()
        feats[f'{w}d_min'] = lo
        feats[f'{w}d_max'] = hi
        feats[f'{w}d_range'] = hi - lo

    # Dispersion: rolling std and coefficient of variation (epsilon avoids /0).
    for w in windows:
        std = ci_series.rolling(window=w, min_periods=1).std()
        feats[f'{w}d_std'] = std
        feats[f'{w}d_CV'] = std / (rolling_mean[w] + 1e-6)

    # NOTE(review): divisor 450 > 365 — presumably scales a multi-season
    # continuous day counter into roughly [0, 1]; confirm against training.
    if doy_series is not None:
        feats['DOY_normalized'] = doy_series / 450.0

    # diff()/std() warm-up NaNs become 0 so downstream scalers stay finite.
    return feats.fillna(0)
|
||||||
|
|
||||||
|
|
||||||
|
def extract_features(data_df: pd.DataFrame, feature_names: List[str], ci_column: str = 'FitData') -> np.ndarray:
    """Extract and return specified features as numpy array."""
    ci_series = data_df[ci_column].astype(float)

    # Synthetic day counter mod 365, built only when DOY was requested.
    # NOTE(review): this ignores any real DOY column in data_df and assumes
    # one row per day starting at 0 — confirm this matches training data.
    doy_series = None
    if 'DOY_normalized' in feature_names:
        doy_series = pd.Series(range(len(data_df)), index=data_df.index) % 365

    feature_table = compute_ci_features(ci_series, doy_series)

    # Keep only the requested features that actually exist.
    selected = [name for name in feature_names if name in feature_table.columns]
    if not selected:
        raise ValueError(f"No valid features found. Requested: {feature_names}")

    return feature_table[selected].values
|
||||||
|
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# MAIN UTILITY FUNCTIONS
|
||||||
|
# ============================================================================
|
||||||
|
|
||||||
|
def load_model_and_config(model_dir: Path):
    """Load model, config, and scalers from a given directory.

    Probes a fixed list of (config, weights, scalers) filename conventions —
    first under ``model_dir``, then under the current working directory — and
    uses the first triple whose three files all exist.

    Args:
        model_dir: Directory expected to contain the model artifacts.

    Returns:
        Tuple ``(model, config, scalers)``: the eval-mode torch module, the
        parsed config dict, and the unpickled scalers.

    Raises:
        FileNotFoundError: If no complete triple of files is found anywhere.
    """
    cwd = Path.cwd()

    # Try different naming conventions
    candidates = [
        # Standard names
        (model_dir / "config.json", model_dir / "model.pt", model_dir / "scalers.pkl"),
        # Model 307 specific names
        (model_dir / "model_config.json", model_dir / "model_307.pt", model_dir / "model_scalers.pkl"),
        # CWD standard names
        (cwd / "config.json", cwd / "model.pt", cwd / "scalers.pkl"),
        # CWD Model 307 names
        (cwd / "model_config.json", cwd / "model_307.pt", cwd / "model_scalers.pkl"),
    ]

    # First candidate triple with all three files present wins.
    config_file = model_file = scalers_file = None
    for cfg, mdl, scl in candidates:
        if cfg.exists() and mdl.exists() and scl.exists():
            config_file, model_file, scalers_file = cfg, mdl, scl
            print(f"Found model files in: {cfg.parent}")
            break

    if not (config_file and model_file and scalers_file):
        # Collect every missing path across all conventions for the error.
        missing = []
        for cfg, mdl, scl in candidates:
            if not cfg.exists():
                missing.append(str(cfg))
            if not mdl.exists():
                missing.append(str(mdl))
            if not scl.exists():
                missing.append(str(scl))
        raise FileNotFoundError(
            f"Missing model files. Checked multiple locations. Missing: {missing}"
        )

    # yaml.safe_load also parses the .json config (JSON is a subset of YAML).
    with open(config_file) as f:
        config = yaml.safe_load(f)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = create_model(
        model_type=config['model']['type'],
        input_size=len(config['features']),
        hidden_size=config['model']['hidden_size'],
        num_layers=config['model']['num_layers'],
        dropout=config['model']['dropout'],
        device=device
    )

    print(f"Loading weights from: {model_file}")
    # NOTE(review): weights_only=False runs arbitrary pickle code on load —
    # only load checkpoints from trusted sources.
    model.load_state_dict(torch.load(model_file, map_location=device, weights_only=False))
    model.eval()

    # Scalers were fitted at training time; unpickle as-is.
    with open(scalers_file, 'rb') as f:
        scalers = pickle.load(f)

    return model, config, scalers
|
||||||
|
|
||||||
|
|
||||||
|
def load_harvest_data(data_file: Path) -> pd.DataFrame:
    """Load the harvest data CSV and report how many rows were read."""
    print(f"Loading data from: {data_file}")
    frame = pd.read_csv(data_file)
    print(f"Loaded {len(frame)} rows")
    return frame
|
||||||
|
|
||||||
|
|
||||||
|
def run_phase1_growing_window(field_data, model, config, scalers, ci_column, device):
    """
    Phase 1: Growing window detection with threshold crossing.
    Expand window day-by-day, check last timestep's detected_prob.
    When 3 consecutive days have prob > 0.5, harvest detected.

    Args:
        field_data: One field's rows, sorted by 'Date' with a 0..n-1
            positional index.
        model: Torch module returning (imminent, detected) probability tensors.
        config: Loaded config dict; config['features'] selects feature columns.
        scalers: Per-feature fitted scalers applied column-wise (best-effort).
        ci_column: Name of the CI value column fed to feature extraction.
        device: Torch device for inference.

    Returns list of (harvest_date, harvest_idx) tuples.
    """
    harvest_dates = []
    current_pos = 0

    while current_pos < len(field_data):
        consecutive_above_threshold = 0

        # window_end is an ABSOLUTE end index into field_data: the window is
        # field_data.iloc[current_pos:window_end], whose last row is
        # field_data row (window_end - 1).
        for window_end in range(current_pos + 1, len(field_data) + 1):
            window_data = field_data.iloc[current_pos:window_end].copy().reset_index(drop=True)

            try:
                features = extract_features(window_data, config['features'], ci_column=ci_column)

                # Apply scalers column-by-column; best-effort (skip on failure).
                for fi, scaler in enumerate(scalers):
                    try:
                        features[:, fi] = scaler.transform(features[:, fi].reshape(-1, 1)).flatten()
                    except Exception:
                        pass

                # Run model on the growing window.
                with torch.no_grad():
                    x_tensor = torch.tensor(features, dtype=torch.float32).unsqueeze(0).to(device)
                    imminent_probs, detected_probs = model(x_tensor)
                    detected_probs = detected_probs.squeeze(0).cpu().numpy()

                # Only the LAST timestep matters for the growing window.
                last_prob = detected_probs[-1]

                if last_prob > 0.5:
                    consecutive_above_threshold += 1
                else:
                    consecutive_above_threshold = 0

                # Harvest detected: 3 consecutive days above threshold.
                if consecutive_above_threshold >= 3:
                    # Bug fix: window_end is already absolute, so the first of
                    # the 3 confirming days is row (window_end - 3). The old
                    # code added current_pos again, drifting the index (and
                    # potentially indexing past the end of the frame) for every
                    # harvest after the first.
                    harvest_idx = window_end - 3
                    harvest_dates.append((field_data.iloc[harvest_idx]['Date'], harvest_idx))

                    # Resume scanning from the day after the detected onset.
                    current_pos = window_end - 2
                    break

            except Exception:
                # Best-effort: skip windows where feature extraction or
                # inference fails and keep growing the window.
                continue
        else:
            # Window reached the end of the series without a detection: done.
            break

    return harvest_dates
|
||||||
|
|
||||||
|
|
||||||
|
def run_phase2_refinement(field_data, phase1_harvests, model, config, scalers, ci_column, device):
    """
    Phase 2: Refinement with ±40 day window.
    For each Phase 1 harvest, extract window and refine with argmax.

    Falls back to the Phase 1 estimate for any harvest whose window is empty
    or whose model pass raises.

    Returns list of (harvest_date, harvest_idx) tuples.
    """
    refined_harvests = []
    field_data = field_data.sort_values('Date').reset_index(drop=True)

    for i, (phase1_harvest_date, phase1_idx) in enumerate(phase1_harvests):
        try:
            # Season start: beginning of series, or day after previous harvest.
            if i == 0:
                season_start_date = field_data.iloc[0]['Date']
            else:
                prev_harvest_idx = phase1_harvests[i-1][1]
                season_start_idx = prev_harvest_idx + 1
                if season_start_idx >= len(field_data):
                    break
                season_start_date = field_data.iloc[season_start_idx]['Date']

            # Extract ±40 day window around the season.
            window_start_date = season_start_date - pd.Timedelta(days=40)
            window_end_date = phase1_harvest_date + pd.Timedelta(days=40)

            # Dates are sorted ascending, so the FIRST row with
            # Date >= window_start_date opens the window ...
            start_mask = field_data['Date'] >= window_start_date
            window_start_idx = max(0, int(start_mask.idxmax())) if start_mask.any() else 0

            # ... and the LAST row with Date <= window_end_date closes it.
            # Bug fix: the old code used idxmax() on the <= mask, which
            # returns the FIRST True (almost always row 0 on sorted dates),
            # collapsing the refinement window to a single row.
            end_mask = field_data['Date'] <= window_end_date
            if end_mask.any():
                window_end_idx = min(len(field_data), int(end_mask[::-1].idxmax()) + 1)
            else:
                window_end_idx = len(field_data)

            if window_end_idx <= window_start_idx:
                refined_harvests.append((phase1_harvest_date, phase1_idx))
                continue

            window_data = field_data.iloc[window_start_idx:window_end_idx].copy().reset_index(drop=True)

            # Extract features for the full window.
            features = extract_features(window_data, config['features'], ci_column=ci_column)

            # Apply scalers column-by-column; best-effort (skip on failure).
            for fi, scaler in enumerate(scalers):
                try:
                    features[:, fi] = scaler.transform(features[:, fi].reshape(-1, 1)).flatten()
                except Exception:
                    pass

            # Run model once on the full window.
            with torch.no_grad():
                x_tensor = torch.tensor(features, dtype=torch.float32).unsqueeze(0).to(device)
                imminent_probs, detected_probs = model(x_tensor)
                detected_probs = detected_probs.squeeze(0).cpu().numpy()

            # Refined harvest = timestep of peak detection probability.
            refined_idx_in_window = int(np.argmax(detected_probs))
            refined_idx_global = window_start_idx + refined_idx_in_window
            refined_harvest_date = field_data.iloc[refined_idx_global]['Date']

            refined_harvests.append((refined_harvest_date, refined_idx_global))

        except Exception:
            # Fall back to the unrefined Phase 1 estimate.
            refined_harvests.append((phase1_harvest_date, phase1_idx))

    return refined_harvests
|
||||||
|
|
||||||
|
|
||||||
|
def run_two_step_refinement(df: pd.DataFrame, model, config, scalers, device=None):
    """
    Two-step harvest detection for each field:
      1. Phase 1: growing window with 3-day threshold confirmation.
      2. Phase 2: ±40 day refinement with argmax.

    Returns a list of dicts with field, season, season_start_date,
    season_end_date and phase2_harvest_date.
    """
    if device is None:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    ci_column = config['data']['ci_column']

    # Materialise the groups so we can show percentage progress.
    field_groups = list(df.groupby('field'))
    total_fields = len(field_groups)
    results = []
    harvests_found = 0

    print(f" Processing {total_fields} fields...")

    for idx, (field, field_data) in enumerate(field_groups, 1):
        # In-place textual progress bar.
        bar_length = 40
        pct = int((idx / total_fields) * 100)
        filled = int((idx / total_fields) * bar_length)
        bar = "█" * filled + "░" * (bar_length - filled)
        print(f" [{bar}] {pct:3d}% ({idx}/{total_fields} fields)", end='\r')

        field_data = field_data.sort_values('Date').reset_index(drop=True)

        # Phase 1: growing-window detection; skip fields with no harvests.
        phase1_harvests = run_phase1_growing_window(field_data, model, config, scalers, ci_column, device)
        if not phase1_harvests:
            continue

        # Phase 2: ±40 day refinement of each Phase 1 detection.
        phase2_harvests = run_phase2_refinement(field_data, phase1_harvests, model, config, scalers, ci_column, device)

        # One result row per detected season.
        for season_num, (harvest_date, harvest_idx) in enumerate(phase2_harvests):
            if season_num == 0:
                season_start_date = field_data.iloc[0]['Date']
            else:
                season_start_idx = phase2_harvests[season_num - 1][1] + 1
                if season_start_idx >= len(field_data):
                    break
                season_start_date = field_data.iloc[season_start_idx]['Date']

            results.append({
                'field': field,
                'season': season_num + 1,
                'season_start_date': season_start_date,
                'season_end_date': harvest_date,
                'phase2_harvest_date': harvest_date,
            })
            harvests_found += 1

    print()  # Terminate the \r progress line.
    print(f" ✓ Complete: Found {harvests_found} harvest events across {total_fields} fields")

    return results
|
||||||
|
|
||||||
|
|
||||||
|
def build_production_harvest_table(refined_results: List[Dict]) -> pd.DataFrame:
    """
    Build a DataFrame from refined results with columns for production pipeline.
    One row per field/season with season start and end dates (formatted as
    YYYY-MM-DD strings).

    Args:
        refined_results: Dicts from run_two_step_refinement (field, season,
            season_start_date, season_end_date, phase2_harvest_date).

    Returns:
        DataFrame with date columns normalised to 'YYYY-MM-DD'; empty frame
        with the base columns when there are no results.
    """
    if not refined_results:
        print("WARNING: No refined results to build table")
        return pd.DataFrame(columns=['field', 'season', 'season_start_date', 'season_end_date'])

    df = pd.DataFrame(refined_results)

    # Normalise the season boundary dates to plain date strings.
    df['season_start_date'] = pd.to_datetime(df['season_start_date']).dt.strftime('%Y-%m-%d')
    df['season_end_date'] = pd.to_datetime(df['season_end_date']).dt.strftime('%Y-%m-%d')

    # Bug fix: run_two_step_refinement emits 'phase2_harvest_date', but the
    # old code unconditionally formatted 'phase1_harvest_date' and raised
    # KeyError. Format whichever harvest-date columns are actually present.
    for col in ('phase1_harvest_date', 'phase2_harvest_date'):
        if col in df.columns:
            df[col] = pd.to_datetime(df[col]).dt.strftime('%Y-%m-%d')

    print(f"Built production table with {len(df)} field/season combinations")

    return df
|
||||||
BIN
python_app/model_307.pt
Normal file
BIN
python_app/model_307.pt
Normal file
Binary file not shown.
144
python_app/model_config.json
Normal file
144
python_app/model_config.json
Normal file
|
|
@ -0,0 +1,144 @@
|
||||||
|
{
|
||||||
|
"name": "307_dropout02_with_doy",
|
||||||
|
"description": "Production Model 307: LSTM-based harvest detection (Phase 3, minimal regularization)",
|
||||||
|
"model_info": {
|
||||||
|
"type": "LSTM",
|
||||||
|
"architecture": "Unidirectional LSTM with dual output heads (imminent + detected)",
|
||||||
|
"total_parameters": 105120,
|
||||||
|
"input_features": 14,
|
||||||
|
"hidden_units": 256,
|
||||||
|
"output_heads": 2,
|
||||||
|
"training_data": "Historical multi-season CI data from multiple estates",
|
||||||
|
"validation_method": "5-fold cross-validation",
|
||||||
|
"device": "GPU (CUDA) or CPU fallback"
|
||||||
|
},
|
||||||
|
"production_scripts": {
|
||||||
|
"baseline": {
|
||||||
|
"script": "01_harvest_baseline_prediction.py",
|
||||||
|
"frequency": "Run ONCE during setup",
|
||||||
|
"purpose": "Predict all harvest dates (ground truth baseline)",
|
||||||
|
"input": "ci_data_for_python.csv (complete historical data)",
|
||||||
|
"output": "harvest_production_export.xlsx",
|
||||||
|
"time_estimate": "5-30 minutes depending on data volume"
|
||||||
|
},
|
||||||
|
"monitoring": {
|
||||||
|
"script": "02_harvest_imminent_weekly.py",
|
||||||
|
"frequency": "Run WEEKLY (or daily if required)",
|
||||||
|
"purpose": "Real-time harvest status and imminent alerts",
|
||||||
|
"input": "ci_data_for_python.csv (recent data)",
|
||||||
|
"output": "harvest_imminent_weekly.csv",
|
||||||
|
"time_estimate": "1-5 minutes"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"features": [
|
||||||
|
"CI_raw",
|
||||||
|
"7d_MA",
|
||||||
|
"14d_MA",
|
||||||
|
"21d_MA",
|
||||||
|
"7d_velocity",
|
||||||
|
"14d_velocity",
|
||||||
|
"21d_velocity",
|
||||||
|
"7d_min",
|
||||||
|
"14d_min",
|
||||||
|
"21d_min",
|
||||||
|
"7d_std",
|
||||||
|
"14d_std",
|
||||||
|
"21d_std",
|
||||||
|
"DOY_normalized"
|
||||||
|
],
|
||||||
|
"model": {
|
||||||
|
"type": "LSTM",
|
||||||
|
"hidden_size": 256,
|
||||||
|
"num_layers": 1,
|
||||||
|
"dropout": 0.2
|
||||||
|
},
|
||||||
|
"training": {
|
||||||
|
"imminent_days_before": 28,
|
||||||
|
"imminent_days_before_end": 1,
|
||||||
|
"detected_days_after_start": 1,
|
||||||
|
"detected_days_after_end": 21,
|
||||||
|
"k_folds": 5,
|
||||||
|
"num_epochs": 150,
|
||||||
|
"patience": 20,
|
||||||
|
"learning_rate": 0.001,
|
||||||
|
"batch_size": 4
|
||||||
|
},
|
||||||
|
"data": {
|
||||||
|
"csv_path": "../lstm_complete_data.csv",
|
||||||
|
"ci_column": "FitData",
|
||||||
|
"test_fraction": 0.15,
|
||||||
|
"seed": 42
|
||||||
|
},
|
||||||
|
"workflow_instructions": {
|
||||||
|
"overview": "Model 307 uses a two-script approach: baseline setup + weekly monitoring",
|
||||||
|
"step_1_baseline": {
|
||||||
|
"description": "Establish historical harvest date reference for all fields",
|
||||||
|
"script": "01_harvest_baseline_prediction.py",
|
||||||
|
"when": "Run once after setting up CI extraction pipeline",
|
||||||
|
"command": "conda activate python_gpu && python 01_harvest_baseline_prediction.py",
|
||||||
|
"input_data": "ci_data_for_python.csv (all available historical CI data)",
|
||||||
|
"output_file": "harvest_production_export.xlsx (ground truth baseline)",
|
||||||
|
"columns": [
|
||||||
|
"field - Field ID",
|
||||||
|
"sub_field - Sub-field designation",
|
||||||
|
"season - Season number (1, 2, 3...)",
|
||||||
|
"year - Year of harvest",
|
||||||
|
"season_start_date - Start of growing season",
|
||||||
|
"season_end_date - End of season (harvest date)",
|
||||||
|
"phase1_harvest_date - Refined harvest prediction"
|
||||||
|
],
|
||||||
|
"notes": "This becomes your reference - compare all weekly monitoring against this"
|
||||||
|
},
|
||||||
|
"step_2_monitoring": {
|
||||||
|
"description": "Weekly real-time harvest status and imminent alerts",
|
||||||
|
"script": "02_harvest_imminent_weekly.py",
|
||||||
|
"when": "Run every week (e.g., Mondays) or daily if near harvest",
|
||||||
|
"command": "conda activate python_gpu && python 02_harvest_imminent_weekly.py",
|
||||||
|
"input_data": "ci_data_for_python.csv (latest CI data from 02b conversion)",
|
||||||
|
"output_file": "harvest_imminent_weekly.csv",
|
||||||
|
"columns": [
|
||||||
|
"field - Field ID",
|
||||||
|
"sub_field - Sub-field designation",
|
||||||
|
"imminent_prob - Likelihood of harvest readiness in next 28 days (0.0-1.0)",
|
||||||
|
"detected_prob - Current harvest probability (0.0-1.0)",
|
||||||
|
"week - ISO week number",
|
||||||
|
"year - Year",
|
||||||
|
"as_of_date - Latest date in dataset",
|
||||||
|
"num_days - Days of history used"
|
||||||
|
],
|
||||||
|
"alert_thresholds": {
|
||||||
|
"imminent_high": "imminent_prob > 0.7 (prepare harvest)",
|
||||||
|
"imminent_medium": "imminent_prob 0.5-0.7 (monitor closely)",
|
||||||
|
"detected_high": "detected_prob > 0.6 (active harvesting)"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"integration_with_r_pipeline": {
|
||||||
|
"before_model_307": [
|
||||||
|
"Planet 8-band download: download_8band_pu_optimized.ipynb",
|
||||||
|
"CI extraction: 02_ci_extraction.R",
|
||||||
|
"Convert to CSV: 02b_convert_rds_to_csv.R (outputs ci_data_for_python.csv)"
|
||||||
|
],
|
||||||
|
"model_307_here": [
|
||||||
|
"BASELINE: 01_harvest_baseline_prediction.py (run once)",
|
||||||
|
"MONITORING: 02_harvest_imminent_weekly.py (run weekly)"
|
||||||
|
],
|
||||||
|
"after_model_307": [
|
||||||
|
"Field analysis: 09b_field_analysis_weekly.R (reads harvest predictions)",
|
||||||
|
"Reports: 10_CI_report_with_kpis.Rmd (includes harvest status)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"environment_requirements": {
|
||||||
|
"python_env": "python_gpu",
|
||||||
|
"activation": "conda activate python_gpu",
|
||||||
|
"required_packages": [
|
||||||
|
"torch (GPU-enabled)",
|
||||||
|
"pandas",
|
||||||
|
"numpy",
|
||||||
|
"scikit-learn",
|
||||||
|
"pyyaml",
|
||||||
|
"openpyxl"
|
||||||
|
],
|
||||||
|
"gpu": "NVIDIA GPU with CUDA (optional - falls back to CPU if unavailable)"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
BIN
python_app/model_scalers.pkl
Normal file
BIN
python_app/model_scalers.pkl
Normal file
Binary file not shown.
|
|
@ -15,9 +15,113 @@
|
||||||
suppressPackageStartupMessages({
|
suppressPackageStartupMessages({
|
||||||
library(tidyverse)
|
library(tidyverse)
|
||||||
library(lubridate)
|
library(lubridate)
|
||||||
|
library(zoo)
|
||||||
library(here)
|
library(here)
|
||||||
})
|
})
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# HELPER FUNCTIONS
|
||||||
|
# ============================================================================
|
||||||
|
|
||||||
|
#' Convert wide format RDS to long format
#'
#' Every non-id column is treated as a date-named measurement column and
#' melted into one row per (field, sub_field, Date).
#'
#' @param ci_data_wide Tibble with columns: field, sub_field, and dates as columns
#' @return Long format tibble: field, sub_field, Date, FitData
wide_to_long_ci_data <- function(ci_data_wide) {
  long_data <- pivot_longer(
    ci_data_wide,
    cols = -c(field, sub_field),
    names_to = "Date",
    values_to = "FitData",
    values_drop_na = TRUE
  )
  long_data <- mutate(
    long_data,
    Date = as.Date(Date),
    FitData = as.numeric(FitData)
  )
  # Drop values that became NA during the numeric coercion.
  filter(long_data, !is.na(FitData))
}
|
||||||
|
|
||||||
|
#' Create daily interpolated sequences with DOY for each field
#'
#' For each field/sub_field combination, creates a complete daily sequence from
#' the first to the last measurement date, places the raw measurements on their
#' dates, and linearly interpolates FitData across the gaps.
#'
#' NOTE: DOY here is a continuous 1..N day counter within each field's series,
#' not the calendar day-of-year — confirm downstream consumers expect this.
#'
#' @param ci_data_long Long format tibble: field, sub_field, Date, FitData
#' @return Tibble with: field, sub_field, Date, FitData, DOY, value
create_interpolated_daily_sequences <- function(ci_data_long) {
  ci_data_long %>%
    group_by(field, sub_field) %>%
    nest() %>%
    mutate(
      data = map(data, function(df) {
        # Sort measurements by date
        df <- df %>% arrange(Date)

        # Create complete daily sequence from first to last date
        date_seq <- seq(min(df$Date), max(df$Date), by = "day")

        # Build daily dataframe (field/sub_field stay in outer df, not here)
        daily_df <- tibble(
          Date = date_seq,
          value = NA_real_,
          FitData = NA_real_,
          DOY = seq_along(date_seq) # Continuous day counter: 1, 2, 3, ...
        )

        # Fill in actual measurement values. Vectorised with match();
        # replaces the previous O(n^2) per-row which() loop while keeping
        # identical last-write-wins semantics for duplicate dates.
        hit <- match(df$Date, daily_df$Date)
        keep <- !is.na(hit)
        daily_df$value[hit[keep]] <- df$FitData[keep]

        # Interpolate missing dates linearly; leading/trailing NAs are kept.
        daily_df$FitData <- zoo::na.approx(daily_df$value, na.rm = FALSE)

        daily_df
      })
    ) %>%
    unnest(data) %>%
    select(field, sub_field, Date, FitData, DOY, value) %>%
    arrange(field, Date)
}
|
||||||
|
|
||||||
|
#' Validate conversion output
#'
#' Prints summary statistics (field count, row count, date/value ranges,
#' raw vs interpolated counts) for a converted CI dataset.
#'
#' @param ci_data_python Tibble with converted CI data
#' @return Invisibly returns the tibble (for piping)
validate_conversion_output <- function(ci_data_python) {
  date_min <- min(ci_data_python$Date, na.rm = TRUE)
  date_max <- max(ci_data_python$Date, na.rm = TRUE)
  fit_min <- min(ci_data_python$FitData, na.rm = TRUE)
  fit_max <- max(ci_data_python$FitData, na.rm = TRUE)
  n_raw <- sum(!is.na(ci_data_python$value))
  n_interp <- sum(is.na(ci_data_python$value) & !is.na(ci_data_python$FitData))

  cat(sprintf("\nValidation:\n"))
  cat(sprintf(" Unique fields: %d\n", n_distinct(ci_data_python$field)))
  cat(sprintf(" Total daily rows: %d\n", nrow(ci_data_python)))
  cat(sprintf(" Date range: %s to %s\n", date_min, date_max))
  cat(sprintf(" FitData range: %.2f to %.2f\n", fit_min, fit_max))
  cat(sprintf(" Raw measurements: %d\n", n_raw))
  cat(sprintf(" Interpolated values: %d\n", n_interp))

  invisible(ci_data_python)
}
|
||||||
|
|
||||||
|
#' Print next steps message
#'
#' Prints the follow-up checklist for the Python harvest-detection stage.
print_next_steps <- function() {
  steps <- c(
    " 1. Read this CSV file in Python\n",
    " 2. Group by field to identify seasons\n",
    " 3. Run LSTM model to detect harvest dates\n",
    " 4. Save predicted harvest dates to Excel\n",
    " 5. Use output in script 03 for interpolation\n"
  )
  cat("\nNext steps for Python harvest detection:\n")
  cat(steps, sep = "")
}
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# MAIN FUNCTION
|
||||||
|
# ============================================================================
|
||||||
|
|
||||||
main <- function() {
|
main <- function() {
|
||||||
# Process command line arguments
|
# Process command line arguments
|
||||||
args <- commandArgs(trailingOnly = TRUE)
|
args <- commandArgs(trailingOnly = TRUE)
|
||||||
|
|
@ -28,7 +132,7 @@ main <- function() {
|
||||||
} else if (exists("project_dir", envir = .GlobalEnv)) {
|
} else if (exists("project_dir", envir = .GlobalEnv)) {
|
||||||
project_dir <- get("project_dir", envir = .GlobalEnv)
|
project_dir <- get("project_dir", envir = .GlobalEnv)
|
||||||
} else {
|
} else {
|
||||||
project_dir <- "esa"
|
project_dir <- "angata"
|
||||||
}
|
}
|
||||||
|
|
||||||
# Make available globally
|
# Make available globally
|
||||||
|
|
@ -49,9 +153,17 @@ main <- function() {
|
||||||
})
|
})
|
||||||
|
|
||||||
# Define paths
|
# Define paths
|
||||||
ci_data_dir <- here::here("laravel_app", "storage", "app", project_dir, "Data", "extracted_ci", "cumulative_vals")
|
ci_data_source_dir <- here::here("laravel_app", "storage", "app", project_dir, "Data", "extracted_ci", "cumulative_vals")
|
||||||
input_file <- file.path(ci_data_dir, "combined_CI_data.rds")
|
ci_data_output_dir <- here::here("laravel_app", "storage", "app", project_dir, "Data", "extracted_ci", "ci_data_for_python")
|
||||||
output_file <- file.path(ci_data_dir, "ci_data_for_python.csv")
|
|
||||||
|
# Create output directory if it doesn't exist (for new projects)
|
||||||
|
if (!dir.exists(ci_data_output_dir)) {
|
||||||
|
dir.create(ci_data_output_dir, recursive = TRUE, showWarnings = FALSE)
|
||||||
|
cat(sprintf("✓ Created output directory: %s\n", ci_data_output_dir))
|
||||||
|
}
|
||||||
|
|
||||||
|
input_file <- file.path(ci_data_source_dir, "combined_CI_data.rds")
|
||||||
|
output_file <- file.path(ci_data_output_dir, "ci_data_for_python.csv")
|
||||||
|
|
||||||
# Check if input file exists
|
# Check if input file exists
|
||||||
if (!file.exists(input_file)) {
|
if (!file.exists(input_file)) {
|
||||||
|
|
@ -61,52 +173,32 @@ main <- function() {
|
||||||
cat(sprintf("Loading: %s\n", input_file))
|
cat(sprintf("Loading: %s\n", input_file))
|
||||||
|
|
||||||
# Load RDS file
|
# Load RDS file
|
||||||
ci_data <- readRDS(input_file) %>%
|
ci_data_wide <- readRDS(input_file) %>%
|
||||||
as_tibble()
|
as_tibble()
|
||||||
|
|
||||||
cat(sprintf(" Loaded %d rows\n", nrow(ci_data)))
|
cat(sprintf(" Loaded %d rows\n", nrow(ci_data_wide)))
|
||||||
cat(sprintf(" Columns: %s\n", paste(names(ci_data), collapse = ", ")))
|
cat(sprintf(" Format: WIDE (field, sub_field, then dates as columns)\n"))
|
||||||
|
cat(sprintf(" Sample columns: %s\n", paste(names(ci_data_wide)[1:6], collapse = ", ")))
|
||||||
|
|
||||||
# Prepare data for Python
|
# Step 1: Convert from WIDE to LONG format
|
||||||
ci_data_python <- ci_data %>%
|
cat("\nStep 1: Converting from wide to long format...\n")
|
||||||
# Ensure standard column names
|
ci_data_long <- wide_to_long_ci_data(ci_data_wide)
|
||||||
rename(
|
|
||||||
field = field,
|
|
||||||
sub_field = sub_field,
|
|
||||||
Date = Date,
|
|
||||||
FitData = FitData,
|
|
||||||
DOY = DOY
|
|
||||||
) %>%
|
|
||||||
# Add 'value' as an alias for FitData (sometimes needed)
|
|
||||||
mutate(value = FitData) %>%
|
|
||||||
# Keep only necessary columns
|
|
||||||
select(field, sub_field, Date, FitData, DOY, value) %>%
|
|
||||||
# Sort by field and date
|
|
||||||
arrange(field, Date)
|
|
||||||
|
|
||||||
# Validate data
|
# Step 2: Create complete daily sequences with interpolation
|
||||||
cat(sprintf("\nValidation:\n"))
|
cat("\nStep 2: Creating complete daily sequences with interpolation...\n")
|
||||||
cat(sprintf(" Unique fields: %d\n", n_distinct(ci_data_python$field)))
|
ci_data_python <- create_interpolated_daily_sequences(ci_data_long)
|
||||||
cat(sprintf(" Date range: %s to %s\n",
|
|
||||||
min(ci_data_python$Date, na.rm = TRUE),
|
|
||||||
max(ci_data_python$Date, na.rm = TRUE)))
|
|
||||||
cat(sprintf(" FitData range: %.2f to %.2f\n",
|
|
||||||
min(ci_data_python$FitData, na.rm = TRUE),
|
|
||||||
max(ci_data_python$FitData, na.rm = TRUE)))
|
|
||||||
cat(sprintf(" Missing FitData: %d rows\n", sum(is.na(ci_data_python$FitData))))
|
|
||||||
|
|
||||||
# Save to CSV
|
# Step 3: Validate output
|
||||||
cat(sprintf("\nSaving to: %s\n", output_file))
|
cat("\nStep 3: Validating output...")
|
||||||
|
validate_conversion_output(ci_data_python)
|
||||||
|
|
||||||
|
# Step 4: Save to CSV
|
||||||
|
cat(sprintf("\nStep 4: Saving to CSV...\n"))
|
||||||
|
cat(sprintf(" Output: %s\n", output_file))
|
||||||
write_csv(ci_data_python, output_file)
|
write_csv(ci_data_python, output_file)
|
||||||
|
|
||||||
cat(sprintf("✓ Successfully created CSV with %d rows\n", nrow(ci_data_python)))
|
cat(sprintf("\n✓ Successfully created CSV with %d rows\n", nrow(ci_data_python)))
|
||||||
cat("\nNext steps for Python harvest detection:\n")
|
print_next_steps()
|
||||||
cat(" 1. Read this CSV file in Python\n")
|
|
||||||
cat(" 2. Group by field to identify seasons\n")
|
|
||||||
cat(" 3. Run LSTM model to detect harvest dates\n")
|
|
||||||
cat(" 4. Save predicted harvest dates to Excel\n")
|
|
||||||
cat(" 5. Use output in script 03 for interpolation\n")
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if (sys.nframe() == 0) {
|
if (sys.nframe() == 0) {
|
||||||
|
|
|
||||||
1328
webapps/geojson_viewer.html
Normal file
1328
webapps/geojson_viewer.html
Normal file
File diff suppressed because it is too large
Load diff
|
|
@ -212,6 +212,22 @@
|
||||||
<a href="./data_validation_tool/" class="app-btn">Open Tool</a>
|
<a href="./data_validation_tool/" class="app-btn">Open Tool</a>
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
|
<!-- GeoJSON Viewer -->
|
||||||
|
<div class="app-card">
|
||||||
|
<div class="app-icon">📍</div>
|
||||||
|
<div class="app-content">
|
||||||
|
<h2>GeoJSON Viewer</h2>
|
||||||
|
<p>Upload and visualize GeoJSON files on an interactive map with feature properties.</p>
|
||||||
|
<ul class="app-features">
|
||||||
|
<li>Upload GeoJSON files</li>
|
||||||
|
<li>Interactive map view</li>
|
||||||
|
<li>View feature properties</li>
|
||||||
|
<li>Download exports</li>
|
||||||
|
</ul>
|
||||||
|
<a href="./geojson_viewer.html" class="app-btn">Open Viewer</a>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
<footer>
|
<footer>
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue