diff --git a/python_app/download_8band_pu_optimized.py b/python_app/00_download_8band_pu_optimized.py
similarity index 97%
rename from python_app/download_8band_pu_optimized.py
rename to python_app/00_download_8band_pu_optimized.py
index 4fb5c4f..9d2841d 100644
--- a/python_app/download_8band_pu_optimized.py
+++ b/python_app/00_download_8band_pu_optimized.py
@@ -31,7 +31,7 @@ Examples:
python download_8band_pu_optimized.py chemba # Uses today's date
python download_8band_pu_optimized.py xinavane --clear-singles --cleanup
python download_8band_pu_optimized.py angata --clear-all --resolution 5
-
+
Cost Model:
- 4-band uint16 with cloud masking: ~50% lower cost than 9-band FLOAT32
- Reduced bbox sizes: ~10-20% lower cost due to smaller average tile size
@@ -39,6 +39,18 @@ Cost Model:
- Requests: Slightly higher (~50-60 tiles) but within 700k budget
Expected result: ~75% PU savings with dynamic geometry-fitted grid
+
+Example running it in powershell:
+ $startDate = [DateTime]::ParseExact("2025-11-01", "yyyy-MM-dd", $null)
+ $endDate = [DateTime]::ParseExact("2025-12-24", "yyyy-MM-dd", $null)
+
+ $current = $startDate
+ while ($current -le $endDate) {
+ $dateStr = $current.ToString("yyyy-MM-dd")
+ Write-Host "Downloading $dateStr..."
+        python 00_download_8band_pu_optimized.py angata --date $dateStr
+ $current = $current.AddDays(1)
+ }
"""
import os
diff --git a/python_app/01_harvest_baseline_prediction.py b/python_app/01_harvest_baseline_prediction.py
new file mode 100644
index 0000000..7461964
--- /dev/null
+++ b/python_app/01_harvest_baseline_prediction.py
@@ -0,0 +1,111 @@
+"""
+Script: 01_harvest_baseline_prediction.py
+Purpose: BASELINE PREDICTION - Run ONCE to establish harvest date baseline for all fields and seasons
+
+This script processes COMPLETE historical CI data (all available dates) and uses Model 307
+to predict ALL harvest dates across the entire dataset. This becomes your reference baseline
+for monitoring and comparisons going forward.
+
+RUN FREQUENCY: Once during initial setup
+INPUT: ci_data_for_python.csv (complete historical CI data from 02b_convert_rds_to_csv.R)
+ Location: laravel_app/storage/app/{project}/Data/extracted_ci/ci_data_for_python/ci_data_for_python.csv
+OUTPUT: harvest_production_export.xlsx (baseline harvest predictions for all fields/seasons)
+
+Workflow:
+1. Load ci_data_for_python.csv (daily interpolated, all historical dates)
+2. Group data by field and season (Model 307 detects season boundaries internally)
+3. Run two-step harvest detection (Phase 1: fast detection, Phase 2: ±40 day refinement)
+4. Export harvest_production_export.xlsx with columns:
+ - field, sub_field, season, year, season_start_date, season_end_date, phase1_harvest_date
+
+Two-Step Detection Algorithm:
+ Phase 1 (Growing Window): Expands daily, checks when detected_prob > 0.5 for 3 consecutive days
+ Phase 2 (Refinement): Extracts ±40 day window, finds peak harvest signal with argmax
+
+This is your GROUND TRUTH - compare all future predictions against this baseline.
+
+Usage:
+ python 01_harvest_baseline_prediction.py [project_name]
+
+ Examples:
+ python 01_harvest_baseline_prediction.py angata
+ python 01_harvest_baseline_prediction.py esa
+ python 01_harvest_baseline_prediction.py chemba
+
+ If no project specified, defaults to 'angata'
+"""
+
+import pandas as pd
+import numpy as np
+import torch
+import sys
+from pathlib import Path
+from harvest_date_pred_utils import (
+ load_model_and_config,
+ extract_features,
+ run_two_step_refinement,
+ build_production_harvest_table
+)
+
+
+def main():
+ # Get project name from command line or use default
+ project_name = sys.argv[1] if len(sys.argv) > 1 else "angata"
+
+ # Construct paths
+ base_storage = Path("../laravel_app/storage/app") / project_name / "Data"
+ ci_data_dir = base_storage / "extracted_ci" / "ci_data_for_python"
+ CI_DATA_FILE = ci_data_dir / "ci_data_for_python.csv"
+ harvest_data_dir = base_storage / "HarvestData"
+ harvest_data_dir.mkdir(parents=True, exist_ok=True) # Create if doesn't exist
+ OUTPUT_XLSX = harvest_data_dir / "harvest_production_export.xlsx"
+ MODEL_DIR = Path(".") # Model files in python_app/
+
+ # Check if input exists
+ if not CI_DATA_FILE.exists():
+ print(f"ERROR: {CI_DATA_FILE} not found")
+ print(f" Expected at: {CI_DATA_FILE.resolve()}")
+        print(f"\n   Run 02b_convert_ci_rds_to_csv.R first to generate this file:")
+ print(f" Rscript r_app/02b_convert_ci_rds_to_csv.R {project_name}")
+ return
+
+ print("="*80)
+ print(f"HARVEST DATE PREDICTION - LSTM MODEL 307 ({project_name})")
+ print("="*80)
+
+ # [1/4] Load model
+ print("\n[1/4] Loading Model 307...")
+ model, config, scalers = load_model_and_config(MODEL_DIR)
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+ print(f" Device: {device}")
+
+ # [2/4] Load and prepare CI data
+ print("\n[2/4] Loading CI data...")
+ print(f" From: {CI_DATA_FILE}")
+ ci_data = pd.read_csv(CI_DATA_FILE)
+ ci_data['Date'] = pd.to_datetime(ci_data['Date'])
+ print(f" Loaded {len(ci_data)} daily rows across {ci_data['field'].nunique()} fields")
+ print(f" Date range: {ci_data['Date'].min().date()} to {ci_data['Date'].max().date()}")
+
+ # [3/4] Run model predictions with two-step detection
+ print("\n[3/4] Running two-step harvest detection...")
+ refined_results = run_two_step_refinement(ci_data, model, config, scalers, device=device)
+
+ # Build and export
+ print("\nBuilding production harvest table...")
+ prod_table = build_production_harvest_table(refined_results)
+
+ prod_table.to_excel(OUTPUT_XLSX, index=False)
+ print(f"\n✓ Exported {len(prod_table)} predictions to {OUTPUT_XLSX}")
+ print(f"\nOutput location: {OUTPUT_XLSX.resolve()}")
+ print(f"\nStorage structure:")
+ print(f" Input: laravel_app/storage/app/{project_name}/Data/extracted_ci/ci_data_for_python/")
+ print(f" Output: laravel_app/storage/app/{project_name}/Data/HarvestData/")
+ print(f"\nColumn structure:")
+ print(f" field, sub_field, season, year, season_start_date, season_end_date, phase1_harvest_date")
+ print(f"\nNext steps:")
+ print(f" 1. Review baseline predictions in harvest_production_export.xlsx")
+ print(f" 2. Run weekly monitoring: python 02_harvest_imminent_weekly.py {project_name}")
+
+if __name__ == "__main__":
+ main()
diff --git a/python_app/02_harvest_imminent_weekly.py b/python_app/02_harvest_imminent_weekly.py
new file mode 100644
index 0000000..09cf3de
--- /dev/null
+++ b/python_app/02_harvest_imminent_weekly.py
@@ -0,0 +1,348 @@
+"""
+Script: 02_harvest_imminent_weekly.py
+Purpose: WEEKLY MONITORING - Run WEEKLY/DAILY to get real-time harvest status for all fields
+
+This script runs on RECENT CI data (typically last 300 days) to predict whether each field
+is approaching harvest. Use this for operational decision-making and real-time alerts.
+
+RUN FREQUENCY: Weekly (or daily if required)
+INPUT:
+ - ci_data_for_python.csv (recent CI data from 02b_convert_rds_to_csv.R)
+ Location: laravel_app/storage/app/{project}/Data/extracted_ci/ci_data_for_python/ci_data_for_python.csv
+ - harvest_production_export.xlsx (baseline from script 01 - optional, for reference)
+OUTPUT:
+ - harvest_imminent_weekly.csv (weekly probabilities: field, imminent_prob, detected_prob, week, year)
+
+Workflow:
+1. Load harvest_production_export.xlsx (baseline dates - optional, for context)
+2. Load ci_data_for_python.csv (recent CI data)
+3. For each field, extract last 300 days of history
+4. Run Model 307 inference on full sequence (last timestep probabilities)
+5. Export harvest_imminent_weekly.csv with probabilities
+
+Output Columns:
+ - field: Field ID
+ - sub_field: Sub-field identifier
+ - imminent_prob: Probability field will be harvestable in next 28 days (0.0-1.0)
+ - detected_prob: Probability field is currently being harvested (0.0-1.0)
+ - week: ISO week number
+ - year: Year
+ - as_of_date: Latest date in dataset
+ - num_days: Number of days of history used
+
+Use Cases:
+ - Alert when imminent_prob > 0.7 (prepare harvest operations)
+ - Alert when detected_prob > 0.6 (field is being harvested)
+ - Track trends over weeks to validate baseline predictions
+ - Feed into 09b script for weekly dashboard reports
+
+Usage:
+ python 02_harvest_imminent_weekly.py [project_name]
+
+ Examples:
+ python 02_harvest_imminent_weekly.py angata
+ python 02_harvest_imminent_weekly.py esa
+ python 02_harvest_imminent_weekly.py chemba
+
+ If no project specified, defaults to 'angata'
+"""
+
+import pandas as pd
+import numpy as np
+import torch
+import subprocess
+import sys
+from pathlib import Path
+from datetime import datetime, timedelta
+from harvest_date_pred_utils import (
+ load_model_and_config,
+ extract_features,
+)
+
+
+def load_harvest_dates(harvest_file):
+ """Load latest harvest end dates from Excel file (from harvest_production_export.xlsx)."""
+ print("[1/5] Loading harvest dates...")
+
+ if not Path(harvest_file).exists():
+ print(f" ERROR: {harvest_file} not found")
+ print(" Using 180-day lookback as default")
+ return None
+
+ try:
+ harvest_df = pd.read_excel(harvest_file)
+ print(f" Loaded {len(harvest_df)} field-season records")
+
+ # Use season_end_date column (output from harvest prediction script)
+ harvest_df['season_end_date'] = pd.to_datetime(harvest_df['season_end_date'])
+
+ # Group by field and get the latest season_end_date
+ harvest_dates = {}
+ for field_id, group in harvest_df.groupby('field'):
+ latest_end = group['season_end_date'].max()
+ harvest_dates[str(field_id).strip()] = latest_end
+
+ print(f" Successfully mapped {len(harvest_dates)} fields")
+ print(f" Harvest end dates range: {min(harvest_dates.values()).date()} to {max(harvest_dates.values()).date()}")
+ return harvest_dates
+ except Exception as e:
+ print(f" ERROR loading harvest file: {e}")
+ print(f" Using 180-day lookback instead")
+ return None
+
+
+def run_rds_to_csv_conversion():
+ """Run R script to convert RDS to CSV."""
+ print("\n[2/5] Converting RDS to CSV (daily interpolation)...")
+ r_script = Path("02b_convert_rds_to_csv.R")
+
+ if not r_script.exists():
+ print(f" ERROR: {r_script} not found")
+ return False
+
+ # Use full path to Rscript on Windows
+ rscript_exe = r"C:\Program Files\R\R-4.4.3\bin\x64\Rscript.exe"
+
+ try:
+ result = subprocess.run(
+ [rscript_exe, str(r_script)],
+ capture_output=True,
+ text=True,
+ timeout=300
+ )
+
+ if result.returncode != 0:
+ print(f" ERROR running R script:\n{result.stderr}")
+ return False
+
+ # Show last few lines of output
+ lines = result.stdout.strip().split('\n')
+ for line in lines[-5:]:
+ if line.strip():
+ print(f" {line}")
+
+ return True
+ except Exception as e:
+ print(f" ERROR: {e}")
+ return False
+
+
+def load_ci_data(csv_file):
+ """Load CI data."""
+ print("\n[3/5] Loading CI data...")
+
+ if not Path(csv_file).exists():
+ print(f" ERROR: {csv_file} not found")
+ return None
+
+ ci_data = pd.read_csv(csv_file)
+ ci_data['Date'] = pd.to_datetime(ci_data['Date'])
+
+ print(f" Loaded {len(ci_data)} daily rows for {ci_data['field'].nunique()} fields")
+ print(f" Date range: {ci_data['Date'].min().date()} to {ci_data['Date'].max().date()}")
+
+ return ci_data
+
+
+def extract_seasonal_data(field_id, harvest_date, ci_data):
+ """
+ Extract CI data from harvest date to latest for a specific field.
+ Returns dataframe sorted by date, or None if insufficient data.
+ """
+ # field_id is int, ci_data['field'] is also int
+ field_data = ci_data[ci_data['field'] == field_id].copy()
+
+ if len(field_data) == 0:
+ return None
+
+ # Filter from harvest date onwards
+ field_data = field_data[field_data['Date'] >= harvest_date].sort_values('Date')
+
+ # Need at least 30 days of data for meaningful inference
+ if len(field_data) < 30:
+ return None
+
+ return field_data
+
+
+def run_inference_on_season(field_data, model, config, scalers, device, ci_column='FitData'):
+ """
+ Run Model 307 inference on recent field CI history.
+ Predicts probability that field will be ready to harvest in next 28 days.
+ Uses last timestep from the provided data sequence.
+ Returns (imminent_prob, detected_prob) for prediction.
+ """
+ try:
+ # Use last 300 days of data for inference (enough history for meaningful patterns,
+ # avoids training data seasonality mismatch)
+ if len(field_data) > 300:
+ field_data = field_data.iloc[-300:]
+
+ # Extract features
+ features_array = extract_features(field_data, config['features'], ci_column)
+
+ if features_array.shape[0] < 10:
+ return None, None
+
+ # Scale features using per-feature scalers (CRITICAL: same as Phase 1 in harvest_date_pred_utils.py)
+ # Scalers is a list of StandardScaler objects, one per feature
+ if scalers and isinstance(scalers, list):
+ for fi, scaler in enumerate(scalers):
+ try:
+ features_array[:, fi] = scaler.transform(features_array[:, fi].reshape(-1, 1)).flatten()
+ except Exception:
+ pass
+
+ # Run inference
+ with torch.no_grad():
+ x_tensor = torch.tensor(features_array, dtype=torch.float32).unsqueeze(0).to(device)
+ out_imm, out_det = model(x_tensor)
+
+ # Get last timestep probabilities
+ imminent_prob = out_imm.squeeze(0)[-1].cpu().item()
+ detected_prob = out_det.squeeze(0)[-1].cpu().item()
+
+ return round(imminent_prob, 4), round(detected_prob, 4)
+
+ except Exception as e:
+ return None, None
+
+
+def main():
+ # Get project name from command line or use default
+ project_name = sys.argv[1] if len(sys.argv) > 1 else "angata"
+
+ # Construct paths
+ base_storage = Path("../laravel_app/storage/app") / project_name / "Data"
+ ci_data_dir = base_storage / "extracted_ci" / "ci_data_for_python"
+ CI_DATA_FILE = ci_data_dir / "ci_data_for_python.csv"
+ harvest_data_dir = base_storage / "HarvestData"
+ BASELINE_FILE = harvest_data_dir / "harvest_production_export.xlsx"
+ OUTPUT_CSV = harvest_data_dir / "harvest_imminent_weekly.csv"
+ harvest_data_dir.mkdir(parents=True, exist_ok=True) # Create if doesn't exist
+
+ print("="*80)
+ print(f"HARVEST IMMINENT PROBABILITY - WEEKLY MONITORING ({project_name})")
+ print("="*80)
+
+ # [1] Load harvest dates (optional - for projects with predictions)
+ harvest_dates = None
+ if BASELINE_FILE.exists():
+ harvest_dates = load_harvest_dates(BASELINE_FILE)
+ else:
+ print("[1/5] Loading harvest dates...")
+ print(f" INFO: {BASELINE_FILE} not found (optional for weekly monitoring)")
+
+ # [2] Load CI data
+ print(f"\n[2/5] Loading CI data...")
+ print(f" From: {CI_DATA_FILE}")
+
+ if not CI_DATA_FILE.exists():
+ print(f" ERROR: {CI_DATA_FILE} not found")
+ print(f" Expected at: {CI_DATA_FILE.resolve()}")
+        print(f"\n   Run 02b_convert_ci_rds_to_csv.R first to generate this file:")
+ print(f" Rscript r_app/02b_convert_ci_rds_to_csv.R {project_name}")
+ return
+
+ ci_data = load_ci_data(CI_DATA_FILE)
+ if ci_data is None:
+ print("ERROR: Could not load CI data")
+ return
+
+ # [3] Load model (from python_app directory)
+ print("\n[3/5] Loading Model 307...")
+ model_dir = Path(".") # Current directory is python_app/, contains model.pt, config.json, scalers.pkl
+ model, config, scalers = load_model_and_config(model_dir)
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+ print(f" Device: {device}")
+
+ # [4] Run inference per field
+ print("\n[4/5] Running seasonal inference...")
+
+ results_list = []
+ ci_column = config['data']['ci_column']
+
+ # Get field metadata
+ field_meta = ci_data.groupby('field').agg({
+ 'sub_field': 'first',
+ 'Date': 'max'
+ }).reset_index()
+ field_meta.columns = ['field', 'sub_field', 'latest_date']
+
+ count = 0
+ for field_id in ci_data['field'].unique():
+ # Get metadata
+ meta = field_meta[field_meta['field'] == field_id]
+ if len(meta) == 0:
+ continue
+
+ sub_field = meta['sub_field'].iloc[0]
+ latest_date = meta['latest_date'].iloc[0]
+
+ # Use recent CI history (last 300 days from latest available data)
+ field_data = ci_data[ci_data['field'] == field_id].copy()
+ field_data = field_data.sort_values('Date')
+
+ # Keep last 300 days of history for inference
+ if len(field_data) > 300:
+ field_data = field_data.iloc[-300:]
+
+ if len(field_data) < 30:
+ continue
+
+ # Run inference on recent history to predict next 28 days
+ imminent_prob, detected_prob = run_inference_on_season(
+ field_data, model, config, scalers, device, ci_column
+ )
+
+ if imminent_prob is None:
+ continue
+
+ week = int(latest_date.strftime('%V'))
+ year = int(latest_date.strftime('%Y'))
+
+ results_list.append({
+ 'field': field_id,
+ 'sub_field': sub_field,
+ 'imminent_prob': imminent_prob,
+ 'detected_prob': detected_prob,
+ 'week': week,
+ 'year': year,
+ 'as_of_date': latest_date,
+ 'num_days': len(field_data),
+ })
+
+ count += 1
+
+ print(f" Completed inference for {count} fields")
+
+ # Build output DataFrame
+ df = pd.DataFrame(results_list)
+ df.to_csv(OUTPUT_CSV, index=False)
+
+ print(f"\n[5/5] Exporting results...")
+ print(f"✓ Exported {len(df)} fields to {OUTPUT_CSV}")
+ print(f" Output location: {OUTPUT_CSV.resolve()}")
+
+ if len(df) > 0:
+ print(f"\nSample rows:")
+ print(df[['field', 'sub_field', 'imminent_prob', 'detected_prob', 'num_days', 'week', 'year']].head(15).to_string(index=False))
+
+ # Show alert summary
+ high_imminent = len(df[df['imminent_prob'] > 0.7])
+ high_detected = len(df[df['detected_prob'] > 0.6])
+ print(f"\n⚠ ALERTS:")
+ print(f" Fields with imminent_prob > 0.70: {high_imminent}")
+ print(f" Fields with detected_prob > 0.60: {high_detected}")
+ else:
+ print(f" WARNING: No results exported - check CI data availability")
+
+ print(f"\nStorage structure:")
+ print(f" Input CI: laravel_app/storage/app/{project_name}/Data/extracted_ci/ci_data_for_python/")
+ print(f" Input baseline: laravel_app/storage/app/{project_name}/Data/HarvestData/harvest_production_export.xlsx")
+ print(f" Output: laravel_app/storage/app/{project_name}/Data/HarvestData/")
+ print(f"\nReady to load into 09b field analysis report")
+
+
+if __name__ == "__main__":
+ main()
diff --git a/python_app/download_planet_missing_dates.py b/python_app/download_planet_missing_dates.py
index dc2f870..b6723b7 100644
--- a/python_app/download_planet_missing_dates.py
+++ b/python_app/download_planet_missing_dates.py
@@ -18,6 +18,7 @@ import sys
import json
import datetime
import argparse
+import subprocess
from pathlib import Path
from osgeo import gdal
import time
@@ -441,6 +442,7 @@ def get_evalscript():
def main():
print("="*80)
print("PLANET SATELLITE DATA DOWNLOADER - MISSING DATES ONLY")
+ print("Wrapper for 00_download_8band_pu_optimized.py")
print("="*80)
config_dict = get_config()
@@ -495,47 +497,45 @@ def main():
print(f" - {date}")
if config_dict['dry_run']:
- print("\n[DRY-RUN] Would download and merge above dates")
+ print("\n[DRY-RUN] Would download above dates using 00_download_8band_pu_optimized.py")
return 0
- # Setup BBox list
- print(f"\nLoading field geometries...")
- bbox_list = setup_bbox_list(paths['geojson'], resolution=config_dict['resolution'])
- if bbox_list is None:
- return 1
- print(f" Created {len(bbox_list)} BBox tiles")
-
- # Download and merge each missing date
- print(f"\nDownloading missing dates...")
+ # Download each missing date using the optimized downloader
+ print(f"\n{'='*80}")
+ print(f"Downloading missing dates using optimized script...")
print(f"{'='*80}")
success_count = 0
- for i, slot in enumerate(missing_dates, 1):
- print(f"\n[{i}/{len(missing_dates)}] Processing {slot}...")
+ for i, date_str in enumerate(missing_dates, 1):
+ print(f"\n[{i}/{len(missing_dates)}] Downloading {date_str}...")
- # Check availability
- if not is_image_available(slot, bbox_list, collection_id):
- print(f" Skipping {slot} - no imagery available")
- continue
+ # Call 00_download_8band_pu_optimized.py for this date
+ cmd = [
+ sys.executable,
+ "00_download_8band_pu_optimized.py",
+ config_dict['project'],
+ "--date", date_str,
+ "--resolution", str(config_dict['resolution']),
+ "--cleanup"
+ ]
- # Download for all bboxes
- print(f" Downloading {len(bbox_list)} tiles...")
- for bbox in bbox_list:
- size = bbox_to_dimensions(bbox, resolution=config_dict['resolution'])
- download_function(slot, bbox, size, paths['single_images'])
-
- # Merge
- print(f" Merging tiles...")
- if merge_files(slot, paths['single_images'], paths['merged_tifs'], paths['virtual_raster']):
+ try:
+ result = subprocess.run(cmd, check=True, capture_output=False)
success_count += 1
+ print(f" ✓ Successfully downloaded {date_str}")
+ except subprocess.CalledProcessError as e:
+ print(f" ✗ Failed to download {date_str}: {e}")
+ # Continue with next date instead of stopping
+ continue
# Summary
print(f"\n{'='*80}")
print(f"SUMMARY:")
print(f" Successfully processed: {success_count}/{len(missing_dates)} dates")
print(f" Output folder: {paths['merged_tifs']}")
+ print(f"{'='*80}")
- return 0
+ return 0 if success_count == len(missing_dates) else 1
if __name__ == "__main__":
sys.exit(main())
diff --git a/python_app/harvest_date_pred_utils.py b/python_app/harvest_date_pred_utils.py
new file mode 100644
index 0000000..7c9eb07
--- /dev/null
+++ b/python_app/harvest_date_pred_utils.py
@@ -0,0 +1,482 @@
+"""
+Self-contained utility module for two-step harvest date prediction and Excel export.
+Includes model architecture, feature engineering, and core prediction logic.
+"""
+
+import sys
+import pandas as pd
+import numpy as np
+import torch
+import torch.nn as nn
+import pickle
+import yaml
+from pathlib import Path
+from typing import Tuple, Dict, Any, List
+from sklearn.preprocessing import StandardScaler
+
+# ============================================================================
+# TORCH MODELS (from src/models.py, inlined for self-containment)
+# ============================================================================
+
+class HarvestDetectionLSTM(nn.Module):
+ """Unidirectional LSTM for harvest detection with dual outputs."""
+ def __init__(self, input_size: int, hidden_size: int = 128,
+ num_layers: int = 1, dropout: float = 0.5):
+ super(HarvestDetectionLSTM, self).__init__()
+ self.input_size = input_size
+ self.hidden_size = hidden_size
+ self.num_layers = num_layers
+
+ self.lstm = nn.LSTM(
+ input_size=input_size,
+ hidden_size=hidden_size,
+ num_layers=num_layers,
+ dropout=dropout if num_layers > 1 else 0,
+ bidirectional=False,
+ batch_first=True
+ )
+
+ self.imminent_head = nn.Sequential(
+ nn.Linear(hidden_size, 16),
+ nn.ReLU(),
+ nn.Dropout(dropout),
+ nn.Linear(16, 1),
+ nn.Sigmoid()
+ )
+
+ self.detected_head = nn.Sequential(
+ nn.Linear(hidden_size, 16),
+ nn.ReLU(),
+ nn.Dropout(dropout),
+ nn.Linear(16, 1),
+ nn.Sigmoid()
+ )
+
+ def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
+ lstm_out, _ = self.lstm(x)
+ batch_size, seq_len, hidden_size = lstm_out.shape
+ lstm_flat = lstm_out.reshape(-1, hidden_size)
+
+ imminent_flat = self.imminent_head(lstm_flat).reshape(batch_size, seq_len)
+ detected_flat = self.detected_head(lstm_flat).reshape(batch_size, seq_len)
+
+ return imminent_flat, detected_flat
+
+
+class HarvestDetectionGRU(nn.Module):
+ """Unidirectional GRU for harvest detection with dual outputs."""
+ def __init__(self, input_size: int, hidden_size: int = 128,
+ num_layers: int = 1, dropout: float = 0.5):
+ super(HarvestDetectionGRU, self).__init__()
+ self.input_size = input_size
+ self.hidden_size = hidden_size
+ self.num_layers = num_layers
+
+ self.gru = nn.GRU(
+ input_size=input_size,
+ hidden_size=hidden_size,
+ num_layers=num_layers,
+ dropout=dropout if num_layers > 1 else 0,
+ bidirectional=False,
+ batch_first=True
+ )
+
+ self.imminent_head = nn.Sequential(
+ nn.Linear(hidden_size, 16),
+ nn.ReLU(),
+ nn.Dropout(dropout),
+ nn.Linear(16, 1),
+ nn.Sigmoid()
+ )
+
+ self.detected_head = nn.Sequential(
+ nn.Linear(hidden_size, 16),
+ nn.ReLU(),
+ nn.Dropout(dropout),
+ nn.Linear(16, 1),
+ nn.Sigmoid()
+ )
+
+ def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
+ gru_out, _ = self.gru(x)
+ batch_size, seq_len, hidden_size = gru_out.shape
+ gru_flat = gru_out.reshape(-1, hidden_size)
+
+ imminent_flat = self.imminent_head(gru_flat).reshape(batch_size, seq_len)
+ detected_flat = self.detected_head(gru_flat).reshape(batch_size, seq_len)
+
+ return imminent_flat, detected_flat
+
+
+def create_model(model_type: str, input_size: int, hidden_size: int = 128,
+ num_layers: int = 1, dropout: float = 0.5, device = None) -> nn.Module:
+ """Create a model from registry."""
+ registry = {'LSTM': HarvestDetectionLSTM, 'GRU': HarvestDetectionGRU}
+ if model_type not in registry:
+ raise ValueError(f"Unknown model type: {model_type}")
+
+ model = registry[model_type](
+ input_size=input_size,
+ hidden_size=hidden_size,
+ num_layers=num_layers,
+ dropout=dropout
+ )
+
+ if device:
+ model = model.to(device)
+
+ # Print model info
+ total_params = sum(p.numel() for p in model.parameters())
+ trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
+ print(f"Model: {model_type}")
+ print(f" Input size: {input_size}")
+ print(f" Hidden size: {hidden_size}")
+ print(f" Num layers: {num_layers}")
+ print(f" Dropout: {dropout}")
+ print(f" Total parameters: {total_params:,}")
+ print(f" Trainable parameters: {trainable_params:,}")
+ print(f" Device: {device}")
+
+ return model
+
+
+# ============================================================================
+# FEATURE ENGINEERING (from src/feature_engineering.py, simplified for inline)
+# ============================================================================
+
+def compute_ci_features(ci_series: pd.Series, doy_series: pd.Series = None) -> pd.DataFrame:
+ """Compute all CI-based features (state, velocity, acceleration, min/max/range/std/CV)."""
+ features = pd.DataFrame(index=ci_series.index)
+
+ # State (moving averages)
+ features['CI_raw'] = ci_series
+ features['7d_MA'] = ci_series.rolling(window=7, min_periods=1).mean()
+ features['14d_MA'] = ci_series.rolling(window=14, min_periods=1).mean()
+ features['21d_MA'] = ci_series.rolling(window=21, min_periods=1).mean()
+
+ # Velocity (gradient of MA)
+ for window in [7, 14, 21]:
+ ma = ci_series.rolling(window=window, min_periods=1).mean()
+ features[f'{window}d_velocity'] = ma.diff() / 1.0 # Simplified gradient
+
+ # Acceleration (gradient of velocity)
+ for window in [7, 14, 21]:
+ ma = ci_series.rolling(window=window, min_periods=1).mean()
+ vel = ma.diff()
+ features[f'{window}d_acceleration'] = vel.diff()
+
+ # Min, Max, Range
+ for window in [7, 14, 21]:
+ features[f'{window}d_min'] = ci_series.rolling(window=window, min_periods=1).min()
+ features[f'{window}d_max'] = ci_series.rolling(window=window, min_periods=1).max()
+ features[f'{window}d_range'] = features[f'{window}d_max'] - features[f'{window}d_min']
+
+ # Std and CV
+ for window in [7, 14, 21]:
+ features[f'{window}d_std'] = ci_series.rolling(window=window, min_periods=1).std()
+ ma = ci_series.rolling(window=window, min_periods=1).mean()
+ features[f'{window}d_CV'] = features[f'{window}d_std'] / (ma + 1e-6)
+
+ # DOY normalized
+ if doy_series is not None:
+ features['DOY_normalized'] = doy_series / 450.0
+
+ return features.fillna(0)
+
+
+def extract_features(data_df: pd.DataFrame, feature_names: List[str], ci_column: str = 'FitData') -> np.ndarray:
+ """Extract and return specified features as numpy array."""
+ # Compute all CI features
+ ci_series = data_df[ci_column].astype(float)
+ doy_series = pd.Series(range(len(data_df)), index=data_df.index) % 365 if 'DOY_normalized' in feature_names else None
+
+ all_features = compute_ci_features(ci_series, doy_series)
+
+ # Select requested features
+ requested = [f for f in feature_names if f in all_features.columns]
+ if not requested:
+ raise ValueError(f"No valid features found. Requested: {feature_names}")
+
+ return all_features[requested].values
+
+
+# ============================================================================
+# MAIN UTILITY FUNCTIONS
+# ============================================================================
+
+def load_model_and_config(model_dir: Path):
+ """Load model, config, and scalers from a given directory."""
+ cwd = Path.cwd()
+
+ # Try different naming conventions
+ candidates = [
+ # Standard names
+ (model_dir / "config.json", model_dir / "model.pt", model_dir / "scalers.pkl"),
+ # Model 307 specific names
+ (model_dir / "model_config.json", model_dir / "model_307.pt", model_dir / "model_scalers.pkl"),
+ # CWD standard names
+ (cwd / "config.json", cwd / "model.pt", cwd / "scalers.pkl"),
+ # CWD Model 307 names
+ (cwd / "model_config.json", cwd / "model_307.pt", cwd / "model_scalers.pkl"),
+ ]
+
+ config_file = model_file = scalers_file = None
+ for cfg, mdl, scl in candidates:
+ if cfg.exists() and mdl.exists() and scl.exists():
+ config_file, model_file, scalers_file = cfg, mdl, scl
+ print(f"Found model files in: {cfg.parent}")
+ break
+
+ if not (config_file and model_file and scalers_file):
+ missing = []
+ for cfg, mdl, scl in candidates:
+ if not cfg.exists():
+ missing.append(str(cfg))
+ if not mdl.exists():
+ missing.append(str(mdl))
+ if not scl.exists():
+ missing.append(str(scl))
+ raise FileNotFoundError(
+ f"Missing model files. Checked multiple locations. Missing: {missing}"
+ )
+
+ with open(config_file) as f:
+ config = yaml.safe_load(f)
+
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+ model = create_model(
+ model_type=config['model']['type'],
+ input_size=len(config['features']),
+ hidden_size=config['model']['hidden_size'],
+ num_layers=config['model']['num_layers'],
+ dropout=config['model']['dropout'],
+ device=device
+ )
+
+ print(f"Loading weights from: {model_file}")
+ model.load_state_dict(torch.load(model_file, map_location=device, weights_only=False))
+ model.eval()
+
+ with open(scalers_file, 'rb') as f:
+ scalers = pickle.load(f)
+
+ return model, config, scalers
+
+
+def load_harvest_data(data_file: Path) -> pd.DataFrame:
+ """Load harvest data CSV."""
+ print(f"Loading data from: {data_file}")
+ df = pd.read_csv(data_file)
+ print(f"Loaded {len(df)} rows")
+ return df
+
+
+def run_phase1_growing_window(field_data, model, config, scalers, ci_column, device):
+ """
+ Phase 1: Growing window detection with threshold crossing.
+ Expand window day-by-day, check last timestep's detected_prob.
+ When 3 consecutive days have prob > 0.5, harvest detected.
+ Returns list of (harvest_date, harvest_idx) tuples.
+ """
+ harvest_dates = []
+ current_pos = 0
+
+ while current_pos < len(field_data):
+ consecutive_above_threshold = 0
+
+ for window_end in range(current_pos + 1, len(field_data) + 1):
+ window_data = field_data.iloc[current_pos:window_end].copy().reset_index(drop=True)
+
+ try:
+ features = extract_features(window_data, config['features'], ci_column=ci_column)
+
+ # Apply scalers
+ for fi, scaler in enumerate(scalers):
+ try:
+ features[:, fi] = scaler.transform(features[:, fi].reshape(-1, 1)).flatten()
+ except Exception:
+ pass
+
+ # Run model
+ with torch.no_grad():
+ x_tensor = torch.tensor(features, dtype=torch.float32).unsqueeze(0).to(device)
+ imminent_probs, detected_probs = model(x_tensor)
+ detected_probs = detected_probs.squeeze(0).cpu().numpy()
+
+ # Check LAST timestep
+ last_prob = detected_probs[-1]
+
+ if last_prob > 0.5:
+ consecutive_above_threshold += 1
+ else:
+ consecutive_above_threshold = 0
+
+ # Harvest detected: 3 consecutive days above threshold
+ if consecutive_above_threshold >= 3:
+ harvest_date = field_data.iloc[current_pos + window_end - 3]['Date']
+ harvest_dates.append((harvest_date, current_pos + window_end - 3))
+
+ # Reset to next day after harvest
+ current_pos = current_pos + window_end - 2
+ break
+
+ except Exception:
+ continue
+ else:
+ break
+
+ return harvest_dates
+
+
def run_phase2_refinement(field_data, phase1_harvests, model, config, scalers, ci_column, device):
    """
    Phase 2: refine each Phase 1 harvest with a widened window and argmax.

    For every coarse harvest a window spanning
    [season_start - 40 days, phase1_harvest + 40 days] is extracted, the model
    is run once over the whole window, and the timestep with the highest
    detected-probability becomes the refined harvest.  If refinement fails for
    any reason, the Phase 1 result is kept unchanged.

    Parameters
    ----------
    field_data : pd.DataFrame
        Daily CI rows for one field (sorted by 'Date' internally).
    phase1_harvests : list of (harvest_date, harvest_idx)
        Coarse detections from run_phase1_growing_window().

    Returns
    -------
    list of (harvest_date, harvest_idx)
    """
    refined_harvests = []
    field_data = field_data.sort_values('Date').reset_index(drop=True)

    for i, (phase1_harvest_date, phase1_idx) in enumerate(phase1_harvests):
        try:
            # Season starts at the beginning of the series, or on the day
            # after the previous harvest.
            if i == 0:
                season_start_date = field_data.iloc[0]['Date']
            else:
                season_start_idx = phase1_harvests[i - 1][1] + 1
                if season_start_idx >= len(field_data):
                    break
                season_start_date = field_data.iloc[season_start_idx]['Date']

            # Extract the widened refinement window.
            window_start_date = season_start_date - pd.Timedelta(days=40)
            window_end_date = phase1_harvest_date + pd.Timedelta(days=40)

            # BUGFIX: the previous code used (Date <= window_end_date).idxmax(),
            # which returns the FIRST True position (almost always row 0) and
            # so collapsed the window to a single row.  Dates are sorted, so
            # searchsorted yields the correct [start, end) slice directly.
            dates = field_data['Date']
            window_start_idx = int(dates.searchsorted(window_start_date, side='left'))
            window_end_idx = int(dates.searchsorted(window_end_date, side='right'))

            if window_end_idx <= window_start_idx:
                refined_harvests.append((phase1_harvest_date, phase1_idx))
                continue

            window_data = field_data.iloc[window_start_idx:window_end_idx].copy().reset_index(drop=True)

            # Extract features for the full window
            features = extract_features(window_data, config['features'], ci_column=ci_column)

            # Per-feature scaling; failures fall back to the raw feature.
            for fi, scaler in enumerate(scalers):
                try:
                    features[:, fi] = scaler.transform(features[:, fi].reshape(-1, 1)).flatten()
                except Exception:
                    pass

            # Run model once on full window
            with torch.no_grad():
                x_tensor = torch.tensor(features, dtype=torch.float32).unsqueeze(0).to(device)
                imminent_probs, detected_probs = model(x_tensor)
                detected_probs = detected_probs.squeeze(0).cpu().numpy()

            # Refined harvest = timestep with the highest detected probability.
            refined_idx_in_window = int(np.argmax(detected_probs))
            refined_idx_global = window_start_idx + refined_idx_in_window
            refined_harvest_date = field_data.iloc[refined_idx_global]['Date']

            refined_harvests.append((refined_harvest_date, refined_idx_global))

        except Exception:
            # Keep the Phase 1 result when refinement fails.
            refined_harvests.append((phase1_harvest_date, phase1_idx))

    return refined_harvests
+
+
def run_two_step_refinement(df: pd.DataFrame, model, config, scalers, device=None):
    """
    Two-step harvest detection, run per field.

    Step 1 (Phase 1): growing window with a 3-day threshold confirmation.
    Step 2 (Phase 2): ±40 day refinement with argmax over detected probs.

    Returns
    -------
    list of dict
        One entry per field/season with field, season number,
        season_start_date, season_end_date and phase2_harvest_date.
    """
    if device is None:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    ci_column = config['data']['ci_column']

    # Materialise the groups so the total is known up front for progress output.
    grouped = list(df.groupby('field'))
    total_fields = len(grouped)
    results = []
    harvests_found = 0

    print(f" Processing {total_fields} fields...")

    for field_no, (field, field_df) in enumerate(grouped, start=1):
        # Text progress bar, redrawn in place via carriage return.
        frac = field_no / total_fields
        bar_length = 40
        filled = int(frac * bar_length)
        bar = "█" * filled + "░" * (bar_length - filled)
        print(f" [{bar}] {int(frac * 100):3d}% ({field_no}/{total_fields} fields)", end='\r')

        field_df = field_df.sort_values('Date').reset_index(drop=True)

        # Phase 1: coarse detection over growing windows.
        coarse = run_phase1_growing_window(field_df, model, config, scalers, ci_column, device)
        if not coarse:
            continue

        # Phase 2: refine each coarse detection.
        refined = run_phase2_refinement(field_df, coarse, model, config, scalers, ci_column, device)

        # Assemble one result row per detected season.
        for season_no, (harvest_date, harvest_idx) in enumerate(refined, start=1):
            if season_no == 1:
                season_start_date = field_df.iloc[0]['Date']
            else:
                start_idx = refined[season_no - 2][1] + 1
                if start_idx >= len(field_df):
                    break
                season_start_date = field_df.iloc[start_idx]['Date']

            results.append({
                'field': field,
                'season': season_no,
                'season_start_date': season_start_date,
                'season_end_date': harvest_date,
                'phase2_harvest_date': harvest_date,
            })
            harvests_found += 1

    print()  # Terminate the progress-bar line.
    print(f" ✓ Complete: Found {harvests_found} harvest events across {total_fields} fields")

    return results
+
+
def build_production_harvest_table(refined_results: List[Dict]) -> pd.DataFrame:
    """
    Build a DataFrame from refined results for the production pipeline.

    One row per field/season; every date column present is normalised to a
    'YYYY-MM-DD' string.

    Parameters
    ----------
    refined_results : list of dict
        Output of run_two_step_refinement(); each dict holds 'field',
        'season', 'season_start_date', 'season_end_date' and a harvest-date
        key ('phase2_harvest_date' and/or 'phase1_harvest_date').

    Returns
    -------
    pd.DataFrame
        Empty (with the canonical columns) when no results were supplied.
    """
    if not refined_results:
        print("WARNING: No refined results to build table")
        return pd.DataFrame(columns=['field', 'season', 'season_start_date', 'season_end_date'])

    # Build DataFrame
    df = pd.DataFrame(refined_results)

    # BUGFIX: the pipeline emits 'phase2_harvest_date' (not 'phase1_...'),
    # so unconditionally formatting 'phase1_harvest_date' raised a KeyError.
    # Format only the date columns that actually exist.
    date_columns = ['season_start_date', 'season_end_date',
                    'phase1_harvest_date', 'phase2_harvest_date']
    for col in date_columns:
        if col in df.columns:
            df[col] = pd.to_datetime(df[col]).dt.strftime('%Y-%m-%d')

    print(f"Built production table with {len(df)} field/season combinations")

    return df
diff --git a/python_app/model_307.pt b/python_app/model_307.pt
new file mode 100644
index 0000000..0a30c3d
Binary files /dev/null and b/python_app/model_307.pt differ
diff --git a/python_app/model_config.json b/python_app/model_config.json
new file mode 100644
index 0000000..3be1268
--- /dev/null
+++ b/python_app/model_config.json
@@ -0,0 +1,144 @@
+{
+ "name": "307_dropout02_with_doy",
+ "description": "Production Model 307: LSTM-based harvest detection (Phase 3, minimal regularization)",
+ "model_info": {
+ "type": "LSTM",
+ "architecture": "Unidirectional LSTM with dual output heads (imminent + detected)",
+ "total_parameters": 105120,
+ "input_features": 14,
+ "hidden_units": 256,
+ "output_heads": 2,
+ "training_data": "Historical multi-season CI data from multiple estates",
+ "validation_method": "5-fold cross-validation",
+ "device": "GPU (CUDA) or CPU fallback"
+ },
+ "production_scripts": {
+ "baseline": {
+ "script": "01_harvest_baseline_prediction.py",
+ "frequency": "Run ONCE during setup",
+ "purpose": "Predict all harvest dates (ground truth baseline)",
+ "input": "ci_data_for_python.csv (complete historical data)",
+ "output": "harvest_production_export.xlsx",
+ "time_estimate": "5-30 minutes depending on data volume"
+ },
+ "monitoring": {
+ "script": "02_harvest_imminent_weekly.py",
+ "frequency": "Run WEEKLY (or daily if required)",
+ "purpose": "Real-time harvest status and imminent alerts",
+ "input": "ci_data_for_python.csv (recent data)",
+ "output": "harvest_imminent_weekly.csv",
+ "time_estimate": "1-5 minutes"
+ }
+ },
+ "features": [
+ "CI_raw",
+ "7d_MA",
+ "14d_MA",
+ "21d_MA",
+ "7d_velocity",
+ "14d_velocity",
+ "21d_velocity",
+ "7d_min",
+ "14d_min",
+ "21d_min",
+ "7d_std",
+ "14d_std",
+ "21d_std",
+ "DOY_normalized"
+ ],
+ "model": {
+ "type": "LSTM",
+ "hidden_size": 256,
+ "num_layers": 1,
+ "dropout": 0.2
+ },
+ "training": {
+ "imminent_days_before": 28,
+ "imminent_days_before_end": 1,
+ "detected_days_after_start": 1,
+ "detected_days_after_end": 21,
+ "k_folds": 5,
+ "num_epochs": 150,
+ "patience": 20,
+ "learning_rate": 0.001,
+ "batch_size": 4
+ },
+ "data": {
+ "csv_path": "../lstm_complete_data.csv",
+ "ci_column": "FitData",
+ "test_fraction": 0.15,
+ "seed": 42
+ },
+ "workflow_instructions": {
+ "overview": "Model 307 uses a two-script approach: baseline setup + weekly monitoring",
+ "step_1_baseline": {
+ "description": "Establish historical harvest date reference for all fields",
+ "script": "01_harvest_baseline_prediction.py",
+ "when": "Run once after setting up CI extraction pipeline",
+ "command": "conda activate python_gpu && python 01_harvest_baseline_prediction.py",
+ "input_data": "ci_data_for_python.csv (all available historical CI data)",
+ "output_file": "harvest_production_export.xlsx (ground truth baseline)",
+ "columns": [
+ "field - Field ID",
+ "sub_field - Sub-field designation",
+ "season - Season number (1, 2, 3...)",
+ "year - Year of harvest",
+ "season_start_date - Start of growing season",
+ "season_end_date - End of season (harvest date)",
        "phase2_harvest_date - Refined harvest prediction"
+ ],
+ "notes": "This becomes your reference - compare all weekly monitoring against this"
+ },
+ "step_2_monitoring": {
+ "description": "Weekly real-time harvest status and imminent alerts",
+ "script": "02_harvest_imminent_weekly.py",
+ "when": "Run every week (e.g., Mondays) or daily if near harvest",
+ "command": "conda activate python_gpu && python 02_harvest_imminent_weekly.py",
+ "input_data": "ci_data_for_python.csv (latest CI data from 02b conversion)",
+ "output_file": "harvest_imminent_weekly.csv",
+ "columns": [
+ "field - Field ID",
+ "sub_field - Sub-field designation",
+ "imminent_prob - Likelihood of harvest readiness in next 28 days (0.0-1.0)",
+ "detected_prob - Current harvest probability (0.0-1.0)",
+ "week - ISO week number",
+ "year - Year",
+ "as_of_date - Latest date in dataset",
+ "num_days - Days of history used"
+ ],
+ "alert_thresholds": {
+ "imminent_high": "imminent_prob > 0.7 (prepare harvest)",
+ "imminent_medium": "imminent_prob 0.5-0.7 (monitor closely)",
+ "detected_high": "detected_prob > 0.6 (active harvesting)"
+ }
+ },
+ "integration_with_r_pipeline": {
+ "before_model_307": [
+ "Planet 8-band download: download_8band_pu_optimized.ipynb",
+ "CI extraction: 02_ci_extraction.R",
+ "Convert to CSV: 02b_convert_rds_to_csv.R (outputs ci_data_for_python.csv)"
+ ],
+ "model_307_here": [
+ "BASELINE: 01_harvest_baseline_prediction.py (run once)",
+ "MONITORING: 02_harvest_imminent_weekly.py (run weekly)"
+ ],
+ "after_model_307": [
+ "Field analysis: 09b_field_analysis_weekly.R (reads harvest predictions)",
+ "Reports: 10_CI_report_with_kpis.Rmd (includes harvest status)"
+ ]
+ },
+ "environment_requirements": {
+ "python_env": "python_gpu",
+ "activation": "conda activate python_gpu",
+ "required_packages": [
+ "torch (GPU-enabled)",
+ "pandas",
+ "numpy",
+ "scikit-learn",
+ "pyyaml",
+ "openpyxl"
+ ],
+ "gpu": "NVIDIA GPU with CUDA (optional - falls back to CPU if unavailable)"
+ }
+ }
+}
\ No newline at end of file
diff --git a/python_app/model_scalers.pkl b/python_app/model_scalers.pkl
new file mode 100644
index 0000000..4c1ead5
Binary files /dev/null and b/python_app/model_scalers.pkl differ
diff --git a/r_app/02b_convert_ci_rds_to_csv.R b/r_app/02b_convert_ci_rds_to_csv.R
index bdf57c6..f75f6af 100644
--- a/r_app/02b_convert_ci_rds_to_csv.R
+++ b/r_app/02b_convert_ci_rds_to_csv.R
@@ -15,9 +15,113 @@
suppressPackageStartupMessages({
library(tidyverse)
library(lubridate)
+ library(zoo)
library(here)
})
+# ============================================================================
+# HELPER FUNCTIONS
+# ============================================================================
+
#' Reshape a wide CI matrix (one column per date) into long format
#'
#' @param ci_data_wide Tibble with columns: field, sub_field, and dates as columns
#' @return Long format tibble: field, sub_field, Date, FitData
wide_to_long_ci_data <- function(ci_data_wide) {
  long_data <- pivot_longer(
    ci_data_wide,
    cols = -c(field, sub_field),
    names_to = "Date",
    values_to = "FitData",
    values_drop_na = TRUE
  )

  long_data %>%
    mutate(
      Date = as.Date(Date),
      FitData = as.numeric(FitData)
    ) %>%
    # as.numeric() may coerce non-numeric cells to NA; drop those too.
    filter(!is.na(FitData))
}
+
#' Create daily interpolated sequences with DOY for each field
#'
#' For each field/sub_field combination, creates a complete daily sequence
#' from first to last measurement date, fills in the actual measurements,
#' and linearly interpolates the missing days.
#'
#' @param ci_data_long Long format tibble: field, sub_field, Date, FitData
#' @return Tibble with: field, sub_field, Date, FitData, DOY, value
create_interpolated_daily_sequences <- function(ci_data_long) {
  ci_data_long %>%
    group_by(field, sub_field) %>%
    nest() %>%
    mutate(
      data = map(data, function(df) {
        # Sort measurements by date
        df <- df %>% arrange(Date)

        # Create complete daily sequence from first to last date
        date_seq <- seq(min(df$Date), max(df$Date), by = "day")

        # Build daily dataframe (field/sub_field stay in outer df, not here)
        daily_df <- tibble(
          Date = date_seq,
          value = NA_real_,
          FitData = NA_real_,
          DOY = seq_along(date_seq) # Continuous day counter: 1, 2, 3, ...
        )

        # Fill in actual measurement values.  match() replaces the previous
        # per-row which() loop: O(n + m) instead of O(n * m), with identical
        # last-write-wins behaviour for any duplicated dates.
        hit <- match(df$Date, daily_df$Date)
        ok <- !is.na(hit)
        daily_df$value[hit[ok]] <- df$FitData[ok]

        # Interpolate missing dates linearly
        daily_df$FitData <- zoo::na.approx(daily_df$value, na.rm = FALSE)

        daily_df
      })
    ) %>%
    unnest(data) %>%
    select(field, sub_field, Date, FitData, DOY, value) %>%
    arrange(field, Date)
}
+
#' Validate conversion output
#'
#' Prints summary statistics of the converted CI data: field count, row
#' count, date/value ranges, and how many rows are raw vs. interpolated.
#'
#' @param ci_data_python Tibble with converted CI data
#' @return Invisibly returns the tibble (for piping)
validate_conversion_output <- function(ci_data_python) {
  n_fields <- n_distinct(ci_data_python$field)
  n_rows <- nrow(ci_data_python)
  date_min <- min(ci_data_python$Date, na.rm = TRUE)
  date_max <- max(ci_data_python$Date, na.rm = TRUE)
  fit_min <- min(ci_data_python$FitData, na.rm = TRUE)
  fit_max <- max(ci_data_python$FitData, na.rm = TRUE)
  n_raw <- sum(!is.na(ci_data_python$value))
  n_interp <- sum(is.na(ci_data_python$value) & !is.na(ci_data_python$FitData))

  cat(sprintf("\nValidation:\n"))
  cat(sprintf(" Unique fields: %d\n", n_fields))
  cat(sprintf(" Total daily rows: %d\n", n_rows))
  cat(sprintf(" Date range: %s to %s\n", date_min, date_max))
  cat(sprintf(" FitData range: %.2f to %.2f\n", fit_min, fit_max))
  cat(sprintf(" Raw measurements: %d\n", n_raw))
  cat(sprintf(" Interpolated values: %d\n", n_interp))

  invisible(ci_data_python)
}
+
#' Print next steps message
#'
#' Emits the post-conversion checklist for the Python harvest-detection
#' stage.  Output is identical to writing each line with a separate cat().
print_next_steps <- function() {
  steps <- c(
    "\nNext steps for Python harvest detection:\n",
    " 1. Read this CSV file in Python\n",
    " 2. Group by field to identify seasons\n",
    " 3. Run LSTM model to detect harvest dates\n",
    " 4. Save predicted harvest dates to Excel\n",
    " 5. Use output in script 03 for interpolation\n"
  )
  cat(steps, sep = "")
}
+
+# ============================================================================
+# MAIN FUNCTION
+# ============================================================================
+
main <- function() {
# Process command line arguments
args <- commandArgs(trailingOnly = TRUE)
@@ -28,7 +132,7 @@ main <- function() {
} else if (exists("project_dir", envir = .GlobalEnv)) {
project_dir <- get("project_dir", envir = .GlobalEnv)
} else {
- project_dir <- "esa"
+ project_dir <- "angata"
}
# Make available globally
@@ -49,9 +153,17 @@ main <- function() {
})
# Define paths
- ci_data_dir <- here::here("laravel_app", "storage", "app", project_dir, "Data", "extracted_ci", "cumulative_vals")
- input_file <- file.path(ci_data_dir, "combined_CI_data.rds")
- output_file <- file.path(ci_data_dir, "ci_data_for_python.csv")
+ ci_data_source_dir <- here::here("laravel_app", "storage", "app", project_dir, "Data", "extracted_ci", "cumulative_vals")
+ ci_data_output_dir <- here::here("laravel_app", "storage", "app", project_dir, "Data", "extracted_ci", "ci_data_for_python")
+
+ # Create output directory if it doesn't exist (for new projects)
+ if (!dir.exists(ci_data_output_dir)) {
+ dir.create(ci_data_output_dir, recursive = TRUE, showWarnings = FALSE)
+ cat(sprintf("✓ Created output directory: %s\n", ci_data_output_dir))
+ }
+
+ input_file <- file.path(ci_data_source_dir, "combined_CI_data.rds")
+ output_file <- file.path(ci_data_output_dir, "ci_data_for_python.csv")
# Check if input file exists
if (!file.exists(input_file)) {
@@ -61,52 +173,32 @@ main <- function() {
cat(sprintf("Loading: %s\n", input_file))
# Load RDS file
- ci_data <- readRDS(input_file) %>%
+ ci_data_wide <- readRDS(input_file) %>%
as_tibble()
- cat(sprintf(" Loaded %d rows\n", nrow(ci_data)))
- cat(sprintf(" Columns: %s\n", paste(names(ci_data), collapse = ", ")))
+ cat(sprintf(" Loaded %d rows\n", nrow(ci_data_wide)))
+ cat(sprintf(" Format: WIDE (field, sub_field, then dates as columns)\n"))
+ cat(sprintf(" Sample columns: %s\n", paste(names(ci_data_wide)[1:6], collapse = ", ")))
- # Prepare data for Python
- ci_data_python <- ci_data %>%
- # Ensure standard column names
- rename(
- field = field,
- sub_field = sub_field,
- Date = Date,
- FitData = FitData,
- DOY = DOY
- ) %>%
- # Add 'value' as an alias for FitData (sometimes needed)
- mutate(value = FitData) %>%
- # Keep only necessary columns
- select(field, sub_field, Date, FitData, DOY, value) %>%
- # Sort by field and date
- arrange(field, Date)
+ # Step 1: Convert from WIDE to LONG format
+ cat("\nStep 1: Converting from wide to long format...\n")
+ ci_data_long <- wide_to_long_ci_data(ci_data_wide)
- # Validate data
- cat(sprintf("\nValidation:\n"))
- cat(sprintf(" Unique fields: %d\n", n_distinct(ci_data_python$field)))
- cat(sprintf(" Date range: %s to %s\n",
- min(ci_data_python$Date, na.rm = TRUE),
- max(ci_data_python$Date, na.rm = TRUE)))
- cat(sprintf(" FitData range: %.2f to %.2f\n",
- min(ci_data_python$FitData, na.rm = TRUE),
- max(ci_data_python$FitData, na.rm = TRUE)))
- cat(sprintf(" Missing FitData: %d rows\n", sum(is.na(ci_data_python$FitData))))
+ # Step 2: Create complete daily sequences with interpolation
+ cat("\nStep 2: Creating complete daily sequences with interpolation...\n")
+ ci_data_python <- create_interpolated_daily_sequences(ci_data_long)
- # Save to CSV
- cat(sprintf("\nSaving to: %s\n", output_file))
+ # Step 3: Validate output
+ cat("\nStep 3: Validating output...")
+ validate_conversion_output(ci_data_python)
+ # Step 4: Save to CSV
+ cat(sprintf("\nStep 4: Saving to CSV...\n"))
+ cat(sprintf(" Output: %s\n", output_file))
write_csv(ci_data_python, output_file)
- cat(sprintf("✓ Successfully created CSV with %d rows\n", nrow(ci_data_python)))
- cat("\nNext steps for Python harvest detection:\n")
- cat(" 1. Read this CSV file in Python\n")
- cat(" 2. Group by field to identify seasons\n")
- cat(" 3. Run LSTM model to detect harvest dates\n")
- cat(" 4. Save predicted harvest dates to Excel\n")
- cat(" 5. Use output in script 03 for interpolation\n")
+ cat(sprintf("\n✓ Successfully created CSV with %d rows\n", nrow(ci_data_python)))
+ print_next_steps()
}
if (sys.nframe() == 0) {
diff --git a/webapps/geojson_viewer.html b/webapps/geojson_viewer.html
new file mode 100644
index 0000000..36511a1
--- /dev/null
+++ b/webapps/geojson_viewer.html
@@ -0,0 +1,1328 @@
+
+
+
+
+
+ GeoJSON Viewer
+
+
+
+
+
+
+
+
+
+
+
diff --git a/webapps/index.html b/webapps/index.html
index deef357..918f1e3 100644
--- a/webapps/index.html
+++ b/webapps/index.html
@@ -212,6 +212,22 @@
Open Tool
+
+
+
+
📍
+
+
GeoJSON Viewer
+
Upload and visualize GeoJSON files on an interactive map with feature properties.
+
+ - Upload GeoJSON files
+ - Interactive map view
+ - View feature properties
+ - Download exports
+
+
Open Viewer
+
+