diff --git a/python_app/download_8band_pu_optimized.py b/python_app/00_download_8band_pu_optimized.py similarity index 97% rename from python_app/download_8band_pu_optimized.py rename to python_app/00_download_8band_pu_optimized.py index 4fb5c4f..9d2841d 100644 --- a/python_app/download_8band_pu_optimized.py +++ b/python_app/00_download_8band_pu_optimized.py @@ -31,7 +31,7 @@ Examples: python download_8band_pu_optimized.py chemba # Uses today's date python download_8band_pu_optimized.py xinavane --clear-singles --cleanup python download_8band_pu_optimized.py angata --clear-all --resolution 5 - + Cost Model: - 4-band uint16 with cloud masking: ~50% lower cost than 9-band FLOAT32 - Reduced bbox sizes: ~10-20% lower cost due to smaller average tile size @@ -39,6 +39,18 @@ Cost Model: - Requests: Slightly higher (~50-60 tiles) but within 700k budget Expected result: ~75% PU savings with dynamic geometry-fitted grid + +Example running it in powershell: + $startDate = [DateTime]::ParseExact("2025-11-01", "yyyy-MM-dd", $null) + $endDate = [DateTime]::ParseExact("2025-12-24", "yyyy-MM-dd", $null) + + $current = $startDate + while ($current -le $endDate) { + $dateStr = $current.ToString("yyyy-MM-dd") + Write-Host "Downloading $dateStr..." + python download_8band_pu_optimized.py angata --date $dateStr + $current = $current.AddDays(1) + } """ import os diff --git a/python_app/01_harvest_baseline_prediction.py b/python_app/01_harvest_baseline_prediction.py new file mode 100644 index 0000000..7461964 --- /dev/null +++ b/python_app/01_harvest_baseline_prediction.py @@ -0,0 +1,111 @@ +""" +Script: 01_harvest_baseline_prediction.py +Purpose: BASELINE PREDICTION - Run ONCE to establish harvest date baseline for all fields and seasons + +This script processes COMPLETE historical CI data (all available dates) and uses Model 307 +to predict ALL harvest dates across the entire dataset. This becomes your reference baseline +for monitoring and comparisons going forward. 
+ +RUN FREQUENCY: Once during initial setup +INPUT: ci_data_for_python.csv (complete historical CI data from 02b_convert_rds_to_csv.R) + Location: laravel_app/storage/app/{project}/Data/extracted_ci/ci_data_for_python/ci_data_for_python.csv +OUTPUT: harvest_production_export.xlsx (baseline harvest predictions for all fields/seasons) + +Workflow: +1. Load ci_data_for_python.csv (daily interpolated, all historical dates) +2. Group data by field and season (Model 307 detects season boundaries internally) +3. Run two-step harvest detection (Phase 1: fast detection, Phase 2: ±40 day refinement) +4. Export harvest_production_export.xlsx with columns: + - field, sub_field, season, year, season_start_date, season_end_date, phase1_harvest_date + +Two-Step Detection Algorithm: + Phase 1 (Growing Window): Expands daily, checks when detected_prob > 0.5 for 3 consecutive days + Phase 2 (Refinement): Extracts ±40 day window, finds peak harvest signal with argmax + +This is your GROUND TRUTH - compare all future predictions against this baseline. 
+ +Usage: + python 01_harvest_baseline_prediction.py [project_name] + + Examples: + python 01_harvest_baseline_prediction.py angata + python 01_harvest_baseline_prediction.py esa + python 01_harvest_baseline_prediction.py chemba + + If no project specified, defaults to 'angata' +""" + +import pandas as pd +import numpy as np +import torch +import sys +from pathlib import Path +from harvest_date_pred_utils import ( + load_model_and_config, + extract_features, + run_two_step_refinement, + build_production_harvest_table +) + + +def main(): + # Get project name from command line or use default + project_name = sys.argv[1] if len(sys.argv) > 1 else "angata" + + # Construct paths + base_storage = Path("../laravel_app/storage/app") / project_name / "Data" + ci_data_dir = base_storage / "extracted_ci" / "ci_data_for_python" + CI_DATA_FILE = ci_data_dir / "ci_data_for_python.csv" + harvest_data_dir = base_storage / "HarvestData" + harvest_data_dir.mkdir(parents=True, exist_ok=True) # Create if doesn't exist + OUTPUT_XLSX = harvest_data_dir / "harvest_production_export.xlsx" + MODEL_DIR = Path(".") # Model files in python_app/ + + # Check if input exists + if not CI_DATA_FILE.exists(): + print(f"ERROR: {CI_DATA_FILE} not found") + print(f" Expected at: {CI_DATA_FILE.resolve()}") + print(f"\n Run 02b_convert_rds_to_csv.R first to generate this file:") + print(f" Rscript r_app/02b_convert_ci_rds_to_csv.R {project_name}") + return + + print("="*80) + print(f"HARVEST DATE PREDICTION - LSTM MODEL 307 ({project_name})") + print("="*80) + + # [1/4] Load model + print("\n[1/4] Loading Model 307...") + model, config, scalers = load_model_and_config(MODEL_DIR) + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + print(f" Device: {device}") + + # [2/4] Load and prepare CI data + print("\n[2/4] Loading CI data...") + print(f" From: {CI_DATA_FILE}") + ci_data = pd.read_csv(CI_DATA_FILE) + ci_data['Date'] = pd.to_datetime(ci_data['Date']) + print(f" Loaded 
{len(ci_data)} daily rows across {ci_data['field'].nunique()} fields") + print(f" Date range: {ci_data['Date'].min().date()} to {ci_data['Date'].max().date()}") + + # [3/4] Run model predictions with two-step detection + print("\n[3/4] Running two-step harvest detection...") + refined_results = run_two_step_refinement(ci_data, model, config, scalers, device=device) + + # Build and export + print("\nBuilding production harvest table...") + prod_table = build_production_harvest_table(refined_results) + + prod_table.to_excel(OUTPUT_XLSX, index=False) + print(f"\n✓ Exported {len(prod_table)} predictions to {OUTPUT_XLSX}") + print(f"\nOutput location: {OUTPUT_XLSX.resolve()}") + print(f"\nStorage structure:") + print(f" Input: laravel_app/storage/app/{project_name}/Data/extracted_ci/ci_data_for_python/") + print(f" Output: laravel_app/storage/app/{project_name}/Data/HarvestData/") + print(f"\nColumn structure:") + print(f" field, sub_field, season, year, season_start_date, season_end_date, phase1_harvest_date") + print(f"\nNext steps:") + print(f" 1. Review baseline predictions in harvest_production_export.xlsx") + print(f" 2. Run weekly monitoring: python 02_harvest_imminent_weekly.py {project_name}") + +if __name__ == "__main__": + main() diff --git a/python_app/02_harvest_imminent_weekly.py b/python_app/02_harvest_imminent_weekly.py new file mode 100644 index 0000000..09cf3de --- /dev/null +++ b/python_app/02_harvest_imminent_weekly.py @@ -0,0 +1,348 @@ +""" +Script: 02_harvest_imminent_weekly.py +Purpose: WEEKLY MONITORING - Run WEEKLY/DAILY to get real-time harvest status for all fields + +This script runs on RECENT CI data (typically last 300 days) to predict whether each field +is approaching harvest. Use this for operational decision-making and real-time alerts. 
+ +RUN FREQUENCY: Weekly (or daily if required) +INPUT: + - ci_data_for_python.csv (recent CI data from 02b_convert_rds_to_csv.R) + Location: laravel_app/storage/app/{project}/Data/extracted_ci/ci_data_for_python/ci_data_for_python.csv + - harvest_production_export.xlsx (baseline from script 01 - optional, for reference) +OUTPUT: + - harvest_imminent_weekly.csv (weekly probabilities: field, imminent_prob, detected_prob, week, year) + +Workflow: +1. Load harvest_production_export.xlsx (baseline dates - optional, for context) +2. Load ci_data_for_python.csv (recent CI data) +3. For each field, extract last 300 days of history +4. Run Model 307 inference on full sequence (last timestep probabilities) +5. Export harvest_imminent_weekly.csv with probabilities + +Output Columns: + - field: Field ID + - sub_field: Sub-field identifier + - imminent_prob: Probability field will be harvestable in next 28 days (0.0-1.0) + - detected_prob: Probability field is currently being harvested (0.0-1.0) + - week: ISO week number + - year: Year + - as_of_date: Latest date in dataset + - num_days: Number of days of history used + +Use Cases: + - Alert when imminent_prob > 0.7 (prepare harvest operations) + - Alert when detected_prob > 0.6 (field is being harvested) + - Track trends over weeks to validate baseline predictions + - Feed into 09b script for weekly dashboard reports + +Usage: + python 02_harvest_imminent_weekly.py [project_name] + + Examples: + python 02_harvest_imminent_weekly.py angata + python 02_harvest_imminent_weekly.py esa + python 02_harvest_imminent_weekly.py chemba + + If no project specified, defaults to 'angata' +""" + +import pandas as pd +import numpy as np +import torch +import subprocess +import sys +from pathlib import Path +from datetime import datetime, timedelta +from harvest_date_pred_utils import ( + load_model_and_config, + extract_features, +) + + +def load_harvest_dates(harvest_file): + """Load latest harvest end dates from Excel file (from 
harvest_production_export.xlsx).""" + print("[1/5] Loading harvest dates...") + + if not Path(harvest_file).exists(): + print(f" ERROR: {harvest_file} not found") + print(" Using 180-day lookback as default") + return None + + try: + harvest_df = pd.read_excel(harvest_file) + print(f" Loaded {len(harvest_df)} field-season records") + + # Use season_end_date column (output from harvest prediction script) + harvest_df['season_end_date'] = pd.to_datetime(harvest_df['season_end_date']) + + # Group by field and get the latest season_end_date + harvest_dates = {} + for field_id, group in harvest_df.groupby('field'): + latest_end = group['season_end_date'].max() + harvest_dates[str(field_id).strip()] = latest_end + + print(f" Successfully mapped {len(harvest_dates)} fields") + print(f" Harvest end dates range: {min(harvest_dates.values()).date()} to {max(harvest_dates.values()).date()}") + return harvest_dates + except Exception as e: + print(f" ERROR loading harvest file: {e}") + print(f" Using 180-day lookback instead") + return None + + +def run_rds_to_csv_conversion(): + """Run R script to convert RDS to CSV.""" + print("\n[2/5] Converting RDS to CSV (daily interpolation)...") + r_script = Path("02b_convert_rds_to_csv.R") + + if not r_script.exists(): + print(f" ERROR: {r_script} not found") + return False + + # Use full path to Rscript on Windows + rscript_exe = r"C:\Program Files\R\R-4.4.3\bin\x64\Rscript.exe" + + try: + result = subprocess.run( + [rscript_exe, str(r_script)], + capture_output=True, + text=True, + timeout=300 + ) + + if result.returncode != 0: + print(f" ERROR running R script:\n{result.stderr}") + return False + + # Show last few lines of output + lines = result.stdout.strip().split('\n') + for line in lines[-5:]: + if line.strip(): + print(f" {line}") + + return True + except Exception as e: + print(f" ERROR: {e}") + return False + + +def load_ci_data(csv_file): + """Load CI data.""" + print("\n[3/5] Loading CI data...") + + if not 
Path(csv_file).exists(): + print(f" ERROR: {csv_file} not found") + return None + + ci_data = pd.read_csv(csv_file) + ci_data['Date'] = pd.to_datetime(ci_data['Date']) + + print(f" Loaded {len(ci_data)} daily rows for {ci_data['field'].nunique()} fields") + print(f" Date range: {ci_data['Date'].min().date()} to {ci_data['Date'].max().date()}") + + return ci_data + + +def extract_seasonal_data(field_id, harvest_date, ci_data): + """ + Extract CI data from harvest date to latest for a specific field. + Returns dataframe sorted by date, or None if insufficient data. + """ + # field_id is int, ci_data['field'] is also int + field_data = ci_data[ci_data['field'] == field_id].copy() + + if len(field_data) == 0: + return None + + # Filter from harvest date onwards + field_data = field_data[field_data['Date'] >= harvest_date].sort_values('Date') + + # Need at least 30 days of data for meaningful inference + if len(field_data) < 30: + return None + + return field_data + + +def run_inference_on_season(field_data, model, config, scalers, device, ci_column='FitData'): + """ + Run Model 307 inference on recent field CI history. + Predicts probability that field will be ready to harvest in next 28 days. + Uses last timestep from the provided data sequence. + Returns (imminent_prob, detected_prob) for prediction. 
+ """ + try: + # Use last 300 days of data for inference (enough history for meaningful patterns, + # avoids training data seasonality mismatch) + if len(field_data) > 300: + field_data = field_data.iloc[-300:] + + # Extract features + features_array = extract_features(field_data, config['features'], ci_column) + + if features_array.shape[0] < 10: + return None, None + + # Scale features using per-feature scalers (CRITICAL: same as Phase 1 in harvest_date_pred_utils.py) + # Scalers is a list of StandardScaler objects, one per feature + if scalers and isinstance(scalers, list): + for fi, scaler in enumerate(scalers): + try: + features_array[:, fi] = scaler.transform(features_array[:, fi].reshape(-1, 1)).flatten() + except Exception: + pass + + # Run inference + with torch.no_grad(): + x_tensor = torch.tensor(features_array, dtype=torch.float32).unsqueeze(0).to(device) + out_imm, out_det = model(x_tensor) + + # Get last timestep probabilities + imminent_prob = out_imm.squeeze(0)[-1].cpu().item() + detected_prob = out_det.squeeze(0)[-1].cpu().item() + + return round(imminent_prob, 4), round(detected_prob, 4) + + except Exception as e: + return None, None + + +def main(): + # Get project name from command line or use default + project_name = sys.argv[1] if len(sys.argv) > 1 else "angata" + + # Construct paths + base_storage = Path("../laravel_app/storage/app") / project_name / "Data" + ci_data_dir = base_storage / "extracted_ci" / "ci_data_for_python" + CI_DATA_FILE = ci_data_dir / "ci_data_for_python.csv" + harvest_data_dir = base_storage / "HarvestData" + BASELINE_FILE = harvest_data_dir / "harvest_production_export.xlsx" + OUTPUT_CSV = harvest_data_dir / "harvest_imminent_weekly.csv" + harvest_data_dir.mkdir(parents=True, exist_ok=True) # Create if doesn't exist + + print("="*80) + print(f"HARVEST IMMINENT PROBABILITY - WEEKLY MONITORING ({project_name})") + print("="*80) + + # [1] Load harvest dates (optional - for projects with predictions) + harvest_dates = None 
+ if BASELINE_FILE.exists(): + harvest_dates = load_harvest_dates(BASELINE_FILE) + else: + print("[1/5] Loading harvest dates...") + print(f" INFO: {BASELINE_FILE} not found (optional for weekly monitoring)") + + # [2] Load CI data + print(f"\n[2/5] Loading CI data...") + print(f" From: {CI_DATA_FILE}") + + if not CI_DATA_FILE.exists(): + print(f" ERROR: {CI_DATA_FILE} not found") + print(f" Expected at: {CI_DATA_FILE.resolve()}") + print(f"\n Run 02b_convert_rds_to_csv.R first to generate this file:") + print(f" Rscript r_app/02b_convert_ci_rds_to_csv.R {project_name}") + return + + ci_data = load_ci_data(CI_DATA_FILE) + if ci_data is None: + print("ERROR: Could not load CI data") + return + + # [3] Load model (from python_app directory) + print("\n[3/5] Loading Model 307...") + model_dir = Path(".") # Current directory is python_app/, contains model.pt, config.json, scalers.pkl + model, config, scalers = load_model_and_config(model_dir) + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + print(f" Device: {device}") + + # [4] Run inference per field + print("\n[4/5] Running seasonal inference...") + + results_list = [] + ci_column = config['data']['ci_column'] + + # Get field metadata + field_meta = ci_data.groupby('field').agg({ + 'sub_field': 'first', + 'Date': 'max' + }).reset_index() + field_meta.columns = ['field', 'sub_field', 'latest_date'] + + count = 0 + for field_id in ci_data['field'].unique(): + # Get metadata + meta = field_meta[field_meta['field'] == field_id] + if len(meta) == 0: + continue + + sub_field = meta['sub_field'].iloc[0] + latest_date = meta['latest_date'].iloc[0] + + # Use recent CI history (last 300 days from latest available data) + field_data = ci_data[ci_data['field'] == field_id].copy() + field_data = field_data.sort_values('Date') + + # Keep last 300 days of history for inference + if len(field_data) > 300: + field_data = field_data.iloc[-300:] + + if len(field_data) < 30: + continue + + # Run inference on 
recent history to predict next 28 days + imminent_prob, detected_prob = run_inference_on_season( + field_data, model, config, scalers, device, ci_column + ) + + if imminent_prob is None: + continue + + week = int(latest_date.strftime('%V')) + year = int(latest_date.strftime('%Y')) + + results_list.append({ + 'field': field_id, + 'sub_field': sub_field, + 'imminent_prob': imminent_prob, + 'detected_prob': detected_prob, + 'week': week, + 'year': year, + 'as_of_date': latest_date, + 'num_days': len(field_data), + }) + + count += 1 + + print(f" Completed inference for {count} fields") + + # Build output DataFrame + df = pd.DataFrame(results_list) + df.to_csv(OUTPUT_CSV, index=False) + + print(f"\n[5/5] Exporting results...") + print(f"✓ Exported {len(df)} fields to {OUTPUT_CSV}") + print(f" Output location: {OUTPUT_CSV.resolve()}") + + if len(df) > 0: + print(f"\nSample rows:") + print(df[['field', 'sub_field', 'imminent_prob', 'detected_prob', 'num_days', 'week', 'year']].head(15).to_string(index=False)) + + # Show alert summary + high_imminent = len(df[df['imminent_prob'] > 0.7]) + high_detected = len(df[df['detected_prob'] > 0.6]) + print(f"\n⚠ ALERTS:") + print(f" Fields with imminent_prob > 0.70: {high_imminent}") + print(f" Fields with detected_prob > 0.60: {high_detected}") + else: + print(f" WARNING: No results exported - check CI data availability") + + print(f"\nStorage structure:") + print(f" Input CI: laravel_app/storage/app/{project_name}/Data/extracted_ci/ci_data_for_python/") + print(f" Input baseline: laravel_app/storage/app/{project_name}/Data/HarvestData/harvest_production_export.xlsx") + print(f" Output: laravel_app/storage/app/{project_name}/Data/HarvestData/") + print(f"\nReady to load into 09b field analysis report") + + +if __name__ == "__main__": + main() diff --git a/python_app/download_planet_missing_dates.py b/python_app/download_planet_missing_dates.py index dc2f870..b6723b7 100644 --- a/python_app/download_planet_missing_dates.py +++ 
b/python_app/download_planet_missing_dates.py @@ -18,6 +18,7 @@ import sys import json import datetime import argparse +import subprocess from pathlib import Path from osgeo import gdal import time @@ -441,6 +442,7 @@ def get_evalscript(): def main(): print("="*80) print("PLANET SATELLITE DATA DOWNLOADER - MISSING DATES ONLY") + print("Wrapper for 00_download_8band_pu_optimized.py") print("="*80) config_dict = get_config() @@ -495,47 +497,45 @@ def main(): print(f" - {date}") if config_dict['dry_run']: - print("\n[DRY-RUN] Would download and merge above dates") + print("\n[DRY-RUN] Would download above dates using 00_download_8band_pu_optimized.py") return 0 - # Setup BBox list - print(f"\nLoading field geometries...") - bbox_list = setup_bbox_list(paths['geojson'], resolution=config_dict['resolution']) - if bbox_list is None: - return 1 - print(f" Created {len(bbox_list)} BBox tiles") - - # Download and merge each missing date - print(f"\nDownloading missing dates...") + # Download each missing date using the optimized downloader + print(f"\n{'='*80}") + print(f"Downloading missing dates using optimized script...") print(f"{'='*80}") success_count = 0 - for i, slot in enumerate(missing_dates, 1): - print(f"\n[{i}/{len(missing_dates)}] Processing {slot}...") + for i, date_str in enumerate(missing_dates, 1): + print(f"\n[{i}/{len(missing_dates)}] Downloading {date_str}...") - # Check availability - if not is_image_available(slot, bbox_list, collection_id): - print(f" Skipping {slot} - no imagery available") - continue + # Call 00_download_8band_pu_optimized.py for this date + cmd = [ + sys.executable, + "00_download_8band_pu_optimized.py", + config_dict['project'], + "--date", date_str, + "--resolution", str(config_dict['resolution']), + "--cleanup" + ] - # Download for all bboxes - print(f" Downloading {len(bbox_list)} tiles...") - for bbox in bbox_list: - size = bbox_to_dimensions(bbox, resolution=config_dict['resolution']) - download_function(slot, bbox, size, 
paths['single_images']) - - # Merge - print(f" Merging tiles...") - if merge_files(slot, paths['single_images'], paths['merged_tifs'], paths['virtual_raster']): + try: + result = subprocess.run(cmd, check=True, capture_output=False) success_count += 1 + print(f" ✓ Successfully downloaded {date_str}") + except subprocess.CalledProcessError as e: + print(f" ✗ Failed to download {date_str}: {e}") + # Continue with next date instead of stopping + continue # Summary print(f"\n{'='*80}") print(f"SUMMARY:") print(f" Successfully processed: {success_count}/{len(missing_dates)} dates") print(f" Output folder: {paths['merged_tifs']}") + print(f"{'='*80}") - return 0 + return 0 if success_count == len(missing_dates) else 1 if __name__ == "__main__": sys.exit(main()) diff --git a/python_app/harvest_date_pred_utils.py b/python_app/harvest_date_pred_utils.py new file mode 100644 index 0000000..7c9eb07 --- /dev/null +++ b/python_app/harvest_date_pred_utils.py @@ -0,0 +1,482 @@ +""" +Self-contained utility module for two-step harvest date prediction and Excel export. +Includes model architecture, feature engineering, and core prediction logic. 
+""" + +import sys +import pandas as pd +import numpy as np +import torch +import torch.nn as nn +import pickle +import yaml +from pathlib import Path +from typing import Tuple, Dict, Any, List +from sklearn.preprocessing import StandardScaler + +# ============================================================================ +# TORCH MODELS (from src/models.py, inlined for self-containment) +# ============================================================================ + +class HarvestDetectionLSTM(nn.Module): + """Unidirectional LSTM for harvest detection with dual outputs.""" + def __init__(self, input_size: int, hidden_size: int = 128, + num_layers: int = 1, dropout: float = 0.5): + super(HarvestDetectionLSTM, self).__init__() + self.input_size = input_size + self.hidden_size = hidden_size + self.num_layers = num_layers + + self.lstm = nn.LSTM( + input_size=input_size, + hidden_size=hidden_size, + num_layers=num_layers, + dropout=dropout if num_layers > 1 else 0, + bidirectional=False, + batch_first=True + ) + + self.imminent_head = nn.Sequential( + nn.Linear(hidden_size, 16), + nn.ReLU(), + nn.Dropout(dropout), + nn.Linear(16, 1), + nn.Sigmoid() + ) + + self.detected_head = nn.Sequential( + nn.Linear(hidden_size, 16), + nn.ReLU(), + nn.Dropout(dropout), + nn.Linear(16, 1), + nn.Sigmoid() + ) + + def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + lstm_out, _ = self.lstm(x) + batch_size, seq_len, hidden_size = lstm_out.shape + lstm_flat = lstm_out.reshape(-1, hidden_size) + + imminent_flat = self.imminent_head(lstm_flat).reshape(batch_size, seq_len) + detected_flat = self.detected_head(lstm_flat).reshape(batch_size, seq_len) + + return imminent_flat, detected_flat + + +class HarvestDetectionGRU(nn.Module): + """Unidirectional GRU for harvest detection with dual outputs.""" + def __init__(self, input_size: int, hidden_size: int = 128, + num_layers: int = 1, dropout: float = 0.5): + super(HarvestDetectionGRU, self).__init__() + 
self.input_size = input_size + self.hidden_size = hidden_size + self.num_layers = num_layers + + self.gru = nn.GRU( + input_size=input_size, + hidden_size=hidden_size, + num_layers=num_layers, + dropout=dropout if num_layers > 1 else 0, + bidirectional=False, + batch_first=True + ) + + self.imminent_head = nn.Sequential( + nn.Linear(hidden_size, 16), + nn.ReLU(), + nn.Dropout(dropout), + nn.Linear(16, 1), + nn.Sigmoid() + ) + + self.detected_head = nn.Sequential( + nn.Linear(hidden_size, 16), + nn.ReLU(), + nn.Dropout(dropout), + nn.Linear(16, 1), + nn.Sigmoid() + ) + + def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + gru_out, _ = self.gru(x) + batch_size, seq_len, hidden_size = gru_out.shape + gru_flat = gru_out.reshape(-1, hidden_size) + + imminent_flat = self.imminent_head(gru_flat).reshape(batch_size, seq_len) + detected_flat = self.detected_head(gru_flat).reshape(batch_size, seq_len) + + return imminent_flat, detected_flat + + +def create_model(model_type: str, input_size: int, hidden_size: int = 128, + num_layers: int = 1, dropout: float = 0.5, device = None) -> nn.Module: + """Create a model from registry.""" + registry = {'LSTM': HarvestDetectionLSTM, 'GRU': HarvestDetectionGRU} + if model_type not in registry: + raise ValueError(f"Unknown model type: {model_type}") + + model = registry[model_type]( + input_size=input_size, + hidden_size=hidden_size, + num_layers=num_layers, + dropout=dropout + ) + + if device: + model = model.to(device) + + # Print model info + total_params = sum(p.numel() for p in model.parameters()) + trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad) + print(f"Model: {model_type}") + print(f" Input size: {input_size}") + print(f" Hidden size: {hidden_size}") + print(f" Num layers: {num_layers}") + print(f" Dropout: {dropout}") + print(f" Total parameters: {total_params:,}") + print(f" Trainable parameters: {trainable_params:,}") + print(f" Device: {device}") + + return model + + +# 
============================================================================ +# FEATURE ENGINEERING (from src/feature_engineering.py, simplified for inline) +# ============================================================================ + +def compute_ci_features(ci_series: pd.Series, doy_series: pd.Series = None) -> pd.DataFrame: + """Compute all CI-based features (state, velocity, acceleration, min/max/range/std/CV).""" + features = pd.DataFrame(index=ci_series.index) + + # State (moving averages) + features['CI_raw'] = ci_series + features['7d_MA'] = ci_series.rolling(window=7, min_periods=1).mean() + features['14d_MA'] = ci_series.rolling(window=14, min_periods=1).mean() + features['21d_MA'] = ci_series.rolling(window=21, min_periods=1).mean() + + # Velocity (gradient of MA) + for window in [7, 14, 21]: + ma = ci_series.rolling(window=window, min_periods=1).mean() + features[f'{window}d_velocity'] = ma.diff() / 1.0 # Simplified gradient + + # Acceleration (gradient of velocity) + for window in [7, 14, 21]: + ma = ci_series.rolling(window=window, min_periods=1).mean() + vel = ma.diff() + features[f'{window}d_acceleration'] = vel.diff() + + # Min, Max, Range + for window in [7, 14, 21]: + features[f'{window}d_min'] = ci_series.rolling(window=window, min_periods=1).min() + features[f'{window}d_max'] = ci_series.rolling(window=window, min_periods=1).max() + features[f'{window}d_range'] = features[f'{window}d_max'] - features[f'{window}d_min'] + + # Std and CV + for window in [7, 14, 21]: + features[f'{window}d_std'] = ci_series.rolling(window=window, min_periods=1).std() + ma = ci_series.rolling(window=window, min_periods=1).mean() + features[f'{window}d_CV'] = features[f'{window}d_std'] / (ma + 1e-6) + + # DOY normalized + if doy_series is not None: + features['DOY_normalized'] = doy_series / 450.0 + + return features.fillna(0) + + +def extract_features(data_df: pd.DataFrame, feature_names: List[str], ci_column: str = 'FitData') -> np.ndarray: + """Extract and 
return specified features as numpy array.""" + # Compute all CI features + ci_series = data_df[ci_column].astype(float) + doy_series = pd.Series(range(len(data_df)), index=data_df.index) % 365 if 'DOY_normalized' in feature_names else None + + all_features = compute_ci_features(ci_series, doy_series) + + # Select requested features + requested = [f for f in feature_names if f in all_features.columns] + if not requested: + raise ValueError(f"No valid features found. Requested: {feature_names}") + + return all_features[requested].values + + +# ============================================================================ +# MAIN UTILITY FUNCTIONS +# ============================================================================ + +def load_model_and_config(model_dir: Path): + """Load model, config, and scalers from a given directory.""" + cwd = Path.cwd() + + # Try different naming conventions + candidates = [ + # Standard names + (model_dir / "config.json", model_dir / "model.pt", model_dir / "scalers.pkl"), + # Model 307 specific names + (model_dir / "model_config.json", model_dir / "model_307.pt", model_dir / "model_scalers.pkl"), + # CWD standard names + (cwd / "config.json", cwd / "model.pt", cwd / "scalers.pkl"), + # CWD Model 307 names + (cwd / "model_config.json", cwd / "model_307.pt", cwd / "model_scalers.pkl"), + ] + + config_file = model_file = scalers_file = None + for cfg, mdl, scl in candidates: + if cfg.exists() and mdl.exists() and scl.exists(): + config_file, model_file, scalers_file = cfg, mdl, scl + print(f"Found model files in: {cfg.parent}") + break + + if not (config_file and model_file and scalers_file): + missing = [] + for cfg, mdl, scl in candidates: + if not cfg.exists(): + missing.append(str(cfg)) + if not mdl.exists(): + missing.append(str(mdl)) + if not scl.exists(): + missing.append(str(scl)) + raise FileNotFoundError( + f"Missing model files. Checked multiple locations. 
Missing: {missing}" + ) + + with open(config_file) as f: + config = yaml.safe_load(f) + + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + model = create_model( + model_type=config['model']['type'], + input_size=len(config['features']), + hidden_size=config['model']['hidden_size'], + num_layers=config['model']['num_layers'], + dropout=config['model']['dropout'], + device=device + ) + + print(f"Loading weights from: {model_file}") + model.load_state_dict(torch.load(model_file, map_location=device, weights_only=False)) + model.eval() + + with open(scalers_file, 'rb') as f: + scalers = pickle.load(f) + + return model, config, scalers + + +def load_harvest_data(data_file: Path) -> pd.DataFrame: + """Load harvest data CSV.""" + print(f"Loading data from: {data_file}") + df = pd.read_csv(data_file) + print(f"Loaded {len(df)} rows") + return df + + +def run_phase1_growing_window(field_data, model, config, scalers, ci_column, device): + """ + Phase 1: Growing window detection with threshold crossing. + Expand window day-by-day, check last timestep's detected_prob. + When 3 consecutive days have prob > 0.5, harvest detected. + Returns list of (harvest_date, harvest_idx) tuples. 
+ """ + harvest_dates = [] + current_pos = 0 + + while current_pos < len(field_data): + consecutive_above_threshold = 0 + + for window_end in range(current_pos + 1, len(field_data) + 1): + window_data = field_data.iloc[current_pos:window_end].copy().reset_index(drop=True) + + try: + features = extract_features(window_data, config['features'], ci_column=ci_column) + + # Apply scalers + for fi, scaler in enumerate(scalers): + try: + features[:, fi] = scaler.transform(features[:, fi].reshape(-1, 1)).flatten() + except Exception: + pass + + # Run model + with torch.no_grad(): + x_tensor = torch.tensor(features, dtype=torch.float32).unsqueeze(0).to(device) + imminent_probs, detected_probs = model(x_tensor) + detected_probs = detected_probs.squeeze(0).cpu().numpy() + + # Check LAST timestep + last_prob = detected_probs[-1] + + if last_prob > 0.5: + consecutive_above_threshold += 1 + else: + consecutive_above_threshold = 0 + + # Harvest detected: 3 consecutive days above threshold + if consecutive_above_threshold >= 3: + harvest_date = field_data.iloc[current_pos + window_end - 3]['Date'] + harvest_dates.append((harvest_date, current_pos + window_end - 3)) + + # Reset to next day after harvest + current_pos = current_pos + window_end - 2 + break + + except Exception: + continue + else: + break + + return harvest_dates + + +def run_phase2_refinement(field_data, phase1_harvests, model, config, scalers, ci_column, device): + """ + Phase 2: Refinement with ±40 day window. + For each Phase 1 harvest, extract window and refine with argmax. + Returns list of (harvest_date, harvest_idx) tuples. 
+ """ + refined_harvests = [] + field_data = field_data.sort_values('Date').reset_index(drop=True) + + for i, (phase1_harvest_date, phase1_idx) in enumerate(phase1_harvests): + try: + # Determine season start + if i == 0: + season_start_date = field_data.iloc[0]['Date'] + else: + prev_harvest_idx = phase1_harvests[i-1][1] + season_start_idx = prev_harvest_idx + 1 + if season_start_idx >= len(field_data): + break + season_start_date = field_data.iloc[season_start_idx]['Date'] + + # Extract ±40 day window + window_start_date = season_start_date - pd.Timedelta(days=40) + window_end_date = phase1_harvest_date + pd.Timedelta(days=40) + + window_start_idx = max(0, (field_data['Date'] >= window_start_date).idxmax() if (field_data['Date'] >= window_start_date).any() else 0) + window_end_idx = min(len(field_data), (field_data['Date'] <= window_end_date).idxmax() + 1 if (field_data['Date'] <= window_end_date).any() else len(field_data)) + + if window_end_idx <= window_start_idx: + refined_harvests.append((phase1_harvest_date, phase1_idx)) + continue + + window_data = field_data.iloc[window_start_idx:window_end_idx].copy().reset_index(drop=True) + + # Extract features for full window + features = extract_features(window_data, config['features'], ci_column=ci_column) + + # Apply scalers + for fi, scaler in enumerate(scalers): + try: + features[:, fi] = scaler.transform(features[:, fi].reshape(-1, 1)).flatten() + except Exception: + pass + + # Run model once on full window + with torch.no_grad(): + x_tensor = torch.tensor(features, dtype=torch.float32).unsqueeze(0).to(device) + imminent_probs, detected_probs = model(x_tensor) + detected_probs = detected_probs.squeeze(0).cpu().numpy() + + # Find refined harvest (argmax in window) + refined_idx_in_window = int(np.argmax(detected_probs)) + refined_idx_global = window_start_idx + refined_idx_in_window + refined_harvest_date = field_data.iloc[refined_idx_global]['Date'] + + refined_harvests.append((refined_harvest_date, 
refined_idx_global)) + + except Exception: + refined_harvests.append((phase1_harvest_date, phase1_idx)) + + return refined_harvests + + +def run_two_step_refinement(df: pd.DataFrame, model, config, scalers, device=None): + """ + Two-step harvest detection for each field: + 1. Phase 1: Growing window with 3-day threshold confirmation + 2. Phase 2: ±40 day refinement with argmax + + Returns list of dicts with field, season_start_date, season_end_date, etc. + """ + if device is None: + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + + results = [] + ci_column = config['data']['ci_column'] + + # Group by field and count total fields for progress + field_groups = list(df.groupby('field')) + total_fields = len(field_groups) + harvests_found = 0 + + print(f" Processing {total_fields} fields...") + + for idx, (field, field_data) in enumerate(field_groups, 1): + # Simple progress indicator + pct = int((idx / total_fields) * 100) + bar_length = 40 + filled = int((idx / total_fields) * bar_length) + bar = "█" * filled + "░" * (bar_length - filled) + print(f" [{bar}] {pct:3d}% ({idx}/{total_fields} fields)", end='\r') + + field_data = field_data.sort_values('Date').reset_index(drop=True) + + # Phase 1: Growing window detection + phase1_harvests = run_phase1_growing_window(field_data, model, config, scalers, ci_column, device) + + if not phase1_harvests: + continue + + # Phase 2: Refinement + phase2_harvests = run_phase2_refinement(field_data, phase1_harvests, model, config, scalers, ci_column, device) + + # Store results + for i, (harvest_date, harvest_idx) in enumerate(phase2_harvests): + if i == 0: + season_start_date = field_data.iloc[0]['Date'] + else: + prev_harvest_idx = phase2_harvests[i-1][1] + season_start_idx = prev_harvest_idx + 1 + if season_start_idx >= len(field_data): + break + season_start_date = field_data.iloc[season_start_idx]['Date'] + + season_end_date = harvest_date + + result = { + 'field': field, + 'season': i + 1, + 
'season_start_date': season_start_date,
+                'season_end_date': season_end_date,
+                'phase1_harvest_date': harvest_date,
+            }
+            results.append(result)
+            harvests_found += 1
+
+    print()  # New line after progress bar
+    print(f"   ✓ Complete: Found {harvests_found} harvest events across {total_fields} fields")
+
+    return results
+
+
+def build_production_harvest_table(refined_results: List[Dict]) -> pd.DataFrame:
+    """
+    Build a DataFrame from refined results with columns for production pipeline.
+    One row per field/season with season start and end dates (formatted as YYYY-MM-DD).
+    """
+    if not refined_results:
+        print("WARNING: No refined results to build table")
+        return pd.DataFrame(columns=['field', 'season', 'season_start_date', 'season_end_date', 'phase1_harvest_date'])
+
+    # Build DataFrame
+    df = pd.DataFrame(refined_results)
+
+    # Ensure date columns are datetime
+    df['season_start_date'] = pd.to_datetime(df['season_start_date']).dt.strftime('%Y-%m-%d')
+    df['season_end_date'] = pd.to_datetime(df['season_end_date']).dt.strftime('%Y-%m-%d')
+    df['phase1_harvest_date'] = pd.to_datetime(df['phase1_harvest_date']).dt.strftime('%Y-%m-%d')
+
+    print(f"Built production table with {len(df)} field/season combinations")
+
+    return df
diff --git a/python_app/model_307.pt b/python_app/model_307.pt
new file mode 100644
index 0000000..0a30c3d
Binary files /dev/null and b/python_app/model_307.pt differ
diff --git a/python_app/model_config.json b/python_app/model_config.json
new file mode 100644
index 0000000..3be1268
--- /dev/null
+++ b/python_app/model_config.json
@@ -0,0 +1,144 @@
+{
+  "name": "307_dropout02_with_doy",
+  "description": "Production Model 307: LSTM-based harvest detection (Phase 3, minimal regularization)",
+  "model_info": {
+    "type": "LSTM",
+    "architecture": "Unidirectional LSTM with dual output heads (imminent + detected)",
+    "total_parameters": 105120,
+    "input_features": 14,
+    "hidden_units": 256,
+    "output_heads": 2,
+    "training_data": "Historical multi-season CI data from
multiple estates", + "validation_method": "5-fold cross-validation", + "device": "GPU (CUDA) or CPU fallback" + }, + "production_scripts": { + "baseline": { + "script": "01_harvest_baseline_prediction.py", + "frequency": "Run ONCE during setup", + "purpose": "Predict all harvest dates (ground truth baseline)", + "input": "ci_data_for_python.csv (complete historical data)", + "output": "harvest_production_export.xlsx", + "time_estimate": "5-30 minutes depending on data volume" + }, + "monitoring": { + "script": "02_harvest_imminent_weekly.py", + "frequency": "Run WEEKLY (or daily if required)", + "purpose": "Real-time harvest status and imminent alerts", + "input": "ci_data_for_python.csv (recent data)", + "output": "harvest_imminent_weekly.csv", + "time_estimate": "1-5 minutes" + } + }, + "features": [ + "CI_raw", + "7d_MA", + "14d_MA", + "21d_MA", + "7d_velocity", + "14d_velocity", + "21d_velocity", + "7d_min", + "14d_min", + "21d_min", + "7d_std", + "14d_std", + "21d_std", + "DOY_normalized" + ], + "model": { + "type": "LSTM", + "hidden_size": 256, + "num_layers": 1, + "dropout": 0.2 + }, + "training": { + "imminent_days_before": 28, + "imminent_days_before_end": 1, + "detected_days_after_start": 1, + "detected_days_after_end": 21, + "k_folds": 5, + "num_epochs": 150, + "patience": 20, + "learning_rate": 0.001, + "batch_size": 4 + }, + "data": { + "csv_path": "../lstm_complete_data.csv", + "ci_column": "FitData", + "test_fraction": 0.15, + "seed": 42 + }, + "workflow_instructions": { + "overview": "Model 307 uses a two-script approach: baseline setup + weekly monitoring", + "step_1_baseline": { + "description": "Establish historical harvest date reference for all fields", + "script": "01_harvest_baseline_prediction.py", + "when": "Run once after setting up CI extraction pipeline", + "command": "conda activate python_gpu && python 01_harvest_baseline_prediction.py", + "input_data": "ci_data_for_python.csv (all available historical CI data)", + "output_file": 
"harvest_production_export.xlsx (ground truth baseline)", + "columns": [ + "field - Field ID", + "sub_field - Sub-field designation", + "season - Season number (1, 2, 3...)", + "year - Year of harvest", + "season_start_date - Start of growing season", + "season_end_date - End of season (harvest date)", + "phase1_harvest_date - Refined harvest prediction" + ], + "notes": "This becomes your reference - compare all weekly monitoring against this" + }, + "step_2_monitoring": { + "description": "Weekly real-time harvest status and imminent alerts", + "script": "02_harvest_imminent_weekly.py", + "when": "Run every week (e.g., Mondays) or daily if near harvest", + "command": "conda activate python_gpu && python 02_harvest_imminent_weekly.py", + "input_data": "ci_data_for_python.csv (latest CI data from 02b conversion)", + "output_file": "harvest_imminent_weekly.csv", + "columns": [ + "field - Field ID", + "sub_field - Sub-field designation", + "imminent_prob - Likelihood of harvest readiness in next 28 days (0.0-1.0)", + "detected_prob - Current harvest probability (0.0-1.0)", + "week - ISO week number", + "year - Year", + "as_of_date - Latest date in dataset", + "num_days - Days of history used" + ], + "alert_thresholds": { + "imminent_high": "imminent_prob > 0.7 (prepare harvest)", + "imminent_medium": "imminent_prob 0.5-0.7 (monitor closely)", + "detected_high": "detected_prob > 0.6 (active harvesting)" + } + }, + "integration_with_r_pipeline": { + "before_model_307": [ + "Planet 8-band download: download_8band_pu_optimized.ipynb", + "CI extraction: 02_ci_extraction.R", + "Convert to CSV: 02b_convert_rds_to_csv.R (outputs ci_data_for_python.csv)" + ], + "model_307_here": [ + "BASELINE: 01_harvest_baseline_prediction.py (run once)", + "MONITORING: 02_harvest_imminent_weekly.py (run weekly)" + ], + "after_model_307": [ + "Field analysis: 09b_field_analysis_weekly.R (reads harvest predictions)", + "Reports: 10_CI_report_with_kpis.Rmd (includes harvest status)" + ] + }, + 
"environment_requirements": { + "python_env": "python_gpu", + "activation": "conda activate python_gpu", + "required_packages": [ + "torch (GPU-enabled)", + "pandas", + "numpy", + "scikit-learn", + "pyyaml", + "openpyxl" + ], + "gpu": "NVIDIA GPU with CUDA (optional - falls back to CPU if unavailable)" + } + } +} \ No newline at end of file diff --git a/python_app/model_scalers.pkl b/python_app/model_scalers.pkl new file mode 100644 index 0000000..4c1ead5 Binary files /dev/null and b/python_app/model_scalers.pkl differ diff --git a/r_app/02b_convert_ci_rds_to_csv.R b/r_app/02b_convert_ci_rds_to_csv.R index bdf57c6..f75f6af 100644 --- a/r_app/02b_convert_ci_rds_to_csv.R +++ b/r_app/02b_convert_ci_rds_to_csv.R @@ -15,9 +15,113 @@ suppressPackageStartupMessages({ library(tidyverse) library(lubridate) + library(zoo) library(here) }) +# ============================================================================ +# HELPER FUNCTIONS +# ============================================================================ + +#' Convert wide format RDS to long format +#' +#' @param ci_data_wide Tibble with columns: field, sub_field, and dates as columns +#' @return Long format tibble: field, sub_field, Date, FitData +wide_to_long_ci_data <- function(ci_data_wide) { + ci_data_wide %>% + pivot_longer( + cols = -c(field, sub_field), + names_to = "Date", + values_to = "FitData", + values_drop_na = TRUE + ) %>% + mutate( + Date = as.Date(Date), + FitData = as.numeric(FitData) + ) %>% + filter(!is.na(FitData)) +} + +#' Create daily interpolated sequences with DOY for each field +#' +#' For each field/sub_field combination, creates complete daily sequences from first to last date, +#' fills in measurements, and interpolates missing dates. 
+#' +#' @param ci_data_long Long format tibble: field, sub_field, Date, FitData +#' @return Tibble with: field, sub_field, Date, FitData, DOY, value +create_interpolated_daily_sequences <- function(ci_data_long) { + ci_data_long %>% + group_by(field, sub_field) %>% + nest() %>% + mutate( + data = map(data, function(df) { + # Sort measurements by date + df <- df %>% arrange(Date) + + # Create complete daily sequence from first to last date + date_seq <- seq(min(df$Date), max(df$Date), by = "day") + + # Build daily dataframe (field/sub_field stay in outer df, not here) + daily_df <- tibble( + Date = date_seq, + value = NA_real_, + FitData = NA_real_, + DOY = seq_along(date_seq) # Continuous day counter: 1, 2, 3, ... + ) + + # Fill in actual measurement values + for (i in seq_len(nrow(df))) { + idx <- which(daily_df$Date == df$Date[i]) + if (length(idx) > 0) { + daily_df$value[idx] <- df$FitData[i] + } + } + + # Interpolate missing dates linearly + daily_df$FitData <- zoo::na.approx(daily_df$value, na.rm = FALSE) + + daily_df + }) + ) %>% + unnest(data) %>% + select(field, sub_field, Date, FitData, DOY, value) %>% + arrange(field, Date) +} + +#' Validate conversion output +#' +#' @param ci_data_python Tibble with converted CI data +#' @return Invisibly returns the tibble (for piping) +validate_conversion_output <- function(ci_data_python) { + cat(sprintf("\nValidation:\n")) + cat(sprintf(" Unique fields: %d\n", n_distinct(ci_data_python$field))) + cat(sprintf(" Total daily rows: %d\n", nrow(ci_data_python))) + cat(sprintf(" Date range: %s to %s\n", + min(ci_data_python$Date, na.rm = TRUE), + max(ci_data_python$Date, na.rm = TRUE))) + cat(sprintf(" FitData range: %.2f to %.2f\n", + min(ci_data_python$FitData, na.rm = TRUE), + max(ci_data_python$FitData, na.rm = TRUE))) + cat(sprintf(" Raw measurements: %d\n", sum(!is.na(ci_data_python$value)))) + cat(sprintf(" Interpolated values: %d\n", sum(is.na(ci_data_python$value) & !is.na(ci_data_python$FitData)))) + + 
invisible(ci_data_python) +} + +#' Print next steps message +print_next_steps <- function() { + cat("\nNext steps for Python harvest detection:\n") + cat(" 1. Read this CSV file in Python\n") + cat(" 2. Group by field to identify seasons\n") + cat(" 3. Run LSTM model to detect harvest dates\n") + cat(" 4. Save predicted harvest dates to Excel\n") + cat(" 5. Use output in script 03 for interpolation\n") +} + +# ============================================================================ +# MAIN FUNCTION +# ============================================================================ + main <- function() { # Process command line arguments args <- commandArgs(trailingOnly = TRUE) @@ -28,7 +132,7 @@ main <- function() { } else if (exists("project_dir", envir = .GlobalEnv)) { project_dir <- get("project_dir", envir = .GlobalEnv) } else { - project_dir <- "esa" + project_dir <- "angata" } # Make available globally @@ -49,9 +153,17 @@ main <- function() { }) # Define paths - ci_data_dir <- here::here("laravel_app", "storage", "app", project_dir, "Data", "extracted_ci", "cumulative_vals") - input_file <- file.path(ci_data_dir, "combined_CI_data.rds") - output_file <- file.path(ci_data_dir, "ci_data_for_python.csv") + ci_data_source_dir <- here::here("laravel_app", "storage", "app", project_dir, "Data", "extracted_ci", "cumulative_vals") + ci_data_output_dir <- here::here("laravel_app", "storage", "app", project_dir, "Data", "extracted_ci", "ci_data_for_python") + + # Create output directory if it doesn't exist (for new projects) + if (!dir.exists(ci_data_output_dir)) { + dir.create(ci_data_output_dir, recursive = TRUE, showWarnings = FALSE) + cat(sprintf("✓ Created output directory: %s\n", ci_data_output_dir)) + } + + input_file <- file.path(ci_data_source_dir, "combined_CI_data.rds") + output_file <- file.path(ci_data_output_dir, "ci_data_for_python.csv") # Check if input file exists if (!file.exists(input_file)) { @@ -61,52 +173,32 @@ main <- function() { 
cat(sprintf("Loading: %s\n", input_file)) # Load RDS file - ci_data <- readRDS(input_file) %>% + ci_data_wide <- readRDS(input_file) %>% as_tibble() - cat(sprintf(" Loaded %d rows\n", nrow(ci_data))) - cat(sprintf(" Columns: %s\n", paste(names(ci_data), collapse = ", "))) + cat(sprintf(" Loaded %d rows\n", nrow(ci_data_wide))) + cat(sprintf(" Format: WIDE (field, sub_field, then dates as columns)\n")) + cat(sprintf(" Sample columns: %s\n", paste(names(ci_data_wide)[1:6], collapse = ", "))) - # Prepare data for Python - ci_data_python <- ci_data %>% - # Ensure standard column names - rename( - field = field, - sub_field = sub_field, - Date = Date, - FitData = FitData, - DOY = DOY - ) %>% - # Add 'value' as an alias for FitData (sometimes needed) - mutate(value = FitData) %>% - # Keep only necessary columns - select(field, sub_field, Date, FitData, DOY, value) %>% - # Sort by field and date - arrange(field, Date) + # Step 1: Convert from WIDE to LONG format + cat("\nStep 1: Converting from wide to long format...\n") + ci_data_long <- wide_to_long_ci_data(ci_data_wide) - # Validate data - cat(sprintf("\nValidation:\n")) - cat(sprintf(" Unique fields: %d\n", n_distinct(ci_data_python$field))) - cat(sprintf(" Date range: %s to %s\n", - min(ci_data_python$Date, na.rm = TRUE), - max(ci_data_python$Date, na.rm = TRUE))) - cat(sprintf(" FitData range: %.2f to %.2f\n", - min(ci_data_python$FitData, na.rm = TRUE), - max(ci_data_python$FitData, na.rm = TRUE))) - cat(sprintf(" Missing FitData: %d rows\n", sum(is.na(ci_data_python$FitData)))) + # Step 2: Create complete daily sequences with interpolation + cat("\nStep 2: Creating complete daily sequences with interpolation...\n") + ci_data_python <- create_interpolated_daily_sequences(ci_data_long) - # Save to CSV - cat(sprintf("\nSaving to: %s\n", output_file)) + # Step 3: Validate output + cat("\nStep 3: Validating output...") + validate_conversion_output(ci_data_python) + # Step 4: Save to CSV + cat(sprintf("\nStep 4: Saving 
to CSV...\n")) + cat(sprintf(" Output: %s\n", output_file)) write_csv(ci_data_python, output_file) - cat(sprintf("✓ Successfully created CSV with %d rows\n", nrow(ci_data_python))) - cat("\nNext steps for Python harvest detection:\n") - cat(" 1. Read this CSV file in Python\n") - cat(" 2. Group by field to identify seasons\n") - cat(" 3. Run LSTM model to detect harvest dates\n") - cat(" 4. Save predicted harvest dates to Excel\n") - cat(" 5. Use output in script 03 for interpolation\n") + cat(sprintf("\n✓ Successfully created CSV with %d rows\n", nrow(ci_data_python))) + print_next_steps() } if (sys.nframe() == 0) { diff --git a/webapps/geojson_viewer.html b/webapps/geojson_viewer.html new file mode 100644 index 0000000..36511a1 --- /dev/null +++ b/webapps/geojson_viewer.html @@ -0,0 +1,1328 @@ + + + + + + GeoJSON Viewer + + + + + +
+
+

📍 GeoJSON Viewer

+

Upload a GeoJSON file to visualize features on an interactive map

+
+ +
+ + +
+
+
+ +
+
+
+ + + +
+ + + + + diff --git a/webapps/index.html b/webapps/index.html index deef357..918f1e3 100644 --- a/webapps/index.html +++ b/webapps/index.html @@ -212,6 +212,22 @@ Open Tool + + +
+
📍
+
+

GeoJSON Viewer

+

Upload and visualize GeoJSON files on an interactive map with feature properties.

+ + Open Viewer +
+