updated code to fit in workflow better
This commit is contained in:
parent
d1303dd218
commit
cb63cf00b8
|
|
@ -31,7 +31,7 @@ Examples:
|
||||||
python download_8band_pu_optimized.py chemba # Uses today's date
|
python download_8band_pu_optimized.py chemba # Uses today's date
|
||||||
python download_8band_pu_optimized.py xinavane --clear-singles --cleanup
|
python download_8band_pu_optimized.py xinavane --clear-singles --cleanup
|
||||||
python download_8band_pu_optimized.py angata --clear-all --resolution 5
|
python download_8band_pu_optimized.py angata --clear-all --resolution 5
|
||||||
|
|
||||||
Cost Model:
|
Cost Model:
|
||||||
- 4-band uint16 with cloud masking: ~50% lower cost than 9-band FLOAT32
|
- 4-band uint16 with cloud masking: ~50% lower cost than 9-band FLOAT32
|
||||||
- Reduced bbox sizes: ~10-20% lower cost due to smaller average tile size
|
- Reduced bbox sizes: ~10-20% lower cost due to smaller average tile size
|
||||||
|
|
@ -39,6 +39,18 @@ Cost Model:
|
||||||
- Requests: Slightly higher (~50-60 tiles) but within 700k budget
|
- Requests: Slightly higher (~50-60 tiles) but within 700k budget
|
||||||
|
|
||||||
Expected result: ~75% PU savings with dynamic geometry-fitted grid
|
Expected result: ~75% PU savings with dynamic geometry-fitted grid
|
||||||
|
|
||||||
|
Example running it in powershell:
|
||||||
|
$startDate = [DateTime]::ParseExact("2025-11-01", "yyyy-MM-dd", $null)
|
||||||
|
$endDate = [DateTime]::ParseExact("2025-12-24", "yyyy-MM-dd", $null)
|
||||||
|
|
||||||
|
$current = $startDate
|
||||||
|
while ($current -le $endDate) {
|
||||||
|
$dateStr = $current.ToString("yyyy-MM-dd")
|
||||||
|
Write-Host "Downloading $dateStr..."
|
||||||
|
python download_8band_pu_optimized.py angata --date $dateStr
|
||||||
|
$current = $current.AddDays(1)
|
||||||
|
}
|
||||||
"""
|
"""
|
||||||
|
|
||||||
import os
|
import os
|
||||||
111
python_app/01_harvest_baseline_prediction.py
Normal file
111
python_app/01_harvest_baseline_prediction.py
Normal file
|
|
@ -0,0 +1,111 @@
|
||||||
|
"""
|
||||||
|
Script: 01_harvest_baseline_prediction.py
|
||||||
|
Purpose: BASELINE PREDICTION - Run ONCE to establish harvest date baseline for all fields and seasons
|
||||||
|
|
||||||
|
This script processes COMPLETE historical CI data (all available dates) and uses Model 307
|
||||||
|
to predict ALL harvest dates across the entire dataset. This becomes your reference baseline
|
||||||
|
for monitoring and comparisons going forward.
|
||||||
|
|
||||||
|
RUN FREQUENCY: Once during initial setup
|
||||||
|
INPUT: ci_data_for_python.csv (complete historical CI data from 02b_convert_rds_to_csv.R)
|
||||||
|
Location: laravel_app/storage/app/{project}/Data/extracted_ci/ci_data_for_python/ci_data_for_python.csv
|
||||||
|
OUTPUT: harvest_production_export.xlsx (baseline harvest predictions for all fields/seasons)
|
||||||
|
|
||||||
|
Workflow:
|
||||||
|
1. Load ci_data_for_python.csv (daily interpolated, all historical dates)
|
||||||
|
2. Group data by field and season (Model 307 detects season boundaries internally)
|
||||||
|
3. Run two-step harvest detection (Phase 1: fast detection, Phase 2: ±40 day refinement)
|
||||||
|
4. Export harvest_production_export.xlsx with columns:
|
||||||
|
- field, sub_field, season, year, season_start_date, season_end_date, phase1_harvest_date
|
||||||
|
|
||||||
|
Two-Step Detection Algorithm:
|
||||||
|
Phase 1 (Growing Window): Expands daily, checks when detected_prob > 0.5 for 3 consecutive days
|
||||||
|
Phase 2 (Refinement): Extracts ±40 day window, finds peak harvest signal with argmax
|
||||||
|
|
||||||
|
This is your GROUND TRUTH - compare all future predictions against this baseline.
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
python 01_harvest_baseline_prediction.py [project_name]
|
||||||
|
|
||||||
|
Examples:
|
||||||
|
python 01_harvest_baseline_prediction.py angata
|
||||||
|
python 01_harvest_baseline_prediction.py esa
|
||||||
|
python 01_harvest_baseline_prediction.py chemba
|
||||||
|
|
||||||
|
If no project specified, defaults to 'angata'
|
||||||
|
"""
|
||||||
|
|
||||||
|
import pandas as pd
|
||||||
|
import numpy as np
|
||||||
|
import torch
|
||||||
|
import sys
|
||||||
|
from pathlib import Path
|
||||||
|
from harvest_date_pred_utils import (
|
||||||
|
load_model_and_config,
|
||||||
|
extract_features,
|
||||||
|
run_two_step_refinement,
|
||||||
|
build_production_harvest_table
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def main():
|
||||||
|
# Get project name from command line or use default
|
||||||
|
project_name = sys.argv[1] if len(sys.argv) > 1 else "angata"
|
||||||
|
|
||||||
|
# Construct paths
|
||||||
|
base_storage = Path("../laravel_app/storage/app") / project_name / "Data"
|
||||||
|
ci_data_dir = base_storage / "extracted_ci" / "ci_data_for_python"
|
||||||
|
CI_DATA_FILE = ci_data_dir / "ci_data_for_python.csv"
|
||||||
|
harvest_data_dir = base_storage / "HarvestData"
|
||||||
|
harvest_data_dir.mkdir(parents=True, exist_ok=True) # Create if doesn't exist
|
||||||
|
OUTPUT_XLSX = harvest_data_dir / "harvest_production_export.xlsx"
|
||||||
|
MODEL_DIR = Path(".") # Model files in python_app/
|
||||||
|
|
||||||
|
# Check if input exists
|
||||||
|
if not CI_DATA_FILE.exists():
|
||||||
|
print(f"ERROR: {CI_DATA_FILE} not found")
|
||||||
|
print(f" Expected at: {CI_DATA_FILE.resolve()}")
|
||||||
|
print(f"\n Run 02b_convert_rds_to_csv.R first to generate this file:")
|
||||||
|
print(f" Rscript r_app/02b_convert_ci_rds_to_csv.R {project_name}")
|
||||||
|
return
|
||||||
|
|
||||||
|
print("="*80)
|
||||||
|
print(f"HARVEST DATE PREDICTION - LSTM MODEL 307 ({project_name})")
|
||||||
|
print("="*80)
|
||||||
|
|
||||||
|
# [1/4] Load model
|
||||||
|
print("\n[1/4] Loading Model 307...")
|
||||||
|
model, config, scalers = load_model_and_config(MODEL_DIR)
|
||||||
|
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
||||||
|
print(f" Device: {device}")
|
||||||
|
|
||||||
|
# [2/4] Load and prepare CI data
|
||||||
|
print("\n[2/4] Loading CI data...")
|
||||||
|
print(f" From: {CI_DATA_FILE}")
|
||||||
|
ci_data = pd.read_csv(CI_DATA_FILE)
|
||||||
|
ci_data['Date'] = pd.to_datetime(ci_data['Date'])
|
||||||
|
print(f" Loaded {len(ci_data)} daily rows across {ci_data['field'].nunique()} fields")
|
||||||
|
print(f" Date range: {ci_data['Date'].min().date()} to {ci_data['Date'].max().date()}")
|
||||||
|
|
||||||
|
# [3/4] Run model predictions with two-step detection
|
||||||
|
print("\n[3/4] Running two-step harvest detection...")
|
||||||
|
refined_results = run_two_step_refinement(ci_data, model, config, scalers, device=device)
|
||||||
|
|
||||||
|
# Build and export
|
||||||
|
print("\nBuilding production harvest table...")
|
||||||
|
prod_table = build_production_harvest_table(refined_results)
|
||||||
|
|
||||||
|
prod_table.to_excel(OUTPUT_XLSX, index=False)
|
||||||
|
print(f"\n✓ Exported {len(prod_table)} predictions to {OUTPUT_XLSX}")
|
||||||
|
print(f"\nOutput location: {OUTPUT_XLSX.resolve()}")
|
||||||
|
print(f"\nStorage structure:")
|
||||||
|
print(f" Input: laravel_app/storage/app/{project_name}/Data/extracted_ci/ci_data_for_python/")
|
||||||
|
print(f" Output: laravel_app/storage/app/{project_name}/Data/HarvestData/")
|
||||||
|
print(f"\nColumn structure:")
|
||||||
|
print(f" field, sub_field, season, year, season_start_date, season_end_date, phase1_harvest_date")
|
||||||
|
print(f"\nNext steps:")
|
||||||
|
print(f" 1. Review baseline predictions in harvest_production_export.xlsx")
|
||||||
|
print(f" 2. Run weekly monitoring: python 02_harvest_imminent_weekly.py {project_name}")
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
348
python_app/02_harvest_imminent_weekly.py
Normal file
348
python_app/02_harvest_imminent_weekly.py
Normal file
|
|
@ -0,0 +1,348 @@
|
||||||
|
"""
|
||||||
|
Script: 02_harvest_imminent_weekly.py
|
||||||
|
Purpose: WEEKLY MONITORING - Run WEEKLY/DAILY to get real-time harvest status for all fields
|
||||||
|
|
||||||
|
This script runs on RECENT CI data (typically last 300 days) to predict whether each field
|
||||||
|
is approaching harvest. Use this for operational decision-making and real-time alerts.
|
||||||
|
|
||||||
|
RUN FREQUENCY: Weekly (or daily if required)
|
||||||
|
INPUT:
|
||||||
|
- ci_data_for_python.csv (recent CI data from 02b_convert_rds_to_csv.R)
|
||||||
|
Location: laravel_app/storage/app/{project}/Data/extracted_ci/ci_data_for_python/ci_data_for_python.csv
|
||||||
|
- harvest_production_export.xlsx (baseline from script 01 - optional, for reference)
|
||||||
|
OUTPUT:
|
||||||
|
- harvest_imminent_weekly.csv (weekly probabilities: field, imminent_prob, detected_prob, week, year)
|
||||||
|
|
||||||
|
Workflow:
|
||||||
|
1. Load harvest_production_export.xlsx (baseline dates - optional, for context)
|
||||||
|
2. Load ci_data_for_python.csv (recent CI data)
|
||||||
|
3. For each field, extract last 300 days of history
|
||||||
|
4. Run Model 307 inference on full sequence (last timestep probabilities)
|
||||||
|
5. Export harvest_imminent_weekly.csv with probabilities
|
||||||
|
|
||||||
|
Output Columns:
|
||||||
|
- field: Field ID
|
||||||
|
- sub_field: Sub-field identifier
|
||||||
|
- imminent_prob: Probability field will be harvestable in next 28 days (0.0-1.0)
|
||||||
|
- detected_prob: Probability field is currently being harvested (0.0-1.0)
|
||||||
|
- week: ISO week number
|
||||||
|
- year: Year
|
||||||
|
- as_of_date: Latest date in dataset
|
||||||
|
- num_days: Number of days of history used
|
||||||
|
|
||||||
|
Use Cases:
|
||||||
|
- Alert when imminent_prob > 0.7 (prepare harvest operations)
|
||||||
|
- Alert when detected_prob > 0.6 (field is being harvested)
|
||||||
|
- Track trends over weeks to validate baseline predictions
|
||||||
|
- Feed into 09b script for weekly dashboard reports
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
python 02_harvest_imminent_weekly.py [project_name]
|
||||||
|
|
||||||
|
Examples:
|
||||||
|
python 02_harvest_imminent_weekly.py angata
|
||||||
|
python 02_harvest_imminent_weekly.py esa
|
||||||
|
python 02_harvest_imminent_weekly.py chemba
|
||||||
|
|
||||||
|
If no project specified, defaults to 'angata'
|
||||||
|
"""
|
||||||
|
|
||||||
|
import pandas as pd
|
||||||
|
import numpy as np
|
||||||
|
import torch
|
||||||
|
import subprocess
|
||||||
|
import sys
|
||||||
|
from pathlib import Path
|
||||||
|
from datetime import datetime, timedelta
|
||||||
|
from harvest_date_pred_utils import (
|
||||||
|
load_model_and_config,
|
||||||
|
extract_features,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def load_harvest_dates(harvest_file):
|
||||||
|
"""Load latest harvest end dates from Excel file (from harvest_production_export.xlsx)."""
|
||||||
|
print("[1/5] Loading harvest dates...")
|
||||||
|
|
||||||
|
if not Path(harvest_file).exists():
|
||||||
|
print(f" ERROR: {harvest_file} not found")
|
||||||
|
print(" Using 180-day lookback as default")
|
||||||
|
return None
|
||||||
|
|
||||||
|
try:
|
||||||
|
harvest_df = pd.read_excel(harvest_file)
|
||||||
|
print(f" Loaded {len(harvest_df)} field-season records")
|
||||||
|
|
||||||
|
# Use season_end_date column (output from harvest prediction script)
|
||||||
|
harvest_df['season_end_date'] = pd.to_datetime(harvest_df['season_end_date'])
|
||||||
|
|
||||||
|
# Group by field and get the latest season_end_date
|
||||||
|
harvest_dates = {}
|
||||||
|
for field_id, group in harvest_df.groupby('field'):
|
||||||
|
latest_end = group['season_end_date'].max()
|
||||||
|
harvest_dates[str(field_id).strip()] = latest_end
|
||||||
|
|
||||||
|
print(f" Successfully mapped {len(harvest_dates)} fields")
|
||||||
|
print(f" Harvest end dates range: {min(harvest_dates.values()).date()} to {max(harvest_dates.values()).date()}")
|
||||||
|
return harvest_dates
|
||||||
|
except Exception as e:
|
||||||
|
print(f" ERROR loading harvest file: {e}")
|
||||||
|
print(f" Using 180-day lookback instead")
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def run_rds_to_csv_conversion():
|
||||||
|
"""Run R script to convert RDS to CSV."""
|
||||||
|
print("\n[2/5] Converting RDS to CSV (daily interpolation)...")
|
||||||
|
r_script = Path("02b_convert_rds_to_csv.R")
|
||||||
|
|
||||||
|
if not r_script.exists():
|
||||||
|
print(f" ERROR: {r_script} not found")
|
||||||
|
return False
|
||||||
|
|
||||||
|
# Use full path to Rscript on Windows
|
||||||
|
rscript_exe = r"C:\Program Files\R\R-4.4.3\bin\x64\Rscript.exe"
|
||||||
|
|
||||||
|
try:
|
||||||
|
result = subprocess.run(
|
||||||
|
[rscript_exe, str(r_script)],
|
||||||
|
capture_output=True,
|
||||||
|
text=True,
|
||||||
|
timeout=300
|
||||||
|
)
|
||||||
|
|
||||||
|
if result.returncode != 0:
|
||||||
|
print(f" ERROR running R script:\n{result.stderr}")
|
||||||
|
return False
|
||||||
|
|
||||||
|
# Show last few lines of output
|
||||||
|
lines = result.stdout.strip().split('\n')
|
||||||
|
for line in lines[-5:]:
|
||||||
|
if line.strip():
|
||||||
|
print(f" {line}")
|
||||||
|
|
||||||
|
return True
|
||||||
|
except Exception as e:
|
||||||
|
print(f" ERROR: {e}")
|
||||||
|
return False
|
||||||
|
|
||||||
|
|
||||||
|
def load_ci_data(csv_file):
|
||||||
|
"""Load CI data."""
|
||||||
|
print("\n[3/5] Loading CI data...")
|
||||||
|
|
||||||
|
if not Path(csv_file).exists():
|
||||||
|
print(f" ERROR: {csv_file} not found")
|
||||||
|
return None
|
||||||
|
|
||||||
|
ci_data = pd.read_csv(csv_file)
|
||||||
|
ci_data['Date'] = pd.to_datetime(ci_data['Date'])
|
||||||
|
|
||||||
|
print(f" Loaded {len(ci_data)} daily rows for {ci_data['field'].nunique()} fields")
|
||||||
|
print(f" Date range: {ci_data['Date'].min().date()} to {ci_data['Date'].max().date()}")
|
||||||
|
|
||||||
|
return ci_data
|
||||||
|
|
||||||
|
|
||||||
|
def extract_seasonal_data(field_id, harvest_date, ci_data):
|
||||||
|
"""
|
||||||
|
Extract CI data from harvest date to latest for a specific field.
|
||||||
|
Returns dataframe sorted by date, or None if insufficient data.
|
||||||
|
"""
|
||||||
|
# field_id is int, ci_data['field'] is also int
|
||||||
|
field_data = ci_data[ci_data['field'] == field_id].copy()
|
||||||
|
|
||||||
|
if len(field_data) == 0:
|
||||||
|
return None
|
||||||
|
|
||||||
|
# Filter from harvest date onwards
|
||||||
|
field_data = field_data[field_data['Date'] >= harvest_date].sort_values('Date')
|
||||||
|
|
||||||
|
# Need at least 30 days of data for meaningful inference
|
||||||
|
if len(field_data) < 30:
|
||||||
|
return None
|
||||||
|
|
||||||
|
return field_data
|
||||||
|
|
||||||
|
|
||||||
|
def run_inference_on_season(field_data, model, config, scalers, device, ci_column='FitData'):
|
||||||
|
"""
|
||||||
|
Run Model 307 inference on recent field CI history.
|
||||||
|
Predicts probability that field will be ready to harvest in next 28 days.
|
||||||
|
Uses last timestep from the provided data sequence.
|
||||||
|
Returns (imminent_prob, detected_prob) for prediction.
|
||||||
|
"""
|
||||||
|
try:
|
||||||
|
# Use last 300 days of data for inference (enough history for meaningful patterns,
|
||||||
|
# avoids training data seasonality mismatch)
|
||||||
|
if len(field_data) > 300:
|
||||||
|
field_data = field_data.iloc[-300:]
|
||||||
|
|
||||||
|
# Extract features
|
||||||
|
features_array = extract_features(field_data, config['features'], ci_column)
|
||||||
|
|
||||||
|
if features_array.shape[0] < 10:
|
||||||
|
return None, None
|
||||||
|
|
||||||
|
# Scale features using per-feature scalers (CRITICAL: same as Phase 1 in harvest_date_pred_utils.py)
|
||||||
|
# Scalers is a list of StandardScaler objects, one per feature
|
||||||
|
if scalers and isinstance(scalers, list):
|
||||||
|
for fi, scaler in enumerate(scalers):
|
||||||
|
try:
|
||||||
|
features_array[:, fi] = scaler.transform(features_array[:, fi].reshape(-1, 1)).flatten()
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
|
||||||
|
# Run inference
|
||||||
|
with torch.no_grad():
|
||||||
|
x_tensor = torch.tensor(features_array, dtype=torch.float32).unsqueeze(0).to(device)
|
||||||
|
out_imm, out_det = model(x_tensor)
|
||||||
|
|
||||||
|
# Get last timestep probabilities
|
||||||
|
imminent_prob = out_imm.squeeze(0)[-1].cpu().item()
|
||||||
|
detected_prob = out_det.squeeze(0)[-1].cpu().item()
|
||||||
|
|
||||||
|
return round(imminent_prob, 4), round(detected_prob, 4)
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
return None, None
|
||||||
|
|
||||||
|
|
||||||
|
def main():
|
||||||
|
# Get project name from command line or use default
|
||||||
|
project_name = sys.argv[1] if len(sys.argv) > 1 else "angata"
|
||||||
|
|
||||||
|
# Construct paths
|
||||||
|
base_storage = Path("../laravel_app/storage/app") / project_name / "Data"
|
||||||
|
ci_data_dir = base_storage / "extracted_ci" / "ci_data_for_python"
|
||||||
|
CI_DATA_FILE = ci_data_dir / "ci_data_for_python.csv"
|
||||||
|
harvest_data_dir = base_storage / "HarvestData"
|
||||||
|
BASELINE_FILE = harvest_data_dir / "harvest_production_export.xlsx"
|
||||||
|
OUTPUT_CSV = harvest_data_dir / "harvest_imminent_weekly.csv"
|
||||||
|
harvest_data_dir.mkdir(parents=True, exist_ok=True) # Create if doesn't exist
|
||||||
|
|
||||||
|
print("="*80)
|
||||||
|
print(f"HARVEST IMMINENT PROBABILITY - WEEKLY MONITORING ({project_name})")
|
||||||
|
print("="*80)
|
||||||
|
|
||||||
|
# [1] Load harvest dates (optional - for projects with predictions)
|
||||||
|
harvest_dates = None
|
||||||
|
if BASELINE_FILE.exists():
|
||||||
|
harvest_dates = load_harvest_dates(BASELINE_FILE)
|
||||||
|
else:
|
||||||
|
print("[1/5] Loading harvest dates...")
|
||||||
|
print(f" INFO: {BASELINE_FILE} not found (optional for weekly monitoring)")
|
||||||
|
|
||||||
|
# [2] Load CI data
|
||||||
|
print(f"\n[2/5] Loading CI data...")
|
||||||
|
print(f" From: {CI_DATA_FILE}")
|
||||||
|
|
||||||
|
if not CI_DATA_FILE.exists():
|
||||||
|
print(f" ERROR: {CI_DATA_FILE} not found")
|
||||||
|
print(f" Expected at: {CI_DATA_FILE.resolve()}")
|
||||||
|
print(f"\n Run 02b_convert_rds_to_csv.R first to generate this file:")
|
||||||
|
print(f" Rscript r_app/02b_convert_ci_rds_to_csv.R {project_name}")
|
||||||
|
return
|
||||||
|
|
||||||
|
ci_data = load_ci_data(CI_DATA_FILE)
|
||||||
|
if ci_data is None:
|
||||||
|
print("ERROR: Could not load CI data")
|
||||||
|
return
|
||||||
|
|
||||||
|
# [3] Load model (from python_app directory)
|
||||||
|
print("\n[3/5] Loading Model 307...")
|
||||||
|
model_dir = Path(".") # Current directory is python_app/, contains model.pt, config.json, scalers.pkl
|
||||||
|
model, config, scalers = load_model_and_config(model_dir)
|
||||||
|
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
||||||
|
print(f" Device: {device}")
|
||||||
|
|
||||||
|
# [4] Run inference per field
|
||||||
|
print("\n[4/5] Running seasonal inference...")
|
||||||
|
|
||||||
|
results_list = []
|
||||||
|
ci_column = config['data']['ci_column']
|
||||||
|
|
||||||
|
# Get field metadata
|
||||||
|
field_meta = ci_data.groupby('field').agg({
|
||||||
|
'sub_field': 'first',
|
||||||
|
'Date': 'max'
|
||||||
|
}).reset_index()
|
||||||
|
field_meta.columns = ['field', 'sub_field', 'latest_date']
|
||||||
|
|
||||||
|
count = 0
|
||||||
|
for field_id in ci_data['field'].unique():
|
||||||
|
# Get metadata
|
||||||
|
meta = field_meta[field_meta['field'] == field_id]
|
||||||
|
if len(meta) == 0:
|
||||||
|
continue
|
||||||
|
|
||||||
|
sub_field = meta['sub_field'].iloc[0]
|
||||||
|
latest_date = meta['latest_date'].iloc[0]
|
||||||
|
|
||||||
|
# Use recent CI history (last 300 days from latest available data)
|
||||||
|
field_data = ci_data[ci_data['field'] == field_id].copy()
|
||||||
|
field_data = field_data.sort_values('Date')
|
||||||
|
|
||||||
|
# Keep last 300 days of history for inference
|
||||||
|
if len(field_data) > 300:
|
||||||
|
field_data = field_data.iloc[-300:]
|
||||||
|
|
||||||
|
if len(field_data) < 30:
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Run inference on recent history to predict next 28 days
|
||||||
|
imminent_prob, detected_prob = run_inference_on_season(
|
||||||
|
field_data, model, config, scalers, device, ci_column
|
||||||
|
)
|
||||||
|
|
||||||
|
if imminent_prob is None:
|
||||||
|
continue
|
||||||
|
|
||||||
|
week = int(latest_date.strftime('%V'))
|
||||||
|
year = int(latest_date.strftime('%Y'))
|
||||||
|
|
||||||
|
results_list.append({
|
||||||
|
'field': field_id,
|
||||||
|
'sub_field': sub_field,
|
||||||
|
'imminent_prob': imminent_prob,
|
||||||
|
'detected_prob': detected_prob,
|
||||||
|
'week': week,
|
||||||
|
'year': year,
|
||||||
|
'as_of_date': latest_date,
|
||||||
|
'num_days': len(field_data),
|
||||||
|
})
|
||||||
|
|
||||||
|
count += 1
|
||||||
|
|
||||||
|
print(f" Completed inference for {count} fields")
|
||||||
|
|
||||||
|
# Build output DataFrame
|
||||||
|
df = pd.DataFrame(results_list)
|
||||||
|
df.to_csv(OUTPUT_CSV, index=False)
|
||||||
|
|
||||||
|
print(f"\n[5/5] Exporting results...")
|
||||||
|
print(f"✓ Exported {len(df)} fields to {OUTPUT_CSV}")
|
||||||
|
print(f" Output location: {OUTPUT_CSV.resolve()}")
|
||||||
|
|
||||||
|
if len(df) > 0:
|
||||||
|
print(f"\nSample rows:")
|
||||||
|
print(df[['field', 'sub_field', 'imminent_prob', 'detected_prob', 'num_days', 'week', 'year']].head(15).to_string(index=False))
|
||||||
|
|
||||||
|
# Show alert summary
|
||||||
|
high_imminent = len(df[df['imminent_prob'] > 0.7])
|
||||||
|
high_detected = len(df[df['detected_prob'] > 0.6])
|
||||||
|
print(f"\n⚠ ALERTS:")
|
||||||
|
print(f" Fields with imminent_prob > 0.70: {high_imminent}")
|
||||||
|
print(f" Fields with detected_prob > 0.60: {high_detected}")
|
||||||
|
else:
|
||||||
|
print(f" WARNING: No results exported - check CI data availability")
|
||||||
|
|
||||||
|
print(f"\nStorage structure:")
|
||||||
|
print(f" Input CI: laravel_app/storage/app/{project_name}/Data/extracted_ci/ci_data_for_python/")
|
||||||
|
print(f" Input baseline: laravel_app/storage/app/{project_name}/Data/HarvestData/harvest_production_export.xlsx")
|
||||||
|
print(f" Output: laravel_app/storage/app/{project_name}/Data/HarvestData/")
|
||||||
|
print(f"\nReady to load into 09b field analysis report")
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
|
|
@ -18,6 +18,7 @@ import sys
|
||||||
import json
|
import json
|
||||||
import datetime
|
import datetime
|
||||||
import argparse
|
import argparse
|
||||||
|
import subprocess
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from osgeo import gdal
|
from osgeo import gdal
|
||||||
import time
|
import time
|
||||||
|
|
@ -441,6 +442,7 @@ def get_evalscript():
|
||||||
def main():
|
def main():
|
||||||
print("="*80)
|
print("="*80)
|
||||||
print("PLANET SATELLITE DATA DOWNLOADER - MISSING DATES ONLY")
|
print("PLANET SATELLITE DATA DOWNLOADER - MISSING DATES ONLY")
|
||||||
|
print("Wrapper for 00_download_8band_pu_optimized.py")
|
||||||
print("="*80)
|
print("="*80)
|
||||||
|
|
||||||
config_dict = get_config()
|
config_dict = get_config()
|
||||||
|
|
@ -495,47 +497,45 @@ def main():
|
||||||
print(f" - {date}")
|
print(f" - {date}")
|
||||||
|
|
||||||
if config_dict['dry_run']:
|
if config_dict['dry_run']:
|
||||||
print("\n[DRY-RUN] Would download and merge above dates")
|
print("\n[DRY-RUN] Would download above dates using 00_download_8band_pu_optimized.py")
|
||||||
return 0
|
return 0
|
||||||
|
|
||||||
# Setup BBox list
|
# Download each missing date using the optimized downloader
|
||||||
print(f"\nLoading field geometries...")
|
print(f"\n{'='*80}")
|
||||||
bbox_list = setup_bbox_list(paths['geojson'], resolution=config_dict['resolution'])
|
print(f"Downloading missing dates using optimized script...")
|
||||||
if bbox_list is None:
|
|
||||||
return 1
|
|
||||||
print(f" Created {len(bbox_list)} BBox tiles")
|
|
||||||
|
|
||||||
# Download and merge each missing date
|
|
||||||
print(f"\nDownloading missing dates...")
|
|
||||||
print(f"{'='*80}")
|
print(f"{'='*80}")
|
||||||
|
|
||||||
success_count = 0
|
success_count = 0
|
||||||
for i, slot in enumerate(missing_dates, 1):
|
for i, date_str in enumerate(missing_dates, 1):
|
||||||
print(f"\n[{i}/{len(missing_dates)}] Processing {slot}...")
|
print(f"\n[{i}/{len(missing_dates)}] Downloading {date_str}...")
|
||||||
|
|
||||||
# Check availability
|
# Call 00_download_8band_pu_optimized.py for this date
|
||||||
if not is_image_available(slot, bbox_list, collection_id):
|
cmd = [
|
||||||
print(f" Skipping {slot} - no imagery available")
|
sys.executable,
|
||||||
continue
|
"00_download_8band_pu_optimized.py",
|
||||||
|
config_dict['project'],
|
||||||
|
"--date", date_str,
|
||||||
|
"--resolution", str(config_dict['resolution']),
|
||||||
|
"--cleanup"
|
||||||
|
]
|
||||||
|
|
||||||
# Download for all bboxes
|
try:
|
||||||
print(f" Downloading {len(bbox_list)} tiles...")
|
result = subprocess.run(cmd, check=True, capture_output=False)
|
||||||
for bbox in bbox_list:
|
|
||||||
size = bbox_to_dimensions(bbox, resolution=config_dict['resolution'])
|
|
||||||
download_function(slot, bbox, size, paths['single_images'])
|
|
||||||
|
|
||||||
# Merge
|
|
||||||
print(f" Merging tiles...")
|
|
||||||
if merge_files(slot, paths['single_images'], paths['merged_tifs'], paths['virtual_raster']):
|
|
||||||
success_count += 1
|
success_count += 1
|
||||||
|
print(f" ✓ Successfully downloaded {date_str}")
|
||||||
|
except subprocess.CalledProcessError as e:
|
||||||
|
print(f" ✗ Failed to download {date_str}: {e}")
|
||||||
|
# Continue with next date instead of stopping
|
||||||
|
continue
|
||||||
|
|
||||||
# Summary
|
# Summary
|
||||||
print(f"\n{'='*80}")
|
print(f"\n{'='*80}")
|
||||||
print(f"SUMMARY:")
|
print(f"SUMMARY:")
|
||||||
print(f" Successfully processed: {success_count}/{len(missing_dates)} dates")
|
print(f" Successfully processed: {success_count}/{len(missing_dates)} dates")
|
||||||
print(f" Output folder: {paths['merged_tifs']}")
|
print(f" Output folder: {paths['merged_tifs']}")
|
||||||
|
print(f"{'='*80}")
|
||||||
|
|
||||||
return 0
|
return 0 if success_count == len(missing_dates) else 1
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
sys.exit(main())
|
sys.exit(main())
|
||||||
|
|
|
||||||
482
python_app/harvest_date_pred_utils.py
Normal file
482
python_app/harvest_date_pred_utils.py
Normal file
|
|
@ -0,0 +1,482 @@
|
||||||
|
"""
|
||||||
|
Self-contained utility module for two-step harvest date prediction and Excel export.
|
||||||
|
Includes model architecture, feature engineering, and core prediction logic.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import sys
|
||||||
|
import pandas as pd
|
||||||
|
import numpy as np
|
||||||
|
import torch
|
||||||
|
import torch.nn as nn
|
||||||
|
import pickle
|
||||||
|
import yaml
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Tuple, Dict, Any, List
|
||||||
|
from sklearn.preprocessing import StandardScaler
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# TORCH MODELS (from src/models.py, inlined for self-containment)
|
||||||
|
# ============================================================================
|
||||||
|
|
||||||
|
class HarvestDetectionLSTM(nn.Module):
|
||||||
|
"""Unidirectional LSTM for harvest detection with dual outputs."""
|
||||||
|
def __init__(self, input_size: int, hidden_size: int = 128,
|
||||||
|
num_layers: int = 1, dropout: float = 0.5):
|
||||||
|
super(HarvestDetectionLSTM, self).__init__()
|
||||||
|
self.input_size = input_size
|
||||||
|
self.hidden_size = hidden_size
|
||||||
|
self.num_layers = num_layers
|
||||||
|
|
||||||
|
self.lstm = nn.LSTM(
|
||||||
|
input_size=input_size,
|
||||||
|
hidden_size=hidden_size,
|
||||||
|
num_layers=num_layers,
|
||||||
|
dropout=dropout if num_layers > 1 else 0,
|
||||||
|
bidirectional=False,
|
||||||
|
batch_first=True
|
||||||
|
)
|
||||||
|
|
||||||
|
self.imminent_head = nn.Sequential(
|
||||||
|
nn.Linear(hidden_size, 16),
|
||||||
|
nn.ReLU(),
|
||||||
|
nn.Dropout(dropout),
|
||||||
|
nn.Linear(16, 1),
|
||||||
|
nn.Sigmoid()
|
||||||
|
)
|
||||||
|
|
||||||
|
self.detected_head = nn.Sequential(
|
||||||
|
nn.Linear(hidden_size, 16),
|
||||||
|
nn.ReLU(),
|
||||||
|
nn.Dropout(dropout),
|
||||||
|
nn.Linear(16, 1),
|
||||||
|
nn.Sigmoid()
|
||||||
|
)
|
||||||
|
|
||||||
|
def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
|
||||||
|
lstm_out, _ = self.lstm(x)
|
||||||
|
batch_size, seq_len, hidden_size = lstm_out.shape
|
||||||
|
lstm_flat = lstm_out.reshape(-1, hidden_size)
|
||||||
|
|
||||||
|
imminent_flat = self.imminent_head(lstm_flat).reshape(batch_size, seq_len)
|
||||||
|
detected_flat = self.detected_head(lstm_flat).reshape(batch_size, seq_len)
|
||||||
|
|
||||||
|
return imminent_flat, detected_flat
|
||||||
|
|
||||||
|
|
||||||
|
class HarvestDetectionGRU(nn.Module):
|
||||||
|
"""Unidirectional GRU for harvest detection with dual outputs."""
|
||||||
|
def __init__(self, input_size: int, hidden_size: int = 128,
|
||||||
|
num_layers: int = 1, dropout: float = 0.5):
|
||||||
|
super(HarvestDetectionGRU, self).__init__()
|
||||||
|
self.input_size = input_size
|
||||||
|
self.hidden_size = hidden_size
|
||||||
|
self.num_layers = num_layers
|
||||||
|
|
||||||
|
self.gru = nn.GRU(
|
||||||
|
input_size=input_size,
|
||||||
|
hidden_size=hidden_size,
|
||||||
|
num_layers=num_layers,
|
||||||
|
dropout=dropout if num_layers > 1 else 0,
|
||||||
|
bidirectional=False,
|
||||||
|
batch_first=True
|
||||||
|
)
|
||||||
|
|
||||||
|
self.imminent_head = nn.Sequential(
|
||||||
|
nn.Linear(hidden_size, 16),
|
||||||
|
nn.ReLU(),
|
||||||
|
nn.Dropout(dropout),
|
||||||
|
nn.Linear(16, 1),
|
||||||
|
nn.Sigmoid()
|
||||||
|
)
|
||||||
|
|
||||||
|
self.detected_head = nn.Sequential(
|
||||||
|
nn.Linear(hidden_size, 16),
|
||||||
|
nn.ReLU(),
|
||||||
|
nn.Dropout(dropout),
|
||||||
|
nn.Linear(16, 1),
|
||||||
|
nn.Sigmoid()
|
||||||
|
)
|
||||||
|
|
||||||
|
def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
|
||||||
|
gru_out, _ = self.gru(x)
|
||||||
|
batch_size, seq_len, hidden_size = gru_out.shape
|
||||||
|
gru_flat = gru_out.reshape(-1, hidden_size)
|
||||||
|
|
||||||
|
imminent_flat = self.imminent_head(gru_flat).reshape(batch_size, seq_len)
|
||||||
|
detected_flat = self.detected_head(gru_flat).reshape(batch_size, seq_len)
|
||||||
|
|
||||||
|
return imminent_flat, detected_flat
|
||||||
|
|
||||||
|
|
||||||
|
def create_model(model_type: str, input_size: int, hidden_size: int = 128,
|
||||||
|
num_layers: int = 1, dropout: float = 0.5, device = None) -> nn.Module:
|
||||||
|
"""Create a model from registry."""
|
||||||
|
registry = {'LSTM': HarvestDetectionLSTM, 'GRU': HarvestDetectionGRU}
|
||||||
|
if model_type not in registry:
|
||||||
|
raise ValueError(f"Unknown model type: {model_type}")
|
||||||
|
|
||||||
|
model = registry[model_type](
|
||||||
|
input_size=input_size,
|
||||||
|
hidden_size=hidden_size,
|
||||||
|
num_layers=num_layers,
|
||||||
|
dropout=dropout
|
||||||
|
)
|
||||||
|
|
||||||
|
if device:
|
||||||
|
model = model.to(device)
|
||||||
|
|
||||||
|
# Print model info
|
||||||
|
total_params = sum(p.numel() for p in model.parameters())
|
||||||
|
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
|
||||||
|
print(f"Model: {model_type}")
|
||||||
|
print(f" Input size: {input_size}")
|
||||||
|
print(f" Hidden size: {hidden_size}")
|
||||||
|
print(f" Num layers: {num_layers}")
|
||||||
|
print(f" Dropout: {dropout}")
|
||||||
|
print(f" Total parameters: {total_params:,}")
|
||||||
|
print(f" Trainable parameters: {trainable_params:,}")
|
||||||
|
print(f" Device: {device}")
|
||||||
|
|
||||||
|
return model
|
||||||
|
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# FEATURE ENGINEERING (from src/feature_engineering.py, simplified for inline)
|
||||||
|
# ============================================================================
|
||||||
|
|
||||||
|
def compute_ci_features(ci_series: pd.Series, doy_series: pd.Series = None) -> pd.DataFrame:
    """Compute all CI-based features (state, velocity, acceleration, min/max/range/std/CV)."""
    windows = (7, 14, 21)
    feats = pd.DataFrame(index=ci_series.index)

    # Pre-compute the rolling means once; several feature families reuse them.
    rolling_mean = {w: ci_series.rolling(window=w, min_periods=1).mean() for w in windows}

    # State: raw CI plus trailing moving averages.
    feats['CI_raw'] = ci_series
    for w in windows:
        feats[f'{w}d_MA'] = rolling_mean[w]

    # Velocity: first difference of each moving average (daily cadence assumed).
    for w in windows:
        feats[f'{w}d_velocity'] = rolling_mean[w].diff()

    # Acceleration: second difference of each moving average.
    for w in windows:
        feats[f'{w}d_acceleration'] = rolling_mean[w].diff().diff()

    # Rolling extremes and their spread.
    for w in windows:
        lo = ci_series.rolling(window=w, min_periods=1).min()
        hi = ci_series.rolling(window=w, min_periods=1).max()
        feats[f'{w}d_min'] = lo
        feats[f'{w}d_max'] = hi
        feats[f'{w}d_range'] = hi - lo

    # Dispersion: rolling std and coefficient of variation (epsilon avoids /0).
    for w in windows:
        std = ci_series.rolling(window=w, min_periods=1).std()
        feats[f'{w}d_std'] = std
        feats[f'{w}d_CV'] = std / (rolling_mean[w] + 1e-6)

    # NOTE(review): divisor 450 > 365 — presumably scales a multi-season
    # continuous day counter into roughly [0, 1]; confirm against training.
    if doy_series is not None:
        feats['DOY_normalized'] = doy_series / 450.0

    # diff()/std() warm-up NaNs become 0 so downstream scalers stay finite.
    return feats.fillna(0)
|
||||||
|
|
||||||
|
|
||||||
|
def extract_features(data_df: pd.DataFrame, feature_names: List[str], ci_column: str = 'FitData') -> np.ndarray:
    """Extract and return specified features as numpy array."""
    ci_series = data_df[ci_column].astype(float)

    # Synthetic day counter mod 365, built only when DOY was requested.
    # NOTE(review): this ignores any real DOY column in data_df and assumes
    # one row per day starting at 0 — confirm this matches training data.
    doy_series = None
    if 'DOY_normalized' in feature_names:
        doy_series = pd.Series(range(len(data_df)), index=data_df.index) % 365

    feature_table = compute_ci_features(ci_series, doy_series)

    # Keep only the requested features that actually exist.
    selected = [name for name in feature_names if name in feature_table.columns]
    if not selected:
        raise ValueError(f"No valid features found. Requested: {feature_names}")

    return feature_table[selected].values
|
||||||
|
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# MAIN UTILITY FUNCTIONS
|
||||||
|
# ============================================================================
|
||||||
|
|
||||||
|
def load_model_and_config(model_dir: Path):
    """Load model, config, and scalers from a given directory.

    Probes a fixed list of (config, weights, scalers) filename conventions —
    first under ``model_dir``, then under the current working directory — and
    uses the first triple whose three files all exist.

    Args:
        model_dir: Directory expected to contain the model artifacts.

    Returns:
        Tuple ``(model, config, scalers)``: the eval-mode torch module, the
        parsed config dict, and the unpickled scalers.

    Raises:
        FileNotFoundError: If no complete triple of files is found anywhere.
    """
    cwd = Path.cwd()

    # Try different naming conventions
    candidates = [
        # Standard names
        (model_dir / "config.json", model_dir / "model.pt", model_dir / "scalers.pkl"),
        # Model 307 specific names
        (model_dir / "model_config.json", model_dir / "model_307.pt", model_dir / "model_scalers.pkl"),
        # CWD standard names
        (cwd / "config.json", cwd / "model.pt", cwd / "scalers.pkl"),
        # CWD Model 307 names
        (cwd / "model_config.json", cwd / "model_307.pt", cwd / "model_scalers.pkl"),
    ]

    # First candidate triple with all three files present wins.
    config_file = model_file = scalers_file = None
    for cfg, mdl, scl in candidates:
        if cfg.exists() and mdl.exists() and scl.exists():
            config_file, model_file, scalers_file = cfg, mdl, scl
            print(f"Found model files in: {cfg.parent}")
            break

    if not (config_file and model_file and scalers_file):
        # Collect every missing path across all conventions for the error.
        missing = []
        for cfg, mdl, scl in candidates:
            if not cfg.exists():
                missing.append(str(cfg))
            if not mdl.exists():
                missing.append(str(mdl))
            if not scl.exists():
                missing.append(str(scl))
        raise FileNotFoundError(
            f"Missing model files. Checked multiple locations. Missing: {missing}"
        )

    # yaml.safe_load also parses the .json config (JSON is a subset of YAML).
    with open(config_file) as f:
        config = yaml.safe_load(f)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = create_model(
        model_type=config['model']['type'],
        input_size=len(config['features']),
        hidden_size=config['model']['hidden_size'],
        num_layers=config['model']['num_layers'],
        dropout=config['model']['dropout'],
        device=device
    )

    print(f"Loading weights from: {model_file}")
    # NOTE(review): weights_only=False runs arbitrary pickle code on load —
    # only load checkpoints from trusted sources.
    model.load_state_dict(torch.load(model_file, map_location=device, weights_only=False))
    model.eval()

    # Scalers were fitted at training time; unpickle as-is.
    with open(scalers_file, 'rb') as f:
        scalers = pickle.load(f)

    return model, config, scalers
|
||||||
|
|
||||||
|
|
||||||
|
def load_harvest_data(data_file: Path) -> pd.DataFrame:
    """Load the harvest data CSV and report how many rows were read."""
    print(f"Loading data from: {data_file}")
    frame = pd.read_csv(data_file)
    print(f"Loaded {len(frame)} rows")
    return frame
|
||||||
|
|
||||||
|
|
||||||
|
def run_phase1_growing_window(field_data, model, config, scalers, ci_column, device):
    """
    Phase 1: Growing window detection with threshold crossing.
    Expand window day-by-day, check last timestep's detected_prob.
    When 3 consecutive days have prob > 0.5, harvest detected.

    Args:
        field_data: One field's rows, sorted by 'Date' with a 0..n-1
            positional index.
        model: Torch module returning (imminent, detected) probability tensors.
        config: Loaded config dict; config['features'] selects feature columns.
        scalers: Per-feature fitted scalers applied column-wise (best-effort).
        ci_column: Name of the CI value column fed to feature extraction.
        device: Torch device for inference.

    Returns list of (harvest_date, harvest_idx) tuples.
    """
    harvest_dates = []
    current_pos = 0

    while current_pos < len(field_data):
        consecutive_above_threshold = 0

        # window_end is an ABSOLUTE end index into field_data: the window is
        # field_data.iloc[current_pos:window_end], whose last row is
        # field_data row (window_end - 1).
        for window_end in range(current_pos + 1, len(field_data) + 1):
            window_data = field_data.iloc[current_pos:window_end].copy().reset_index(drop=True)

            try:
                features = extract_features(window_data, config['features'], ci_column=ci_column)

                # Apply scalers column-by-column; best-effort (skip on failure).
                for fi, scaler in enumerate(scalers):
                    try:
                        features[:, fi] = scaler.transform(features[:, fi].reshape(-1, 1)).flatten()
                    except Exception:
                        pass

                # Run model on the growing window.
                with torch.no_grad():
                    x_tensor = torch.tensor(features, dtype=torch.float32).unsqueeze(0).to(device)
                    imminent_probs, detected_probs = model(x_tensor)
                    detected_probs = detected_probs.squeeze(0).cpu().numpy()

                # Only the LAST timestep matters for the growing window.
                last_prob = detected_probs[-1]

                if last_prob > 0.5:
                    consecutive_above_threshold += 1
                else:
                    consecutive_above_threshold = 0

                # Harvest detected: 3 consecutive days above threshold.
                if consecutive_above_threshold >= 3:
                    # Bug fix: window_end is already absolute, so the first of
                    # the 3 confirming days is row (window_end - 3). The old
                    # code added current_pos again, drifting the index (and
                    # potentially indexing past the end of the frame) for every
                    # harvest after the first.
                    harvest_idx = window_end - 3
                    harvest_dates.append((field_data.iloc[harvest_idx]['Date'], harvest_idx))

                    # Resume scanning from the day after the detected onset.
                    current_pos = window_end - 2
                    break

            except Exception:
                # Best-effort: skip windows where feature extraction or
                # inference fails and keep growing the window.
                continue
        else:
            # Window reached the end of the series without a detection: done.
            break

    return harvest_dates
|
||||||
|
|
||||||
|
|
||||||
|
def run_phase2_refinement(field_data, phase1_harvests, model, config, scalers, ci_column, device):
    """
    Phase 2: Refinement with ±40 day window.
    For each Phase 1 harvest, extract window and refine with argmax.

    Falls back to the Phase 1 estimate for any harvest whose window is empty
    or whose model pass raises.

    Returns list of (harvest_date, harvest_idx) tuples.
    """
    refined_harvests = []
    field_data = field_data.sort_values('Date').reset_index(drop=True)

    for i, (phase1_harvest_date, phase1_idx) in enumerate(phase1_harvests):
        try:
            # Season start: beginning of series, or day after previous harvest.
            if i == 0:
                season_start_date = field_data.iloc[0]['Date']
            else:
                prev_harvest_idx = phase1_harvests[i-1][1]
                season_start_idx = prev_harvest_idx + 1
                if season_start_idx >= len(field_data):
                    break
                season_start_date = field_data.iloc[season_start_idx]['Date']

            # Extract ±40 day window around the season.
            window_start_date = season_start_date - pd.Timedelta(days=40)
            window_end_date = phase1_harvest_date + pd.Timedelta(days=40)

            # Dates are sorted ascending, so the FIRST row with
            # Date >= window_start_date opens the window ...
            start_mask = field_data['Date'] >= window_start_date
            window_start_idx = max(0, int(start_mask.idxmax())) if start_mask.any() else 0

            # ... and the LAST row with Date <= window_end_date closes it.
            # Bug fix: the old code used idxmax() on the <= mask, which
            # returns the FIRST True (almost always row 0 on sorted dates),
            # collapsing the refinement window to a single row.
            end_mask = field_data['Date'] <= window_end_date
            if end_mask.any():
                window_end_idx = min(len(field_data), int(end_mask[::-1].idxmax()) + 1)
            else:
                window_end_idx = len(field_data)

            if window_end_idx <= window_start_idx:
                refined_harvests.append((phase1_harvest_date, phase1_idx))
                continue

            window_data = field_data.iloc[window_start_idx:window_end_idx].copy().reset_index(drop=True)

            # Extract features for the full window.
            features = extract_features(window_data, config['features'], ci_column=ci_column)

            # Apply scalers column-by-column; best-effort (skip on failure).
            for fi, scaler in enumerate(scalers):
                try:
                    features[:, fi] = scaler.transform(features[:, fi].reshape(-1, 1)).flatten()
                except Exception:
                    pass

            # Run model once on the full window.
            with torch.no_grad():
                x_tensor = torch.tensor(features, dtype=torch.float32).unsqueeze(0).to(device)
                imminent_probs, detected_probs = model(x_tensor)
                detected_probs = detected_probs.squeeze(0).cpu().numpy()

            # Refined harvest = timestep of peak detection probability.
            refined_idx_in_window = int(np.argmax(detected_probs))
            refined_idx_global = window_start_idx + refined_idx_in_window
            refined_harvest_date = field_data.iloc[refined_idx_global]['Date']

            refined_harvests.append((refined_harvest_date, refined_idx_global))

        except Exception:
            # Fall back to the unrefined Phase 1 estimate.
            refined_harvests.append((phase1_harvest_date, phase1_idx))

    return refined_harvests
|
||||||
|
|
||||||
|
|
||||||
|
def run_two_step_refinement(df: pd.DataFrame, model, config, scalers, device=None):
    """
    Two-step harvest detection for each field:
      1. Phase 1: growing window with 3-day threshold confirmation.
      2. Phase 2: ±40 day refinement with argmax.

    Returns a list of dicts with field, season, season_start_date,
    season_end_date and phase2_harvest_date.
    """
    if device is None:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    ci_column = config['data']['ci_column']

    # Materialise the groups so we can show percentage progress.
    field_groups = list(df.groupby('field'))
    total_fields = len(field_groups)
    results = []
    harvests_found = 0

    print(f" Processing {total_fields} fields...")

    for idx, (field, field_data) in enumerate(field_groups, 1):
        # In-place textual progress bar.
        bar_length = 40
        pct = int((idx / total_fields) * 100)
        filled = int((idx / total_fields) * bar_length)
        bar = "█" * filled + "░" * (bar_length - filled)
        print(f" [{bar}] {pct:3d}% ({idx}/{total_fields} fields)", end='\r')

        field_data = field_data.sort_values('Date').reset_index(drop=True)

        # Phase 1: growing-window detection; skip fields with no harvests.
        phase1_harvests = run_phase1_growing_window(field_data, model, config, scalers, ci_column, device)
        if not phase1_harvests:
            continue

        # Phase 2: ±40 day refinement of each Phase 1 detection.
        phase2_harvests = run_phase2_refinement(field_data, phase1_harvests, model, config, scalers, ci_column, device)

        # One result row per detected season.
        for season_num, (harvest_date, harvest_idx) in enumerate(phase2_harvests):
            if season_num == 0:
                season_start_date = field_data.iloc[0]['Date']
            else:
                season_start_idx = phase2_harvests[season_num - 1][1] + 1
                if season_start_idx >= len(field_data):
                    break
                season_start_date = field_data.iloc[season_start_idx]['Date']

            results.append({
                'field': field,
                'season': season_num + 1,
                'season_start_date': season_start_date,
                'season_end_date': harvest_date,
                'phase2_harvest_date': harvest_date,
            })
            harvests_found += 1

    print()  # Terminate the \r progress line.
    print(f" ✓ Complete: Found {harvests_found} harvest events across {total_fields} fields")

    return results
|
||||||
|
|
||||||
|
|
||||||
|
def build_production_harvest_table(refined_results: List[Dict]) -> pd.DataFrame:
    """
    Build a DataFrame from refined results with columns for production pipeline.
    One row per field/season with season start and end dates (formatted as
    YYYY-MM-DD strings).

    Args:
        refined_results: Dicts from run_two_step_refinement (field, season,
            season_start_date, season_end_date, phase2_harvest_date).

    Returns:
        DataFrame with date columns normalised to 'YYYY-MM-DD'; empty frame
        with the base columns when there are no results.
    """
    if not refined_results:
        print("WARNING: No refined results to build table")
        return pd.DataFrame(columns=['field', 'season', 'season_start_date', 'season_end_date'])

    df = pd.DataFrame(refined_results)

    # Normalise the season boundary dates to plain date strings.
    df['season_start_date'] = pd.to_datetime(df['season_start_date']).dt.strftime('%Y-%m-%d')
    df['season_end_date'] = pd.to_datetime(df['season_end_date']).dt.strftime('%Y-%m-%d')

    # Bug fix: run_two_step_refinement emits 'phase2_harvest_date', but the
    # old code unconditionally formatted 'phase1_harvest_date' and raised
    # KeyError. Format whichever harvest-date columns are actually present.
    for col in ('phase1_harvest_date', 'phase2_harvest_date'):
        if col in df.columns:
            df[col] = pd.to_datetime(df[col]).dt.strftime('%Y-%m-%d')

    print(f"Built production table with {len(df)} field/season combinations")

    return df
|
||||||
BIN
python_app/model_307.pt
Normal file
BIN
python_app/model_307.pt
Normal file
Binary file not shown.
144
python_app/model_config.json
Normal file
144
python_app/model_config.json
Normal file
|
|
@ -0,0 +1,144 @@
|
||||||
|
{
|
||||||
|
"name": "307_dropout02_with_doy",
|
||||||
|
"description": "Production Model 307: LSTM-based harvest detection (Phase 3, minimal regularization)",
|
||||||
|
"model_info": {
|
||||||
|
"type": "LSTM",
|
||||||
|
"architecture": "Unidirectional LSTM with dual output heads (imminent + detected)",
|
||||||
|
"total_parameters": 105120,
|
||||||
|
"input_features": 14,
|
||||||
|
"hidden_units": 256,
|
||||||
|
"output_heads": 2,
|
||||||
|
"training_data": "Historical multi-season CI data from multiple estates",
|
||||||
|
"validation_method": "5-fold cross-validation",
|
||||||
|
"device": "GPU (CUDA) or CPU fallback"
|
||||||
|
},
|
||||||
|
"production_scripts": {
|
||||||
|
"baseline": {
|
||||||
|
"script": "01_harvest_baseline_prediction.py",
|
||||||
|
"frequency": "Run ONCE during setup",
|
||||||
|
"purpose": "Predict all harvest dates (ground truth baseline)",
|
||||||
|
"input": "ci_data_for_python.csv (complete historical data)",
|
||||||
|
"output": "harvest_production_export.xlsx",
|
||||||
|
"time_estimate": "5-30 minutes depending on data volume"
|
||||||
|
},
|
||||||
|
"monitoring": {
|
||||||
|
"script": "02_harvest_imminent_weekly.py",
|
||||||
|
"frequency": "Run WEEKLY (or daily if required)",
|
||||||
|
"purpose": "Real-time harvest status and imminent alerts",
|
||||||
|
"input": "ci_data_for_python.csv (recent data)",
|
||||||
|
"output": "harvest_imminent_weekly.csv",
|
||||||
|
"time_estimate": "1-5 minutes"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"features": [
|
||||||
|
"CI_raw",
|
||||||
|
"7d_MA",
|
||||||
|
"14d_MA",
|
||||||
|
"21d_MA",
|
||||||
|
"7d_velocity",
|
||||||
|
"14d_velocity",
|
||||||
|
"21d_velocity",
|
||||||
|
"7d_min",
|
||||||
|
"14d_min",
|
||||||
|
"21d_min",
|
||||||
|
"7d_std",
|
||||||
|
"14d_std",
|
||||||
|
"21d_std",
|
||||||
|
"DOY_normalized"
|
||||||
|
],
|
||||||
|
"model": {
|
||||||
|
"type": "LSTM",
|
||||||
|
"hidden_size": 256,
|
||||||
|
"num_layers": 1,
|
||||||
|
"dropout": 0.2
|
||||||
|
},
|
||||||
|
"training": {
|
||||||
|
"imminent_days_before": 28,
|
||||||
|
"imminent_days_before_end": 1,
|
||||||
|
"detected_days_after_start": 1,
|
||||||
|
"detected_days_after_end": 21,
|
||||||
|
"k_folds": 5,
|
||||||
|
"num_epochs": 150,
|
||||||
|
"patience": 20,
|
||||||
|
"learning_rate": 0.001,
|
||||||
|
"batch_size": 4
|
||||||
|
},
|
||||||
|
"data": {
|
||||||
|
"csv_path": "../lstm_complete_data.csv",
|
||||||
|
"ci_column": "FitData",
|
||||||
|
"test_fraction": 0.15,
|
||||||
|
"seed": 42
|
||||||
|
},
|
||||||
|
"workflow_instructions": {
|
||||||
|
"overview": "Model 307 uses a two-script approach: baseline setup + weekly monitoring",
|
||||||
|
"step_1_baseline": {
|
||||||
|
"description": "Establish historical harvest date reference for all fields",
|
||||||
|
"script": "01_harvest_baseline_prediction.py",
|
||||||
|
"when": "Run once after setting up CI extraction pipeline",
|
||||||
|
"command": "conda activate python_gpu && python 01_harvest_baseline_prediction.py",
|
||||||
|
"input_data": "ci_data_for_python.csv (all available historical CI data)",
|
||||||
|
"output_file": "harvest_production_export.xlsx (ground truth baseline)",
|
||||||
|
"columns": [
|
||||||
|
"field - Field ID",
|
||||||
|
"sub_field - Sub-field designation",
|
||||||
|
"season - Season number (1, 2, 3...)",
|
||||||
|
"year - Year of harvest",
|
||||||
|
"season_start_date - Start of growing season",
|
||||||
|
"season_end_date - End of season (harvest date)",
|
||||||
|
"phase1_harvest_date - Refined harvest prediction"
|
||||||
|
],
|
||||||
|
"notes": "This becomes your reference - compare all weekly monitoring against this"
|
||||||
|
},
|
||||||
|
"step_2_monitoring": {
|
||||||
|
"description": "Weekly real-time harvest status and imminent alerts",
|
||||||
|
"script": "02_harvest_imminent_weekly.py",
|
||||||
|
"when": "Run every week (e.g., Mondays) or daily if near harvest",
|
||||||
|
"command": "conda activate python_gpu && python 02_harvest_imminent_weekly.py",
|
||||||
|
"input_data": "ci_data_for_python.csv (latest CI data from 02b conversion)",
|
||||||
|
"output_file": "harvest_imminent_weekly.csv",
|
||||||
|
"columns": [
|
||||||
|
"field - Field ID",
|
||||||
|
"sub_field - Sub-field designation",
|
||||||
|
"imminent_prob - Likelihood of harvest readiness in next 28 days (0.0-1.0)",
|
||||||
|
"detected_prob - Current harvest probability (0.0-1.0)",
|
||||||
|
"week - ISO week number",
|
||||||
|
"year - Year",
|
||||||
|
"as_of_date - Latest date in dataset",
|
||||||
|
"num_days - Days of history used"
|
||||||
|
],
|
||||||
|
"alert_thresholds": {
|
||||||
|
"imminent_high": "imminent_prob > 0.7 (prepare harvest)",
|
||||||
|
"imminent_medium": "imminent_prob 0.5-0.7 (monitor closely)",
|
||||||
|
"detected_high": "detected_prob > 0.6 (active harvesting)"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"integration_with_r_pipeline": {
|
||||||
|
"before_model_307": [
|
||||||
|
"Planet 8-band download: download_8band_pu_optimized.ipynb",
|
||||||
|
"CI extraction: 02_ci_extraction.R",
|
||||||
|
"Convert to CSV: 02b_convert_rds_to_csv.R (outputs ci_data_for_python.csv)"
|
||||||
|
],
|
||||||
|
"model_307_here": [
|
||||||
|
"BASELINE: 01_harvest_baseline_prediction.py (run once)",
|
||||||
|
"MONITORING: 02_harvest_imminent_weekly.py (run weekly)"
|
||||||
|
],
|
||||||
|
"after_model_307": [
|
||||||
|
"Field analysis: 09b_field_analysis_weekly.R (reads harvest predictions)",
|
||||||
|
"Reports: 10_CI_report_with_kpis.Rmd (includes harvest status)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"environment_requirements": {
|
||||||
|
"python_env": "python_gpu",
|
||||||
|
"activation": "conda activate python_gpu",
|
||||||
|
"required_packages": [
|
||||||
|
"torch (GPU-enabled)",
|
||||||
|
"pandas",
|
||||||
|
"numpy",
|
||||||
|
"scikit-learn",
|
||||||
|
"pyyaml",
|
||||||
|
"openpyxl"
|
||||||
|
],
|
||||||
|
"gpu": "NVIDIA GPU with CUDA (optional - falls back to CPU if unavailable)"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
BIN
python_app/model_scalers.pkl
Normal file
BIN
python_app/model_scalers.pkl
Normal file
Binary file not shown.
|
|
@ -15,9 +15,113 @@
|
||||||
suppressPackageStartupMessages({
|
suppressPackageStartupMessages({
|
||||||
library(tidyverse)
|
library(tidyverse)
|
||||||
library(lubridate)
|
library(lubridate)
|
||||||
|
library(zoo)
|
||||||
library(here)
|
library(here)
|
||||||
})
|
})
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# HELPER FUNCTIONS
|
||||||
|
# ============================================================================
|
||||||
|
|
||||||
|
#' Convert wide format RDS to long format
#'
#' Every non-id column is treated as a date-named measurement column and
#' melted into one row per (field, sub_field, Date).
#'
#' @param ci_data_wide Tibble with columns: field, sub_field, and dates as columns
#' @return Long format tibble: field, sub_field, Date, FitData
wide_to_long_ci_data <- function(ci_data_wide) {
  long_data <- pivot_longer(
    ci_data_wide,
    cols = -c(field, sub_field),
    names_to = "Date",
    values_to = "FitData",
    values_drop_na = TRUE
  )
  long_data <- mutate(
    long_data,
    Date = as.Date(Date),
    FitData = as.numeric(FitData)
  )
  # Drop values that became NA during the numeric coercion.
  filter(long_data, !is.na(FitData))
}
|
||||||
|
|
||||||
|
#' Create daily interpolated sequences with DOY for each field
#'
#' For each field/sub_field combination, creates a complete daily sequence from
#' the first to the last measurement date, places the raw measurements on their
#' dates, and linearly interpolates FitData across the gaps.
#'
#' NOTE: DOY here is a continuous 1..N day counter within each field's series,
#' not the calendar day-of-year — confirm downstream consumers expect this.
#'
#' @param ci_data_long Long format tibble: field, sub_field, Date, FitData
#' @return Tibble with: field, sub_field, Date, FitData, DOY, value
create_interpolated_daily_sequences <- function(ci_data_long) {
  ci_data_long %>%
    group_by(field, sub_field) %>%
    nest() %>%
    mutate(
      data = map(data, function(df) {
        # Sort measurements by date
        df <- df %>% arrange(Date)

        # Create complete daily sequence from first to last date
        date_seq <- seq(min(df$Date), max(df$Date), by = "day")

        # Build daily dataframe (field/sub_field stay in outer df, not here)
        daily_df <- tibble(
          Date = date_seq,
          value = NA_real_,
          FitData = NA_real_,
          DOY = seq_along(date_seq) # Continuous day counter: 1, 2, 3, ...
        )

        # Fill in actual measurement values. Vectorised with match();
        # replaces the previous O(n^2) per-row which() loop while keeping
        # identical last-write-wins semantics for duplicate dates.
        hit <- match(df$Date, daily_df$Date)
        keep <- !is.na(hit)
        daily_df$value[hit[keep]] <- df$FitData[keep]

        # Interpolate missing dates linearly; leading/trailing NAs are kept.
        daily_df$FitData <- zoo::na.approx(daily_df$value, na.rm = FALSE)

        daily_df
      })
    ) %>%
    unnest(data) %>%
    select(field, sub_field, Date, FitData, DOY, value) %>%
    arrange(field, Date)
}
|
||||||
|
|
||||||
|
#' Validate conversion output
#'
#' Prints summary statistics (field count, row count, date/value ranges,
#' raw vs interpolated counts) for a converted CI dataset.
#'
#' @param ci_data_python Tibble with converted CI data
#' @return Invisibly returns the tibble (for piping)
validate_conversion_output <- function(ci_data_python) {
  date_min <- min(ci_data_python$Date, na.rm = TRUE)
  date_max <- max(ci_data_python$Date, na.rm = TRUE)
  fit_min <- min(ci_data_python$FitData, na.rm = TRUE)
  fit_max <- max(ci_data_python$FitData, na.rm = TRUE)
  n_raw <- sum(!is.na(ci_data_python$value))
  n_interp <- sum(is.na(ci_data_python$value) & !is.na(ci_data_python$FitData))

  cat(sprintf("\nValidation:\n"))
  cat(sprintf(" Unique fields: %d\n", n_distinct(ci_data_python$field)))
  cat(sprintf(" Total daily rows: %d\n", nrow(ci_data_python)))
  cat(sprintf(" Date range: %s to %s\n", date_min, date_max))
  cat(sprintf(" FitData range: %.2f to %.2f\n", fit_min, fit_max))
  cat(sprintf(" Raw measurements: %d\n", n_raw))
  cat(sprintf(" Interpolated values: %d\n", n_interp))

  invisible(ci_data_python)
}
|
||||||
|
|
||||||
|
#' Print next steps message
#'
#' Prints the follow-up checklist for the Python harvest-detection stage.
print_next_steps <- function() {
  steps <- c(
    " 1. Read this CSV file in Python\n",
    " 2. Group by field to identify seasons\n",
    " 3. Run LSTM model to detect harvest dates\n",
    " 4. Save predicted harvest dates to Excel\n",
    " 5. Use output in script 03 for interpolation\n"
  )
  cat("\nNext steps for Python harvest detection:\n")
  cat(steps, sep = "")
}
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# MAIN FUNCTION
|
||||||
|
# ============================================================================
|
||||||
|
|
||||||
main <- function() {
|
main <- function() {
|
||||||
# Process command line arguments
|
# Process command line arguments
|
||||||
args <- commandArgs(trailingOnly = TRUE)
|
args <- commandArgs(trailingOnly = TRUE)
|
||||||
|
|
@ -28,7 +132,7 @@ main <- function() {
|
||||||
} else if (exists("project_dir", envir = .GlobalEnv)) {
|
} else if (exists("project_dir", envir = .GlobalEnv)) {
|
||||||
project_dir <- get("project_dir", envir = .GlobalEnv)
|
project_dir <- get("project_dir", envir = .GlobalEnv)
|
||||||
} else {
|
} else {
|
||||||
project_dir <- "esa"
|
project_dir <- "angata"
|
||||||
}
|
}
|
||||||
|
|
||||||
# Make available globally
|
# Make available globally
|
||||||
|
|
@ -49,9 +153,17 @@ main <- function() {
|
||||||
})
|
})
|
||||||
|
|
||||||
# Define paths
|
# Define paths
|
||||||
ci_data_dir <- here::here("laravel_app", "storage", "app", project_dir, "Data", "extracted_ci", "cumulative_vals")
|
ci_data_source_dir <- here::here("laravel_app", "storage", "app", project_dir, "Data", "extracted_ci", "cumulative_vals")
|
||||||
input_file <- file.path(ci_data_dir, "combined_CI_data.rds")
|
ci_data_output_dir <- here::here("laravel_app", "storage", "app", project_dir, "Data", "extracted_ci", "ci_data_for_python")
|
||||||
output_file <- file.path(ci_data_dir, "ci_data_for_python.csv")
|
|
||||||
|
# Create output directory if it doesn't exist (for new projects)
|
||||||
|
if (!dir.exists(ci_data_output_dir)) {
|
||||||
|
dir.create(ci_data_output_dir, recursive = TRUE, showWarnings = FALSE)
|
||||||
|
cat(sprintf("✓ Created output directory: %s\n", ci_data_output_dir))
|
||||||
|
}
|
||||||
|
|
||||||
|
input_file <- file.path(ci_data_source_dir, "combined_CI_data.rds")
|
||||||
|
output_file <- file.path(ci_data_output_dir, "ci_data_for_python.csv")
|
||||||
|
|
||||||
# Check if input file exists
|
# Check if input file exists
|
||||||
if (!file.exists(input_file)) {
|
if (!file.exists(input_file)) {
|
||||||
|
|
@ -61,52 +173,32 @@ main <- function() {
|
||||||
cat(sprintf("Loading: %s\n", input_file))
|
cat(sprintf("Loading: %s\n", input_file))
|
||||||
|
|
||||||
# Load RDS file
|
# Load RDS file
|
||||||
ci_data <- readRDS(input_file) %>%
|
ci_data_wide <- readRDS(input_file) %>%
|
||||||
as_tibble()
|
as_tibble()
|
||||||
|
|
||||||
cat(sprintf(" Loaded %d rows\n", nrow(ci_data)))
|
cat(sprintf(" Loaded %d rows\n", nrow(ci_data_wide)))
|
||||||
cat(sprintf(" Columns: %s\n", paste(names(ci_data), collapse = ", ")))
|
cat(sprintf(" Format: WIDE (field, sub_field, then dates as columns)\n"))
|
||||||
|
cat(sprintf(" Sample columns: %s\n", paste(names(ci_data_wide)[1:6], collapse = ", ")))
|
||||||
|
|
||||||
# Prepare data for Python
|
# Step 1: Convert from WIDE to LONG format
|
||||||
ci_data_python <- ci_data %>%
|
cat("\nStep 1: Converting from wide to long format...\n")
|
||||||
# Ensure standard column names
|
ci_data_long <- wide_to_long_ci_data(ci_data_wide)
|
||||||
rename(
|
|
||||||
field = field,
|
|
||||||
sub_field = sub_field,
|
|
||||||
Date = Date,
|
|
||||||
FitData = FitData,
|
|
||||||
DOY = DOY
|
|
||||||
) %>%
|
|
||||||
# Add 'value' as an alias for FitData (sometimes needed)
|
|
||||||
mutate(value = FitData) %>%
|
|
||||||
# Keep only necessary columns
|
|
||||||
select(field, sub_field, Date, FitData, DOY, value) %>%
|
|
||||||
# Sort by field and date
|
|
||||||
arrange(field, Date)
|
|
||||||
|
|
||||||
# Validate data
|
# Step 2: Create complete daily sequences with interpolation
|
||||||
cat(sprintf("\nValidation:\n"))
|
cat("\nStep 2: Creating complete daily sequences with interpolation...\n")
|
||||||
cat(sprintf(" Unique fields: %d\n", n_distinct(ci_data_python$field)))
|
ci_data_python <- create_interpolated_daily_sequences(ci_data_long)
|
||||||
cat(sprintf(" Date range: %s to %s\n",
|
|
||||||
min(ci_data_python$Date, na.rm = TRUE),
|
|
||||||
max(ci_data_python$Date, na.rm = TRUE)))
|
|
||||||
cat(sprintf(" FitData range: %.2f to %.2f\n",
|
|
||||||
min(ci_data_python$FitData, na.rm = TRUE),
|
|
||||||
max(ci_data_python$FitData, na.rm = TRUE)))
|
|
||||||
cat(sprintf(" Missing FitData: %d rows\n", sum(is.na(ci_data_python$FitData))))
|
|
||||||
|
|
||||||
# Save to CSV
|
# Step 3: Validate output
|
||||||
cat(sprintf("\nSaving to: %s\n", output_file))
|
cat("\nStep 3: Validating output...")
|
||||||
|
validate_conversion_output(ci_data_python)
|
||||||
|
|
||||||
|
# Step 4: Save to CSV
|
||||||
|
cat(sprintf("\nStep 4: Saving to CSV...\n"))
|
||||||
|
cat(sprintf(" Output: %s\n", output_file))
|
||||||
write_csv(ci_data_python, output_file)
|
write_csv(ci_data_python, output_file)
|
||||||
|
|
||||||
cat(sprintf("✓ Successfully created CSV with %d rows\n", nrow(ci_data_python)))
|
cat(sprintf("\n✓ Successfully created CSV with %d rows\n", nrow(ci_data_python)))
|
||||||
cat("\nNext steps for Python harvest detection:\n")
|
print_next_steps()
|
||||||
cat(" 1. Read this CSV file in Python\n")
|
|
||||||
cat(" 2. Group by field to identify seasons\n")
|
|
||||||
cat(" 3. Run LSTM model to detect harvest dates\n")
|
|
||||||
cat(" 4. Save predicted harvest dates to Excel\n")
|
|
||||||
cat(" 5. Use output in script 03 for interpolation\n")
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if (sys.nframe() == 0) {
|
if (sys.nframe() == 0) {
|
||||||
|
|
|
||||||
1328
webapps/geojson_viewer.html
Normal file
1328
webapps/geojson_viewer.html
Normal file
File diff suppressed because it is too large
Load diff
|
|
@ -212,6 +212,22 @@
|
||||||
<a href="./data_validation_tool/" class="app-btn">Open Tool</a>
|
<a href="./data_validation_tool/" class="app-btn">Open Tool</a>
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
|
<!-- GeoJSON Viewer -->
|
||||||
|
<div class="app-card">
|
||||||
|
<div class="app-icon">📍</div>
|
||||||
|
<div class="app-content">
|
||||||
|
<h2>GeoJSON Viewer</h2>
|
||||||
|
<p>Upload and visualize GeoJSON files on an interactive map with feature properties.</p>
|
||||||
|
<ul class="app-features">
|
||||||
|
<li>Upload GeoJSON files</li>
|
||||||
|
<li>Interactive map view</li>
|
||||||
|
<li>View feature properties</li>
|
||||||
|
<li>Download exports</li>
|
||||||
|
</ul>
|
||||||
|
<a href="./geojson_viewer.html" class="app-btn">Open Viewer</a>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
<footer>
|
<footer>
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue