Merge code-improvements: integrate remote changes
This commit is contained in:
commit
40a7d4d691
|
|
@ -26,6 +26,9 @@ This is your GROUND TRUTH - compare all future predictions against this baseline
|
|||
|
||||
Usage:
|
||||
python 01_harvest_baseline_prediction.py [project_name]
|
||||
conda activate pytorch_gpu
|
||||
cd python_app
|
||||
python 22_harvest_baseline_prediction.py angata
|
||||
|
||||
Examples:
|
||||
python 01_harvest_baseline_prediction.py angata
|
||||
|
|
@ -43,6 +46,7 @@ from pathlib import Path
|
|||
from harvest_date_pred_utils import (
|
||||
load_model_and_config,
|
||||
extract_features,
|
||||
run_phase1_growing_window,
|
||||
run_two_step_refinement,
|
||||
build_production_harvest_table
|
||||
)
|
||||
|
|
@ -107,9 +111,9 @@ def main():
|
|||
|
||||
# [3/4] Run model predictions with two-step detection
|
||||
print("\n[3/4] Running two-step harvest detection...")
|
||||
print(" (Using threshold=0.5, consecutive_days=3 - tuned for Model 307 output)")
|
||||
print(" (Using threshold=0.3, consecutive_days=2 - tuned baseline with DOY reset)")
|
||||
refined_results = run_two_step_refinement(ci_data, model, config, scalers, device=device,
|
||||
phase1_threshold=0.5, phase1_consecutive=3)
|
||||
phase1_threshold=0.3, phase1_consecutive=2)
|
||||
|
||||
# Build and export
|
||||
print("\nBuilding production harvest table...")
|
||||
|
|
|
|||
225
python_app/23_convert_harvest_format.py
Normal file
225
python_app/23_convert_harvest_format.py
Normal file
|
|
@ -0,0 +1,225 @@
|
|||
#!/usr/bin/env python3
|
||||
"""
|
||||
Script 23: Convert Harvest Format from Production to Standard
|
||||
==============================================================
|
||||
|
||||
Converts harvest_production_export.xlsx (output from script 22) to the standard
|
||||
harvest.xlsx format used by R scripts 24+.
|
||||
|
||||
INPUT:
|
||||
- harvest_production_export.xlsx (from script 22)
|
||||
Columns: field, season (numeric), season_start_date, season_end_date, phase2_harvest_date
|
||||
Contains detected harvests only
|
||||
|
||||
OUTPUT:
|
||||
- harvest.xlsx (standard format for R pipeline)
|
||||
Columns: field, sub_field, year, season, season_start, season_end, age, sub_area, tonnage_ha
|
||||
|
||||
LOGIC:
|
||||
1. For each field, group all detections chronologically
|
||||
2. Create one row per completed season (has season_end date)
|
||||
3. season_start = first CI date (2024-09-25) for first season, then previous harvest + 1 day
|
||||
4. season_end = phase2_harvest_date (refined harvest date from script 22)
|
||||
5. year = extracted from season_start date
|
||||
6. season = "Data{year} : {field}" format
|
||||
7. sub_field = field (same as field)
|
||||
8. age, sub_area, tonnage_ha = left empty (filled by R scripts later or other data sources)
|
||||
|
||||
Date Format: YYYY-MM-DD
|
||||
"""
|
||||
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
from datetime import datetime, timedelta
|
||||
import os
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
def get_ci_date_range(project_dir):
|
||||
"""
|
||||
Get the date range of CI data to establish season_start for first season.
|
||||
|
||||
Returns: (min_date, max_date) as datetime objects
|
||||
"""
|
||||
base_storage = Path("../laravel_app/storage/app") / project_dir / "Data"
|
||||
ci_data_dir = base_storage / "extracted_ci" / "ci_data_for_python"
|
||||
ci_csv_path = ci_data_dir / "ci_data_for_python.csv"
|
||||
|
||||
if not ci_csv_path.exists():
|
||||
# Fallback: assume data starts 2024-09-25 (typical for projects)
|
||||
print(f"[WARNING] CI data CSV not found at {ci_csv_path}, assuming CI starts 2024-09-25")
|
||||
return datetime(2024, 9, 25), datetime.now()
|
||||
|
||||
try:
|
||||
# Read only date column (first column usually has dates)
|
||||
df = pd.read_csv(ci_csv_path, nrows=1)
|
||||
columns = df.columns.tolist()
|
||||
date_col = columns[0] # First column should be dates
|
||||
|
||||
df = pd.read_csv(ci_csv_path, usecols=[date_col])
|
||||
df[date_col] = pd.to_datetime(df[date_col])
|
||||
|
||||
min_date = df[date_col].min()
|
||||
max_date = df[date_col].max()
|
||||
|
||||
print(f"[INFO] CI data date range: {min_date.date()} to {max_date.date()}")
|
||||
return min_date, max_date
|
||||
except Exception as e:
|
||||
print(f"[WARNING] Error reading CI date range: {e}, using fallback dates")
|
||||
return datetime(2024, 9, 25), datetime.now()
|
||||
|
||||
|
||||
def convert_harvest_format(project_dir="angata"):
|
||||
"""
|
||||
Convert harvest_production_export.xlsx to standard harvest.xlsx format.
|
||||
|
||||
Parameters:
|
||||
-----------
|
||||
project_dir : str
|
||||
Project name (angata, esa, chemba, etc.)
|
||||
"""
|
||||
|
||||
print(f"\n{'='*80}")
|
||||
print(f"Script 23: Convert Harvest Format")
|
||||
print(f"Project: {project_dir}")
|
||||
print(f"{'='*80}\n")
|
||||
|
||||
# Get paths (same as script 22)
|
||||
base_storage = Path("../laravel_app/storage/app") / project_dir / "Data"
|
||||
harvest_data_dir = base_storage / "HarvestData"
|
||||
source_file = harvest_data_dir / "harvest_production_export.xlsx"
|
||||
output_file = base_storage / "harvest.xlsx"
|
||||
|
||||
# Check source file exists
|
||||
if not source_file.exists():
|
||||
print(f"[ERROR] Source file not found: {source_file}")
|
||||
print(f"[ERROR] Please run script 22 first to generate harvest_production_export.xlsx")
|
||||
return False
|
||||
|
||||
print(f"[INFO] Reading source file: {source_file}")
|
||||
|
||||
try:
|
||||
# Read production format
|
||||
df_source = pd.read_excel(source_file)
|
||||
print(f"[INFO] Loaded {len(df_source)} harvest detections")
|
||||
print(f"[INFO] Columns: {list(df_source.columns)}")
|
||||
|
||||
# Validate required columns
|
||||
required_cols = ["field", "season_start_date", "season_end_date", "phase2_harvest_date"]
|
||||
missing = [c for c in required_cols if c not in df_source.columns]
|
||||
if missing:
|
||||
print(f"[ERROR] Missing columns: {missing}")
|
||||
return False
|
||||
|
||||
# Get CI date range for establishing first season start
|
||||
ci_min_date, ci_max_date = get_ci_date_range(project_dir)
|
||||
first_season_start = ci_min_date.strftime("%Y-%m-%d")
|
||||
|
||||
# Convert to datetime for processing
|
||||
df_source["phase2_harvest_date"] = pd.to_datetime(df_source["phase2_harvest_date"])
|
||||
df_source["field"] = df_source["field"].astype(str)
|
||||
|
||||
# Sort by field and harvest date
|
||||
df_source = df_source.sort_values(["field", "phase2_harvest_date"]).reset_index(drop=True)
|
||||
|
||||
# Build output rows
|
||||
output_rows = []
|
||||
|
||||
# Group by field
|
||||
for field_id, group_df in df_source.groupby("field"):
|
||||
# Get all harvest dates for this field, sorted chronologically
|
||||
harvest_dates = group_df["phase2_harvest_date"].dt.strftime("%Y-%m-%d").tolist()
|
||||
|
||||
print(f"[INFO] Field {field_id}: {len(harvest_dates)} harvest detection(s)")
|
||||
|
||||
# First season always starts from CI beginning
|
||||
current_season_start = first_season_start
|
||||
|
||||
for harvest_idx, harvest_date in enumerate(harvest_dates):
|
||||
# Extract year from current season start
|
||||
season_start_obj = pd.to_datetime(current_season_start)
|
||||
year = season_start_obj.year
|
||||
|
||||
# Create season identifier
|
||||
season_str = f"Data{year} : {field_id}"
|
||||
|
||||
# Create row for completed season
|
||||
row = {
|
||||
"field": field_id,
|
||||
"sub_field": field_id, # Same as field
|
||||
"year": year,
|
||||
"season": season_str,
|
||||
"season_start": current_season_start,
|
||||
"season_end": harvest_date, # Filled because harvest detected
|
||||
"age": "", # Empty - will be calculated in R
|
||||
"sub_area": "", # Empty - will be populated from other data
|
||||
"tonnage_ha": "" # Empty - will be populated from other data
|
||||
}
|
||||
|
||||
output_rows.append(row)
|
||||
|
||||
# Next season starts day after this harvest
|
||||
next_season_start = (pd.to_datetime(harvest_date) + timedelta(days=1)).strftime("%Y-%m-%d")
|
||||
current_season_start = next_season_start
|
||||
|
||||
# If field has detections, check if we should add a final incomplete season
|
||||
# Only if we're not at the end of the monitoring period
|
||||
last_harvest = pd.to_datetime(harvest_dates[-1])
|
||||
days_after_last = (ci_max_date - last_harvest).days
|
||||
|
||||
if days_after_last > 30: # More than 30 days of data after last harvest
|
||||
# Add incomplete season row (season_end empty)
|
||||
season_start_obj = pd.to_datetime(current_season_start)
|
||||
year = season_start_obj.year
|
||||
season_str = f"Data{year} : {field_id}"
|
||||
|
||||
row = {
|
||||
"field": field_id,
|
||||
"sub_field": field_id,
|
||||
"year": year,
|
||||
"season": season_str,
|
||||
"season_start": current_season_start,
|
||||
"season_end": "", # Empty - season still active
|
||||
"age": "",
|
||||
"sub_area": "",
|
||||
"tonnage_ha": ""
|
||||
}
|
||||
|
||||
output_rows.append(row)
|
||||
print(f"[INFO] Added incomplete season starting {current_season_start}")
|
||||
|
||||
# Create output DataFrame
|
||||
df_output = pd.DataFrame(output_rows)
|
||||
|
||||
# Reorder columns to match standard format
|
||||
column_order = ["field", "sub_field", "year", "season", "season_start", "season_end",
|
||||
"age", "sub_area", "tonnage_ha"]
|
||||
df_output = df_output[column_order]
|
||||
|
||||
# Write to Excel
|
||||
print(f"\n[INFO] Writing output file: {output_file}")
|
||||
df_output.to_excel(output_file, index=False, sheet_name="Harvest")
|
||||
|
||||
# Print summary
|
||||
print(f"\n[SUCCESS] Conversion complete!")
|
||||
print(f"[INFO] Output rows: {len(df_output)}")
|
||||
print(f"[INFO] Unique fields: {df_output['field'].nunique()}")
|
||||
print(f"\n[INFO] Sample output:")
|
||||
print(df_output.head(10).to_string(index=False))
|
||||
|
||||
return True
|
||||
|
||||
except Exception as e:
|
||||
print(f"[ERROR] Conversion failed: {e}")
|
||||
import traceback
|
||||
traceback.print_exc()
|
||||
return False
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
# Get project from command line or use default
|
||||
project = sys.argv[1] if len(sys.argv) > 1 else "angata"
|
||||
|
||||
success = convert_harvest_format(project)
|
||||
sys.exit(0 if success else 1)
|
||||
|
|
@ -1,4 +1,4 @@
|
|||
"""
|
||||
r"""
|
||||
Script: 02_harvest_imminent_weekly.py
|
||||
Purpose: WEEKLY MONITORING - Run WEEKLY/DAILY to get real-time harvest status for all fields
|
||||
|
||||
|
|
@ -7,18 +7,19 @@ is approaching harvest. Use this for operational decision-making and real-time a
|
|||
|
||||
RUN FREQUENCY: Weekly (or daily if required)
|
||||
INPUT:
|
||||
- ci_data_for_python.csv (recent CI data from 02b_convert_rds_to_csv.R)
|
||||
- harvest.xlsx (baseline from scripts 22+23 - contains last harvest date per field)
|
||||
Location: laravel_app/storage/app/{project}/Data/harvest.xlsx
|
||||
- ci_data_for_python.csv (complete CI data from R script)
|
||||
Location: laravel_app/storage/app/{project}/Data/extracted_ci/ci_data_for_python/ci_data_for_python.csv
|
||||
- harvest_production_export.xlsx (baseline from script 01 - optional, for reference)
|
||||
OUTPUT:
|
||||
- harvest_imminent_weekly.csv (weekly probabilities: field, imminent_prob, detected_prob, week, year)
|
||||
- reports/kpis/field_stats/{project}_harvest_imminent_week_{WW}_{YYYY}.csv (weekly probabilities: field, imminent_prob, detected_prob, week, year)
|
||||
|
||||
Workflow:
|
||||
1. Load harvest_production_export.xlsx (baseline dates - optional, for context)
|
||||
2. Load ci_data_for_python.csv (recent CI data)
|
||||
3. For each field, extract last 300 days of history
|
||||
4. Run Model 307 inference on full sequence (last timestep probabilities)
|
||||
5. Export harvest_imminent_weekly.csv with probabilities
|
||||
1. Load harvest.xlsx to find last harvest date (season_end) per field
|
||||
2. Load ci_data_for_python.csv (complete CI data)
|
||||
3. For each field, extract all CI data AFTER last harvest (complete current season)
|
||||
4. Run Model 307 inference on full season sequence (last timestep probabilities)
|
||||
5. Export week_WW_YYYY.csv with probabilities
|
||||
|
||||
Output Columns:
|
||||
- field: Field ID
|
||||
|
|
@ -37,12 +38,12 @@ Use Cases:
|
|||
- Feed into 09b script for weekly dashboard reports
|
||||
|
||||
Usage:
|
||||
python 02_harvest_imminent_weekly.py [project_name]
|
||||
python python_app/31_harvest_imminent_weekly.py angata
|
||||
|
||||
Examples:
|
||||
python 02_harvest_imminent_weekly.py angata
|
||||
python 02_harvest_imminent_weekly.py esa
|
||||
python 02_harvest_imminent_weekly.py chemba
|
||||
python python_app/31_harvest_imminent_weekly.py angata
|
||||
python python_app/31_harvest_imminent_weekly.py esa
|
||||
python python_app/31_harvest_imminent_weekly.py chemba
|
||||
|
||||
If no project specified, defaults to 'angata'
|
||||
"""
|
||||
|
|
@ -61,33 +62,34 @@ from harvest_date_pred_utils import (
|
|||
|
||||
|
||||
def load_harvest_dates(harvest_file):
|
||||
"""Load latest harvest end dates from Excel file (from harvest_production_export.xlsx)."""
|
||||
print("[1/5] Loading harvest dates...")
|
||||
"""Load last harvest end dates from harvest.xlsx (output from scripts 22+23)."""
|
||||
print("[1/5] Loading harvest data for season boundaries...")
|
||||
|
||||
if not Path(harvest_file).exists():
|
||||
print(f" ERROR: {harvest_file} not found")
|
||||
print(" Using 180-day lookback as default")
|
||||
print(f" harvest.xlsx is required to determine current season boundaries")
|
||||
return None
|
||||
|
||||
try:
|
||||
harvest_df = pd.read_excel(harvest_file)
|
||||
print(f" Loaded {len(harvest_df)} field-season records")
|
||||
print(f" Loaded {len(harvest_df)} season records")
|
||||
|
||||
# Use season_end_date column (output from harvest prediction script)
|
||||
harvest_df['season_end_date'] = pd.to_datetime(harvest_df['season_end_date'])
|
||||
# season_end contains the last harvest date for each season
|
||||
harvest_df['season_end'] = pd.to_datetime(harvest_df['season_end'])
|
||||
harvest_df['field'] = harvest_df['field'].astype(str).str.strip()
|
||||
|
||||
# Group by field and get the latest season_end_date
|
||||
# Group by field and get the LATEST season_end_date (most recent harvest)
|
||||
# This marks the start of the current season
|
||||
harvest_dates = {}
|
||||
for field_id, group in harvest_df.groupby('field'):
|
||||
latest_end = group['season_end_date'].max()
|
||||
harvest_dates[str(field_id).strip()] = latest_end
|
||||
latest_harvest = group['season_end'].max()
|
||||
harvest_dates[field_id] = latest_harvest
|
||||
|
||||
print(f" Successfully mapped {len(harvest_dates)} fields")
|
||||
print(f" Harvest end dates range: {min(harvest_dates.values()).date()} to {max(harvest_dates.values()).date()}")
|
||||
print(f" Last harvest dates range: {min(harvest_dates.values()).date()} to {max(harvest_dates.values()).date()}")
|
||||
return harvest_dates
|
||||
except Exception as e:
|
||||
print(f" ERROR loading harvest file: {e}")
|
||||
print(f" Using 180-day lookback instead")
|
||||
print(f" ERROR loading harvest.xlsx: {e}")
|
||||
return None
|
||||
|
||||
|
||||
|
|
@ -212,26 +214,37 @@ def main():
|
|||
# Get project name from command line or use default
|
||||
project_name = sys.argv[1] if len(sys.argv) > 1 else "angata"
|
||||
|
||||
# Construct paths
|
||||
# Construct paths - work from either python_app/ or root smartcane/ directory
|
||||
# Try root first (laravel_app/...), then fall back to ../ (running from python_app/)
|
||||
if Path("laravel_app/storage/app").exists():
|
||||
base_storage = Path("laravel_app/storage/app") / project_name / "Data"
|
||||
else:
|
||||
base_storage = Path("../laravel_app/storage/app") / project_name / "Data"
|
||||
|
||||
ci_data_dir = base_storage / "extracted_ci" / "ci_data_for_python"
|
||||
CI_DATA_FILE = ci_data_dir / "ci_data_for_python.csv"
|
||||
harvest_data_dir = base_storage / "HarvestData"
|
||||
BASELINE_FILE = harvest_data_dir / "harvest_production_export.xlsx"
|
||||
OUTPUT_CSV = harvest_data_dir / "harvest_imminent_weekly.csv"
|
||||
harvest_data_dir.mkdir(parents=True, exist_ok=True) # Create if doesn't exist
|
||||
HARVEST_FILE = base_storage / "harvest.xlsx" # Output from scripts 22+23
|
||||
|
||||
# Determine week and year from current date for timestamped export
|
||||
today = datetime.now()
|
||||
week_num = int(today.strftime('%V'))
|
||||
year_num = int(today.strftime('%Y'))
|
||||
|
||||
# Output directory: reports/kpis/field_stats/
|
||||
reports_dir = base_storage.parent / "reports" / "kpis" / "field_stats"
|
||||
reports_dir.mkdir(parents=True, exist_ok=True)
|
||||
OUTPUT_CSV = reports_dir / f"{project_name}_harvest_imminent_week_{week_num:02d}_{year_num}.csv"
|
||||
|
||||
print("="*80)
|
||||
print(f"HARVEST IMMINENT PROBABILITY - WEEKLY MONITORING ({project_name})")
|
||||
print("="*80)
|
||||
|
||||
# [1] Load harvest dates (optional - for projects with predictions)
|
||||
harvest_dates = None
|
||||
if BASELINE_FILE.exists():
|
||||
harvest_dates = load_harvest_dates(BASELINE_FILE)
|
||||
else:
|
||||
print("[1/5] Loading harvest dates...")
|
||||
print(f" INFO: {BASELINE_FILE} not found (optional for weekly monitoring)")
|
||||
# [1] Load harvest dates (required to determine season boundaries)
|
||||
harvest_dates = load_harvest_dates(HARVEST_FILE)
|
||||
if harvest_dates is None or len(harvest_dates) == 0:
|
||||
print(f"ERROR: Cannot run without harvest.xlsx - required to determine current season boundaries")
|
||||
print(f" Please run scripts 22 (baseline prediction) and 23 (format conversion) first")
|
||||
return
|
||||
|
||||
# [2] Load CI data
|
||||
print(f"\n[2/5] Loading CI data...")
|
||||
|
|
@ -251,7 +264,7 @@ def main():
|
|||
|
||||
# [3] Load model (from python_app directory)
|
||||
print("\n[3/5] Loading Model 307...")
|
||||
model_dir = Path(".") # Current directory is python_app/, contains model.pt, config.json, scalers.pkl
|
||||
model_dir = Path("python_app") # Model files located in python_app/ directory
|
||||
model, config, scalers = load_model_and_config(model_dir)
|
||||
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
||||
print(f" Device: {device}")
|
||||
|
|
@ -271,6 +284,9 @@ def main():
|
|||
|
||||
count = 0
|
||||
for field_id in ci_data['field'].unique():
|
||||
# Convert field_id to string for consistency
|
||||
field_id_str = str(field_id).strip()
|
||||
|
||||
# Get metadata
|
||||
meta = field_meta[field_meta['field'] == field_id]
|
||||
if len(meta) == 0:
|
||||
|
|
@ -279,18 +295,21 @@ def main():
|
|||
sub_field = meta['sub_field'].iloc[0]
|
||||
latest_date = meta['latest_date'].iloc[0]
|
||||
|
||||
# Use recent CI history (last 300 days from latest available data)
|
||||
# Get last harvest date for this field (start of current season)
|
||||
last_harvest = harvest_dates.get(field_id_str)
|
||||
if last_harvest is None:
|
||||
continue
|
||||
|
||||
# Extract all CI data AFTER last harvest (complete current season)
|
||||
field_data = ci_data[ci_data['field'] == field_id].copy()
|
||||
field_data = field_data.sort_values('Date')
|
||||
field_data = field_data[field_data['Date'] > last_harvest] # After last harvest
|
||||
|
||||
# Keep last 300 days of history for inference
|
||||
if len(field_data) > 300:
|
||||
field_data = field_data.iloc[-300:]
|
||||
|
||||
# Need at least 30 days of data since planting
|
||||
if len(field_data) < 30:
|
||||
continue
|
||||
|
||||
# Run inference on recent history to predict next 28 days
|
||||
# Run inference on full current season to predict next 28 days
|
||||
imminent_prob, detected_prob = run_inference_on_season(
|
||||
field_data, model, config, scalers, device, ci_column
|
||||
)
|
||||
|
|
@ -338,10 +357,11 @@ def main():
|
|||
print(f" WARNING: No results exported - check CI data availability")
|
||||
|
||||
print(f"\nStorage structure:")
|
||||
print(f" Input harvest: laravel_app/storage/app/{project_name}/Data/harvest.xlsx")
|
||||
print(f" Input CI: laravel_app/storage/app/{project_name}/Data/extracted_ci/ci_data_for_python/")
|
||||
print(f" Input baseline: laravel_app/storage/app/{project_name}/Data/HarvestData/harvest_production_export.xlsx")
|
||||
print(f" Output: laravel_app/storage/app/{project_name}/Data/HarvestData/")
|
||||
print(f"\nReady to load into 09b field analysis report")
|
||||
print(f" Output: laravel_app/storage/app/{project_name}/reports/kpis/field_stats/")
|
||||
print(f" Filename: {project_name}_harvest_imminent_week_{week_num:02d}_{year_num}.csv")
|
||||
print(f"\nReady to load into 80_calculate_kpis.R")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
|
|
|||
366
python_app/batch_rgb_validation_top_fields_v3.py
Normal file
366
python_app/batch_rgb_validation_top_fields_v3.py
Normal file
|
|
@ -0,0 +1,366 @@
|
|||
#!/usr/bin/env python
|
||||
"""
|
||||
Batch RGB Validation for Top 100 Largest Fields - V3
|
||||
|
||||
Same as v1 but with dynamic image selection (checks for actual data, skips empty/black images).
|
||||
|
||||
Generates 5x3 RGB temporal grids for the latest complete harvest season of the 100 largest fields.
|
||||
Uses actual season_end dates from harvest.xlsx for visual validation of field conditions at harvest.
|
||||
|
||||
Configuration:
|
||||
- GeoJSON: pivot.geojson (defines field boundaries and sizes)
|
||||
- Harvest data: harvest.xlsx (season_end dates for completed harvests)
|
||||
- Output: RGB directory with field_name_YYYYMMDD_harvest_rgb.png
|
||||
|
||||
Usage:
|
||||
python batch_rgb_validation_top_fields_v3.py --field 1
|
||||
python batch_rgb_validation_top_fields_v3.py
|
||||
|
||||
Output:
|
||||
- Saves 5x3 RGB grids to: laravel_app/storage/app/angata/RGB/
|
||||
- Filenames: field_<NAME>_<YYYYMMDD>_harvest_rgb.png
|
||||
- Each grid shows 15 images around the harvest date (dynamic date selection, skips empty images)
|
||||
"""
|
||||
|
||||
import json
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
from pathlib import Path
|
||||
from datetime import datetime, timedelta
|
||||
import sys
|
||||
import argparse
|
||||
|
||||
# Add parent directory to path for imports
|
||||
sys.path.insert(0, str(Path(__file__).parent))
|
||||
|
||||
from rgb_visualization import generate_rgb_grids
|
||||
|
||||
|
||||
def load_geojson_and_calculate_areas(geojson_path):
|
||||
"""
|
||||
Load GeoJSON and calculate area for each field.
|
||||
|
||||
Returns:
|
||||
pd.DataFrame: Columns [field, field_name, area_m2] sorted by area descending
|
||||
"""
|
||||
geojson_path = Path(geojson_path)
|
||||
|
||||
if not geojson_path.exists():
|
||||
print(f"✗ GeoJSON not found: {geojson_path}")
|
||||
return None
|
||||
|
||||
print(f"Loading GeoJSON: {geojson_path}")
|
||||
|
||||
with open(geojson_path) as f:
|
||||
geojson_data = json.load(f)
|
||||
|
||||
fields = []
|
||||
|
||||
for feature in geojson_data.get('features', []):
|
||||
props = feature.get('properties', {})
|
||||
field_id = str(props.get('field', ''))
|
||||
field_name = props.get('name', f"field_{field_id}")
|
||||
|
||||
geometry = feature.get('geometry', {})
|
||||
geom_type = geometry.get('type', '')
|
||||
coordinates = geometry.get('coordinates', [])
|
||||
|
||||
# Simple area calculation using Shoelace formula
|
||||
area_m2 = 0
|
||||
if geom_type == 'Polygon' and coordinates:
|
||||
coords = coordinates[0] # Exterior ring
|
||||
area_m2 = calculate_polygon_area(coords)
|
||||
elif geom_type == 'MultiPolygon' and coordinates:
|
||||
for poly_coords in coordinates:
|
||||
area_m2 += calculate_polygon_area(poly_coords[0])
|
||||
|
||||
if area_m2 > 0:
|
||||
fields.append({
|
||||
'field': field_id,
|
||||
'field_name': field_name,
|
||||
'area_m2': area_m2,
|
||||
'area_hectares': area_m2 / 10000
|
||||
})
|
||||
|
||||
df = pd.DataFrame(fields)
|
||||
df = df.sort_values('area_m2', ascending=False)
|
||||
|
||||
print(f" ✓ Loaded {len(df)} fields")
|
||||
print(f" Top 10 largest fields (hectares):")
|
||||
for i, row in df.head(10).iterrows():
|
||||
print(f" {row['field_name']:30s} ({row['field']:>6s}): {row['area_hectares']:>8.2f} ha")
|
||||
|
||||
return df
|
||||
|
||||
|
||||
def calculate_polygon_area(coords):
|
||||
"""
|
||||
Calculate area of polygon using Shoelace formula (in m²).
|
||||
Assumes coordinates are in lat/lon (roughly converts to meters).
|
||||
"""
|
||||
if len(coords) < 3:
|
||||
return 0
|
||||
|
||||
# Rough conversion: at equator, 1 degree ≈ 111 km
|
||||
# For lat/lon coordinates, use average latitude
|
||||
lats = [c[1] for c in coords]
|
||||
avg_lat = np.mean(lats)
|
||||
lat_m_per_deg = 111000
|
||||
lon_m_per_deg = 111000 * np.cos(np.radians(avg_lat))
|
||||
|
||||
# Convert to meters
|
||||
coords_m = []
|
||||
for lon, lat in coords:
|
||||
x = (lon - coords[0][0]) * lon_m_per_deg
|
||||
y = (lat - coords[0][1]) * lat_m_per_deg
|
||||
coords_m.append((x, y))
|
||||
|
||||
# Shoelace formula
|
||||
area = 0
|
||||
for i in range(len(coords_m)):
|
||||
j = (i + 1) % len(coords_m)
|
||||
area += coords_m[i][0] * coords_m[j][1]
|
||||
area -= coords_m[j][0] * coords_m[i][1]
|
||||
|
||||
return abs(area) / 2
|
||||
|
||||
|
||||
def load_harvest_dates_from_xlsx(harvest_xlsx_path, top_50_fields_df):
|
||||
"""
|
||||
Load harvest data from Excel file and get latest completed season for each field.
|
||||
|
||||
Returns season_end date for each field (latest complete season where season_end is not null).
|
||||
|
||||
Args:
|
||||
harvest_xlsx_path (Path): Path to harvest.xlsx
|
||||
top_50_fields_df (pd.DataFrame): DataFrame with 'field' column for filtering
|
||||
|
||||
Returns:
|
||||
dict: {field_id: {'field_name': str, 'harvest_date': pd.Timestamp}}
|
||||
"""
|
||||
harvest_xlsx_path = Path(harvest_xlsx_path)
|
||||
|
||||
if not harvest_xlsx_path.exists():
|
||||
print(f"✗ Harvest Excel file not found: {harvest_xlsx_path}")
|
||||
return {}
|
||||
|
||||
print(f"Loading harvest data: {harvest_xlsx_path}")
|
||||
|
||||
try:
|
||||
harvest_df = pd.read_excel(harvest_xlsx_path)
|
||||
|
||||
# Ensure date columns are datetime
|
||||
if 'season_end' in harvest_df.columns:
|
||||
harvest_df['season_end'] = pd.to_datetime(harvest_df['season_end'], errors='coerce')
|
||||
|
||||
# Filter to top 50 fields and get only rows with season_end filled in
|
||||
top_50_field_ids = set(top_50_fields_df['field'].astype(str).str.strip())
|
||||
harvest_df['field'] = harvest_df['field'].astype(str).str.strip()
|
||||
harvest_df = harvest_df[harvest_df['field'].isin(top_50_field_ids)]
|
||||
harvest_df = harvest_df[harvest_df['season_end'].notna()]
|
||||
|
||||
# Group by field and get the LATEST (most recent) season_end
|
||||
latest_harvests = {}
|
||||
|
||||
for field_id in top_50_field_ids:
|
||||
field_records = harvest_df[harvest_df['field'] == field_id]
|
||||
|
||||
if len(field_records) > 0:
|
||||
# Get row with latest season_end
|
||||
latest_idx = field_records['season_end'].idxmax()
|
||||
latest_row = field_records.loc[latest_idx]
|
||||
|
||||
# Get field name from top_50_fields_df
|
||||
field_info = top_50_fields_df[top_50_fields_df['field'] == field_id]
|
||||
if len(field_info) > 0:
|
||||
field_name = field_info.iloc[0]['field_name']
|
||||
else:
|
||||
field_name = f"field_{field_id}"
|
||||
|
||||
latest_harvests[field_id] = {
|
||||
'field_name': field_name,
|
||||
'harvest_date': latest_row['season_end']
|
||||
}
|
||||
|
||||
print(f" ✓ Loaded latest complete seasons for {len(latest_harvests)} fields")
|
||||
|
||||
return latest_harvests
|
||||
|
||||
except Exception as e:
|
||||
print(f"✗ Error loading harvest data: {e}")
|
||||
return {}
|
||||
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(description='RGB validation of harvest dates using satellite imagery (v3 - dynamic)')
|
||||
parser.add_argument('--field', type=str, default=None, help='Specific field ID to validate (e.g., "1" or "10022")')
|
||||
parser.add_argument('--project', type=str, default='angata', help='Project name (default: angata)')
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
print("="*90)
|
||||
if args.field:
|
||||
print(f"RGB VALIDATION V3 - SINGLE FIELD: {args.field}")
|
||||
else:
|
||||
print("RGB VALIDATION V3 - TOP 50 LARGEST FIELDS")
|
||||
print("Visual inspection of harvest dates from harvest.xlsx using RGB imagery (dynamic selection)")
|
||||
print("="*90)
|
||||
|
||||
# Configuration
|
||||
project = args.project
|
||||
geojson_path = Path(f"laravel_app/storage/app/{project}/Data/pivot.geojson")
|
||||
harvest_xlsx = Path(f"laravel_app/storage/app/{project}/Data/harvest.xlsx")
|
||||
output_dir = Path(f"laravel_app/storage/app/{project}/RGB")
|
||||
tiff_dir = Path(f"laravel_app/storage/app/{project}/merged_final_tif/5x5")
|
||||
|
||||
# Verify paths
|
||||
if not geojson_path.exists():
|
||||
print(f"✗ GeoJSON not found: {geojson_path}")
|
||||
return
|
||||
if not harvest_xlsx.exists():
|
||||
print(f"✗ Harvest Excel not found: {harvest_xlsx}")
|
||||
return
|
||||
if not tiff_dir.exists():
|
||||
print(f"✗ TIFF directory not found: {tiff_dir}")
|
||||
return
|
||||
|
||||
output_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# Handle single field mode
|
||||
if args.field:
|
||||
print(f"\n[1/3] Loading harvest data for field {args.field}...")
|
||||
|
||||
harvest_df = pd.read_excel(harvest_xlsx)
|
||||
harvest_df['season_end'] = pd.to_datetime(harvest_df['season_end'], errors='coerce')
|
||||
harvest_df['field'] = harvest_df['field'].astype(str).str.strip()
|
||||
|
||||
field_records = harvest_df[harvest_df['field'] == args.field]
|
||||
field_records = field_records[field_records['season_end'].notna()]
|
||||
|
||||
if len(field_records) == 0:
|
||||
print(f"✗ No harvest data found for field {args.field}")
|
||||
return
|
||||
|
||||
# Get latest harvest for this field
|
||||
latest_idx = field_records['season_end'].idxmax()
|
||||
latest_row = field_records.loc[latest_idx]
|
||||
harvest_date = latest_row['season_end']
|
||||
|
||||
print(f" ✓ Found harvest: {harvest_date.strftime('%Y-%m-%d')}")
|
||||
|
||||
# Load field name from GeoJSON
|
||||
print(f"\n[2/3] Loading field name from GeoJSON...")
|
||||
with open(geojson_path) as f:
|
||||
geojson_data = json.load(f)
|
||||
|
||||
field_name = f"field_{args.field}"
|
||||
for feature in geojson_data.get('features', []):
|
||||
props = feature.get('properties', {})
|
||||
if str(props.get('field', '')) == args.field:
|
||||
field_name = props.get('name', field_name)
|
||||
break
|
||||
|
||||
print(f" ✓ Field name: {field_name}")
|
||||
|
||||
# Generate RGB grid
|
||||
print(f"\n[3/3] Generating RGB validation grid (v3 dynamic)...")
|
||||
results = generate_rgb_grids(
|
||||
field_data=None,
|
||||
field_id=args.field,
|
||||
registered_harvest_dates=[],
|
||||
predicted_harvest_dates=[
|
||||
{
|
||||
'harvest_date': harvest_date,
|
||||
'model_name': 'harvest_xlsx'
|
||||
}
|
||||
],
|
||||
output_dir=str(output_dir),
|
||||
tiff_dir=str(tiff_dir),
|
||||
geojson_path=str(geojson_path)
|
||||
)
|
||||
|
||||
print("\n" + "="*90)
|
||||
if results['predicted']:
|
||||
print(f"✓ RGB grid generated successfully!")
|
||||
print(f" Field: {field_name} (ID: {args.field})")
|
||||
print(f" Harvest date: {harvest_date.strftime('%Y-%m-%d')}")
|
||||
print(f" Output: {output_dir}")
|
||||
else:
|
||||
print(f"⚠ No RGB grid generated (no imagery available)")
|
||||
print("="*90)
|
||||
return
|
||||
|
||||
# Batch mode for top 100 fields
|
||||
print(f"\n[1/4] Loading GeoJSON and identifying top 100 largest fields...")
|
||||
|
||||
fields_df = load_geojson_and_calculate_areas(geojson_path)
|
||||
if fields_df is None:
|
||||
return
|
||||
|
||||
top_100_fields = fields_df.head(100)
|
||||
print(f" ✓ Selected {len(top_100_fields)} largest fields for processing")
|
||||
|
||||
# Step 2: Load harvest dates from Excel
|
||||
print("\n[2/4] Loading harvest dates from Excel (latest complete seasons)...")
|
||||
harvest_dates = load_harvest_dates_from_xlsx(harvest_xlsx, top_100_fields)
|
||||
|
||||
if len(harvest_dates) == 0:
|
||||
print("✗ No harvest dates found in Excel file")
|
||||
return
|
||||
|
||||
print(f" ✓ Found {len(harvest_dates)} fields with completed seasons")
|
||||
for field_id, info in list(harvest_dates.items())[:5]:
|
||||
print(f" - {info['field_name']:30s}: {info['harvest_date'].strftime('%Y-%m-%d')}")
|
||||
if len(harvest_dates) > 5:
|
||||
print(f" ... and {len(harvest_dates) - 5} more")
|
||||
|
||||
# Step 3: Generate RGB grids for each field
|
||||
print("\n[3/4] Generating RGB validation grids (v3 dynamic)...")
|
||||
rgb_count = 0
|
||||
|
||||
for idx, (field_id, harvest_info) in enumerate(harvest_dates.items(), 1):
|
||||
field_name = harvest_info['field_name']
|
||||
harvest_date = harvest_info['harvest_date']
|
||||
|
||||
try:
|
||||
# Run RGB visualization (harvest dates only, no registered/predicted distinction)
|
||||
results = generate_rgb_grids(
|
||||
field_data=None, # Not needed - just for function compatibility
|
||||
field_id=field_id,
|
||||
registered_harvest_dates=[], # Empty - using harvest.xlsx instead
|
||||
predicted_harvest_dates=[
|
||||
{
|
||||
'harvest_date': harvest_date,
|
||||
'model_name': 'harvest_xlsx'
|
||||
}
|
||||
],
|
||||
output_dir=str(output_dir), # All PNGs in same folder
|
||||
tiff_dir=str(tiff_dir),
|
||||
geojson_path=str(geojson_path)
|
||||
)
|
||||
|
||||
if results['predicted']:
|
||||
rgb_count += 1
|
||||
print(f" [{idx:2d}/{len(harvest_dates)}] {field_name}: ✓ {harvest_date.strftime('%Y-%m-%d')}")
|
||||
else:
|
||||
print(f" [{idx:2d}/{len(harvest_dates)}] {field_name}: ⚠ No RGB grid (no imagery available)")
|
||||
|
||||
except Exception as e:
|
||||
print(f" [{idx:2d}/{len(harvest_dates)}] {field_name}: ✗ Error - {e}")
|
||||
|
||||
# Summary
|
||||
print("\n" + "="*90)
|
||||
print(f"SUMMARY:")
|
||||
print(f" Fields with harvest dates: {len(harvest_dates)}")
|
||||
print(f" RGB grids generated: {rgb_count}/{len(harvest_dates)}")
|
||||
print(f" Output directory: {output_dir}")
|
||||
print("="*90)
|
||||
print("\nVisual inspection checklist:")
|
||||
print(" ✓ Brown/bare soil at T~0d (harvest date) = Field properly harvested")
|
||||
print(" ⚠ Green vegetation at T~0d = Possible data error or replanting")
|
||||
print(" ✓ Green → Brown progression = Normal harvest sequence")
|
||||
print("="*90)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
193
python_app/create_rgb_evaluation_template.py
Normal file
193
python_app/create_rgb_evaluation_template.py
Normal file
|
|
@ -0,0 +1,193 @@
|
|||
"""
|
||||
Create an Excel evaluation template for RGB harvest date predictions.
|
||||
Parses field names and dates directly from RGB image filenames.
|
||||
"""
|
||||
import os
|
||||
import glob
|
||||
import pandas as pd
|
||||
from openpyxl import Workbook
|
||||
from openpyxl.styles import Font, PatternFill, Alignment, Border, Side
|
||||
from openpyxl.utils import get_column_letter
|
||||
import re
|
||||
from datetime import datetime
|
||||
|
||||
# Configuration: paths are resolved relative to the repository root, which is
# two directory levels up from this script's own location.
BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
# Directory containing the generated RGB validation grids (PNG files).
RGB_DIR = os.path.join(BASE_DIR, "laravel_app", "storage", "app", "angata", "RGB")
# Destination of the Excel template produced by this script.
OUTPUT_PATH = os.path.join(BASE_DIR, "laravel_app", "storage", "app", "angata", "RGB_Evaluation_Template.xlsx")

# Evaluators: one response column per person in the generated sheet.
EVALUATORS = ["Joey", "Daniel", "Nik", "Dimitra", "Timon"]
|
||||
|
||||
def parse_rgb_filenames(rgb_dir=None):
    """
    Parse field ids and harvest dates from RGB image filenames.

    Expected filename format:
        field_{field_id or name}_{YYYYMMDD}_harvest_xlsx_harvest_rgb.png

    Args:
        rgb_dir: Directory to scan for PNG files. Defaults to the module-level
            RGB_DIR, so existing callers are unaffected.

    Returns:
        List of dicts with keys 'field_id', 'harvest_date' (formatted as
        YYYY-MM-DD, or the raw 8-digit string when it cannot be parsed) and
        'filename'. Sorted by field id: purely numeric ids first in numeric
        order (so "2" precedes "10"), then non-numeric ids (e.g. "91&92")
        alphabetically.
    """
    if rgb_dir is None:
        rgb_dir = RGB_DIR

    fields_data = []

    # Find all RGB images; non-matching filenames are silently skipped.
    for filepath in sorted(glob.glob(os.path.join(rgb_dir, "field_*.png"))):
        filename = os.path.basename(filepath)
        # Pattern: field_{field_id}_{YYYYMMDD}_harvest_xlsx_harvest_rgb.png
        match = re.match(r"field_(.+?)_(\d{8})_harvest_xlsx_harvest_rgb\.png", filename)
        if not match:
            continue

        field_id = match.group(1)  # e.g., "1000" or "91&92"
        date_str = match.group(2)  # e.g., "20250814"

        # Format date as YYYY-MM-DD; fall back to the raw token if malformed.
        try:
            harvest_date = datetime.strptime(date_str, "%Y%m%d").strftime("%Y-%m-%d")
        except ValueError:
            harvest_date = date_str

        fields_data.append({
            "field_id": field_id,
            "harvest_date": harvest_date,
            "filename": filename,
        })

    # FIX: the previous key only stripped a hard-coded "&92" suffix and then
    # compared strings, so "10" sorted before "2" and only one specific merged
    # field was handled. Sort all-digit ids numerically, everything else after
    # them alphabetically.
    def _sort_key(item):
        fid = item["field_id"]
        return (0, int(fid), "") if fid.isdigit() else (1, 0, fid)

    return sorted(fields_data, key=_sort_key)
|
||||
|
||||
def create_evaluation_template():
    """
    Create the Excel evaluation template and save it to OUTPUT_PATH.

    Builds a two-sheet workbook:
      - "Evaluation": one row per RGB image found by parse_rgb_filenames(),
        with Field ID, Predicted Harvest Date, and an empty response column
        per name in EVALUATORS.
      - "Instructions": static reviewer guidance text.

    Reads module-level RGB_DIR / OUTPUT_PATH / EVALUATORS. Prints progress to
    stdout and returns None; returns early (without writing a file) when no
    RGB images are found.
    """
    print("Loading field data from RGB images...")
    fields_data = parse_rgb_filenames()

    if not fields_data:
        print("ERROR: No RGB images found in", RGB_DIR)
        return

    print(f"Found {len(fields_data)} RGB images")

    # Create workbook
    wb = Workbook()

    # === SHEET 1: Evaluation Form ===
    ws_eval = wb.active
    ws_eval.title = "Evaluation"

    # Define styles (blue header bar, thin grid borders)
    header_fill = PatternFill(start_color="4472C4", end_color="4472C4", fill_type="solid")
    header_font = Font(bold=True, color="FFFFFF", size=11)
    border = Border(
        left=Side(style='thin'),
        right=Side(style='thin'),
        top=Side(style='thin'),
        bottom=Side(style='thin')
    )
    center_align = Alignment(horizontal="center", vertical="center", wrap_text=True)
    left_align = Alignment(horizontal="left", vertical="center", wrap_text=True)

    # Column headers: two fixed columns followed by one column per evaluator
    headers = ["Field ID", "Predicted Harvest Date"] + EVALUATORS
    for col_idx, header in enumerate(headers, start=1):
        cell = ws_eval.cell(row=1, column=col_idx, value=header)
        cell.fill = header_fill
        cell.font = header_font
        cell.alignment = center_align
        cell.border = border

    # Set column widths (evaluator columns start at column C)
    ws_eval.column_dimensions['A'].width = 15
    ws_eval.column_dimensions['B'].width = 20
    for col_idx in range(3, 3 + len(EVALUATORS)):
        ws_eval.column_dimensions[get_column_letter(col_idx)].width = 12

    # Add data rows (row 1 is the header, so data starts at row 2)
    for row_idx, field in enumerate(fields_data, start=2):
        ws_eval.cell(row=row_idx, column=1, value=field["field_id"])
        ws_eval.cell(row=row_idx, column=2, value=field["harvest_date"])

        # Add empty cells for evaluator responses
        for col_idx in range(3, 3 + len(EVALUATORS)):
            cell = ws_eval.cell(row=row_idx, column=col_idx)
            cell.alignment = center_align
            cell.border = border

        # Light alternating row color on even rows for readability
        if row_idx % 2 == 0:
            for col_idx in range(1, 3 + len(EVALUATORS)):
                ws_eval.cell(row=row_idx, column=col_idx).fill = PatternFill(
                    start_color="D9E8F5", end_color="D9E8F5", fill_type="solid"
                )

        # Apply borders to all data cells; the two id/date columns are
        # left-aligned (overriding the center alignment set above)
        for col_idx in range(1, 3 + len(EVALUATORS)):
            ws_eval.cell(row=row_idx, column=col_idx).border = border
            if col_idx == 1 or col_idx == 2:
                ws_eval.cell(row=row_idx, column=col_idx).alignment = left_align

    # Freeze panes so the header row and the two id/date columns stay visible
    ws_eval.freeze_panes = "C2"

    # === SHEET 2: Instructions ===
    ws_instr = wb.create_sheet("Instructions")

    # Static guidance text: list of [column A text, column B text] rows
    instr_content = [
        ["RGB Evaluation Instructions", ""],
        ["", ""],
        ["Overview:", ""],
        ["The generated RGB images visualize the predicted harvest dates for each field.", ""],
        ["The images are 3x3 grids showing satellite imagery from different dates", ""],
        ["centered on the predicted harvest date (the center/red-box image).", ""],
        ["", ""],
        ["What to Evaluate:", ""],
        ["For each field, determine if the predicted harvest date is CORRECT:", ""],
        ["", ""],
        ["Instructions for Reviewing:", ""],
        ["1. Look at the CENTER image (red box) - this is the predicted harvest date", ""],
        ["2. Compare to surrounding dates (before and after)", ""],
        ["3. Look for change in field color/status:", ""],
        ["   • BEFORE: Field appears GREEN (growing/healthy crop)", ""],
        ["   • AT PREDICTED DATE: Field shows BROWN/YELLOW (soil visible, ripe for harvest)", ""],
        ["   • AFTER: Field continues BROWN (post-harvest or dormant)", ""],
        ["", ""],
        ["How to Enter Your Assessment:", ""],
        ["Enter one of the following in your evaluator column for each field:", ""],
        ["   • YES = Predicted date is CORRECT (brown/harvest-ready at center date)", ""],
        ["   • NO = Predicted date is INCORRECT (not ready or already harvested)", ""],
        ["   • ? or MAYBE = Uncertain (cloudy images, unclear field status)", ""],
        ["", ""],
        ["Workflow Options:", ""],
        ["Option A (Divide Work): Assign 2-3 fields per evaluator (rows divided by column)", ""],
        ["Option B (Full Review): Each evaluator reviews all fields (everyone fills all rows)", ""],
        ["Option C (Spot Check): Each evaluator samples 5-10 random fields", ""],
        ["", ""],
        ["Image Location:", ""],
        ["All RGB images are in: /laravel_app/storage/app/angata/RGB/", ""],
        ["Format: field_{FIELD_ID}_{YYYY-MM-DD}_harvest_xlsx_harvest_rgb.png", ""],
        ["", ""],
        ["Notes:", ""],
        ["• Cloud cover may obscure ground truth - use best judgment", ""],
        ["• Fields with multiple bands or irregular shapes: focus on dominant area", ""],
        ["• Use context from previous/next dates to validate your assessment", ""],
    ]

    # Add instructions text: the first row is the title, section headings get
    # bold formatting (detected by keyword match on the cell text)
    for row_idx, row_data in enumerate(instr_content, start=1):
        for col_idx, value in enumerate(row_data, start=1):
            cell = ws_instr.cell(row=row_idx, column=col_idx, value=value)
            if row_idx == 1:
                cell.font = Font(bold=True, size=14)
            elif any(keyword in str(value) for keyword in ["Overview:", "Instructions", "Workflow", "Image Location", "Notes"]):
                cell.font = Font(bold=True, size=11)
            cell.alignment = Alignment(horizontal="left", vertical="top", wrap_text=True)

    ws_instr.column_dimensions['A'].width = 50
    ws_instr.column_dimensions['B'].width = 80

    # Save workbook
    wb.save(OUTPUT_PATH)
    print(f"✓ Evaluation template created: {OUTPUT_PATH}")
    print(f"✓ {len(fields_data)} fields added to evaluation form")
    print(f"✓ Evaluators: {', '.join(EVALUATORS)}")
|
||||
|
||||
if __name__ == "__main__":
|
||||
create_evaluation_template()
|
||||
|
|
@ -4,7 +4,7 @@ Purpose: Download Planet satellite data for missing dates only (skip existing fi
|
|||
Can be called from batch scripts or other Python scripts.
|
||||
|
||||
Usage:
|
||||
python download_planet_missing_dates.py --start 2022-01-01 --end 2025-12-15 --project angata
|
||||
python download_planet_missing_dates.py --start 2026-01-17 --end 2026-12-20 --project angata
|
||||
python download_planet_missing_dates.py --start 2023-06-01 --end 2023-06-30 --project angata --dry-run
|
||||
|
||||
Environment variables (alternative to CLI args):
|
||||
|
|
|
|||
|
|
@ -184,18 +184,47 @@ def compute_ci_features(ci_series: pd.Series, doy_series: pd.Series = None) -> p
|
|||
return features.fillna(0)
|
||||
|
||||
|
||||
def extract_features(data_df: pd.DataFrame, feature_names: List[str], ci_column: str = 'FitData',
                     season_anchor_day: int = None, lookback_start: int = 0) -> np.ndarray:
    """
    Extract and return specified features as numpy array.

    Args:
        data_df: DataFrame with Date and CI data (may be a window after a harvest)
        feature_names: List of feature names to extract
        ci_column: Name of CI column
        season_anchor_day: Day in FULL sequence where this season started (for DOY reset)
            DOY will be recalculated as: 1, 2, 3, ... from this point
        lookback_start: Starting index in original full data (for season reset calculation)

    Returns:
        NumPy array of shape (len(data_df), len(feature_names))

    Raises:
        ValueError: when none of the requested feature names are among the
            columns produced by compute_ci_features().
    """
    # Compute all CI features
    ci_series = data_df[ci_column].astype(float)

    # Compute DOY (age/days since season start) - NOT day-of-year!
    # DOY is a continuous counter: 1, 2, 3, ..., 475 (doesn't cycle at 365)
    # It only resets to 1 after a harvest is detected (new season)
    doy_series = None
    if 'DOY_normalized' in feature_names:
        if season_anchor_day is not None and lookback_start >= season_anchor_day:
            # Season was reset after harvest. Recalculate DOY as simple counter from 1.
            # This is a window starting at or after harvest, so DOY should be: 1, 2, 3, ...
            doy_series = pd.Series(np.arange(1, len(data_df) + 1), index=data_df.index)
        elif 'DOY' in data_df.columns:
            # Use DOY directly from CSV - already calculated as continuous age counter
            doy_series = pd.Series(data_df['DOY'].astype(float).values, index=data_df.index)
        else:
            # Fallback: create continuous age counter (1, 2, 3, ...)
            doy_series = pd.Series(np.arange(1, len(data_df) + 1), index=data_df.index)

    all_features = compute_ci_features(ci_series, doy_series)

    # Select requested features; keep the order given by feature_names
    requested = [f for f in feature_names if f in all_features.columns]
    if not requested:
        raise ValueError(f"No valid features found. Requested: {feature_names}, Available: {all_features.columns.tolist()}")

    return all_features[requested].values
|
||||
|
||||
|
|
@ -274,59 +303,64 @@ def load_harvest_data(data_file: Path) -> pd.DataFrame:
|
|||
def run_phase1_growing_window(field_data, model, config, scalers, ci_column, device,
|
||||
threshold=0.45, consecutive_days=2):
|
||||
"""
|
||||
Phase 1: Growing window detection with threshold crossing.
|
||||
Expand window day-by-day, check last timestep's detected_prob.
|
||||
When N consecutive days have prob > threshold, harvest detected.
|
||||
Phase 1: Growing window detection with DOY season reset.
|
||||
|
||||
Args:
|
||||
threshold (float): Probability threshold (default 0.45, tuned for Model 307)
|
||||
consecutive_days (int): Required consecutive days above threshold (default 2, reduced from 3 for robustness)
|
||||
|
||||
Returns list of (harvest_date, harvest_idx) tuples.
|
||||
For each detected harvest, reset DOY counter for the next season.
|
||||
This allows the model to detect multiple consecutive harvests in multi-year data.
|
||||
"""
|
||||
harvest_dates = []
|
||||
season_anchor_day = 0
|
||||
current_pos = 0
|
||||
|
||||
while current_pos < len(field_data):
|
||||
consecutive_above_threshold = 0
|
||||
min_window_size = 120
|
||||
|
||||
for window_end in range(current_pos + 1, len(field_data) + 1):
|
||||
window_data = field_data.iloc[current_pos:window_end].copy().reset_index(drop=True)
|
||||
|
||||
try:
|
||||
features = extract_features(window_data, config['features'], ci_column=ci_column)
|
||||
if len(window_data) < min_window_size:
|
||||
continue
|
||||
|
||||
# Apply scalers
|
||||
try:
|
||||
reset_doy = current_pos > season_anchor_day
|
||||
|
||||
features = extract_features(
|
||||
window_data,
|
||||
config['features'],
|
||||
ci_column=ci_column,
|
||||
season_anchor_day=season_anchor_day if reset_doy else None,
|
||||
lookback_start=current_pos
|
||||
)
|
||||
|
||||
features_scaled = features.copy().astype(float)
|
||||
for fi, scaler in enumerate(scalers):
|
||||
try:
|
||||
features[:, fi] = scaler.transform(features[:, fi].reshape(-1, 1)).flatten()
|
||||
features_scaled[:, fi] = scaler.transform(features[:, fi].reshape(-1, 1)).flatten()
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
# Run model
|
||||
with torch.no_grad():
|
||||
x_tensor = torch.tensor(features, dtype=torch.float32).unsqueeze(0).to(device)
|
||||
x_tensor = torch.tensor(features_scaled, dtype=torch.float32).unsqueeze(0).to(device)
|
||||
imminent_probs, detected_probs = model(x_tensor)
|
||||
detected_probs = detected_probs.squeeze(0).cpu().numpy()
|
||||
|
||||
# Check LAST timestep
|
||||
last_prob = detected_probs[-1]
|
||||
last_prob = detected_probs[0, -1].item()
|
||||
|
||||
if last_prob > threshold:
|
||||
consecutive_above_threshold += 1
|
||||
else:
|
||||
consecutive_above_threshold = 0
|
||||
|
||||
# Harvest detected: N consecutive days above threshold
|
||||
if consecutive_above_threshold >= consecutive_days:
|
||||
harvest_date = field_data.iloc[current_pos + window_end - consecutive_days]['Date']
|
||||
harvest_dates.append((harvest_date, current_pos + window_end - consecutive_days))
|
||||
harvest_idx = current_pos + window_end - consecutive_days
|
||||
harvest_date = field_data.iloc[harvest_idx]['Date']
|
||||
|
||||
# Reset to next day after harvest
|
||||
current_pos = current_pos + window_end - consecutive_days + 1
|
||||
harvest_dates.append((harvest_date, harvest_idx))
|
||||
season_anchor_day = harvest_idx + 1
|
||||
current_pos = harvest_idx + 1
|
||||
break
|
||||
|
||||
except Exception:
|
||||
except Exception as e:
|
||||
continue
|
||||
else:
|
||||
break
|
||||
|
|
@ -359,8 +393,21 @@ def run_phase2_refinement(field_data, phase1_harvests, model, config, scalers, c
|
|||
window_start_date = season_start_date - pd.Timedelta(days=40)
|
||||
window_end_date = phase1_harvest_date + pd.Timedelta(days=40)
|
||||
|
||||
window_start_idx = max(0, (field_data['Date'] >= window_start_date).idxmax() if (field_data['Date'] >= window_start_date).any() else 0)
|
||||
window_end_idx = min(len(field_data), (field_data['Date'] <= window_end_date).idxmax() + 1 if (field_data['Date'] <= window_end_date).any() else len(field_data))
|
||||
# FIXED: Use proper index selection
|
||||
mask_start = field_data['Date'] >= window_start_date
|
||||
mask_end = field_data['Date'] <= window_end_date
|
||||
|
||||
if mask_start.any():
|
||||
window_start_idx = mask_start.idxmax() # First True index
|
||||
else:
|
||||
window_start_idx = 0
|
||||
|
||||
if mask_end.any():
|
||||
# Last True index: find where condition becomes False from the right
|
||||
true_indices = np.where(mask_end)[0]
|
||||
window_end_idx = true_indices[-1] + 1 # +1 for slicing (exclusive end)
|
||||
else:
|
||||
window_end_idx = len(field_data)
|
||||
|
||||
if window_end_idx <= window_start_idx:
|
||||
refined_harvests.append((phase1_harvest_date, phase1_idx))
|
||||
|
|
@ -414,7 +461,9 @@ def run_two_step_refinement(df: pd.DataFrame, model, config, scalers, device=Non
|
|||
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
||||
|
||||
results = []
|
||||
ci_column = config['data']['ci_column']
|
||||
# CI column is 'FitData' (interpolated CI) - NOT 'value' (raw with NAs)
|
||||
# 'FitData' is already gap-filled by R stage 03, ready for ML
|
||||
ci_column = 'FitData'
|
||||
|
||||
# Group by field and count total fields for progress
|
||||
field_groups = list(df.groupby('field'))
|
||||
|
|
@ -469,6 +518,10 @@ def run_two_step_refinement(df: pd.DataFrame, model, config, scalers, device=Non
|
|||
|
||||
print() # New line after progress bar
|
||||
print(f" ✓ Complete: Found {harvests_found} harvest events across {total_fields} fields")
|
||||
if results:
|
||||
print(f" Sample harvest dates: {results[0]['phase2_harvest_date']}")
|
||||
if len(results) > 1:
|
||||
print(f" {results[-1]['phase2_harvest_date']}")
|
||||
|
||||
return results
|
||||
|
||||
|
|
|
|||
|
|
@ -0,0 +1,299 @@
|
|||
"""
|
||||
Batch Field Visualization Tool - RGB Imagery Around Harvest Date
|
||||
Purpose: Generate visual validation using RGB satellite imagery samples around
|
||||
predicted harvest date to verify predictions (bare soil = harvested, green = not harvested)
|
||||
|
||||
Shows 12-15 RGB images in a grid, centered around the predicted harvest date
|
||||
|
||||
Usage:
|
||||
python batch_plot_fields_rgb.py field1,field2,field3
|
||||
python batch_plot_fields_rgb.py 10125,88,97
|
||||
|
||||
Or read from CSV:
|
||||
python batch_plot_fields_rgb.py --file fields_to_check.csv
|
||||
"""
|
||||
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
import torch
|
||||
import matplotlib.pyplot as plt
|
||||
import matplotlib.dates as mdates
|
||||
from pathlib import Path
|
||||
from datetime import datetime, timedelta
|
||||
import sys
|
||||
import rasterio
|
||||
from rasterio.mask import mask
|
||||
import geopandas as gpd
|
||||
from harvest_date_pred_utils import load_model_and_config, extract_features
|
||||
|
||||
|
||||
def get_field_centroid(field_id, geojson_path="pivot.geojson"):
    """
    Return the (x, y) centroid of a field's geometry from the GeoJSON file.

    Matches on the 'field' attribute (compared as strings). Returns None when
    the field is absent or the file cannot be read; failures only emit a
    warning, they never raise.
    """
    try:
        boundaries = gpd.read_file(geojson_path)
        matches = boundaries[boundaries['field'] == str(field_id)]
        if len(matches) > 0:
            center = matches.geometry.iloc[0].centroid
            return (center.x, center.y)
    except Exception as e:
        print(f" Warning: Could not get field centroid - {e}")
    return None
|
||||
|
||||
|
||||
def load_rgb_image(tif_path, field_id=None, geojson_path="pivot.geojson"):
    """
    Read the red/green/blue channels of an 8-band GeoTIFF as an HxWx3 array.

    Reads rasterio bands 6, 3 and 2 (1-indexed) as red, green and blue.
    NOTE(review): the original header listed the 0-indexed band layout as
    0=coastal, 1=blue, 2=green, 3=green_i, 4=yellow, 5=red, 6=rededge, 7=nir,
    which matches the bands read here, but also said "RGB = bands 5,3,1" —
    confirm the intended green band against the sensor spec.

    Values are divided by 5000 and clipped to [0, 1]; presumably the raw data
    spans roughly 0-10000, so this brightens the display — verify against the
    product documentation.

    Returns the float RGB array, or None on any failure (with an error
    printed). The field_id/geojson_path parameters are accepted for
    compatibility but unused.
    """
    try:
        with rasterio.open(tif_path) as dataset:
            # rasterio band numbers (1-indexed) for red, green, blue
            channels = [dataset.read(band) for band in (6, 3, 2)]
        composite = np.stack(channels, axis=2)
        return np.clip(composite / 5000.0, 0, 1)
    except Exception as e:
        print(f" Error loading RGB from {tif_path}: {e}")
        return None
|
||||
|
||||
|
||||
def plot_field_rgb_validation(field_id, ci_data, model, config, scalers, device,
                              tif_folder="../../../laravel_app/storage/app/angata/merged_tif_8b",
                              output_dir="validation_plots_rgb"):
    """
    Create a validation plot for a single field and save it as a PNG.

    Layout:
    - Top: imminent-harvest probability over time, with the peak marked
    - Bottom: grid (max 5 columns) of RGB images from ±6 days around the
      peak-probability (predicted harvest) date

    Args:
        field_id: Field identifier, matched against ci_data['field']
        ci_data: DataFrame with at least 'field', 'Date' and the CI column
            named by config['data']['ci_column']
        model: PyTorch model returning (imminent_probs, detected_probs)
        config: Model config dict; reads config['data']['ci_column'] and
            config['features']
        scalers: Only applied when it is a dict with a 'features' entry.
            NOTE(review): other call sites appear to pass a list of
            per-feature scalers — confirm which format this expects.
        device: torch device used for inference
        tif_folder: Directory of daily TIFFs named YYYY-MM-DD.tif
        output_dir: Directory for the output PNG (created if missing)

    Returns:
        bool: True when a plot was saved, False on any failure (errors are
        printed, never raised).
    """
    # Create output directory
    Path(output_dir).mkdir(parents=True, exist_ok=True)

    # Filter field data
    field_data = ci_data[ci_data['field'] == field_id].copy()
    if len(field_data) == 0:
        print(f" ✗ Field {field_id}: No CI data found")
        return False

    field_data = field_data.sort_values('Date')
    print(f" ✓ Field {field_id}: {len(field_data)} days of data")

    try:
        # Extract features and run inference
        ci_column = config['data']['ci_column']
        feature_names = config['features']

        feat_array = extract_features(field_data, feature_names, ci_column=ci_column)
        if feat_array is None:
            print(f" ✗ Field {field_id}: Feature extraction failed")
            return False

        # Apply scalers (only the dict-with-'features' form is handled here)
        if isinstance(scalers, dict) and 'features' in scalers:
            feat_array = scalers['features'].transform(feat_array)

        # Run inference; only the imminent head is used for the plot
        with torch.no_grad():
            x_tensor = torch.tensor(feat_array, dtype=torch.float32).unsqueeze(0).to(device)
            out_imm, out_det = model(x_tensor)
            imm_probs = out_imm.squeeze(0).cpu().numpy()

        # Find peak probability (predicted harvest date)
        peak_idx = np.argmax(imm_probs)
        peak_date = field_data['Date'].iloc[peak_idx]
        peak_prob = imm_probs[peak_idx]

        print(f" Peak probability: {peak_prob:.3f} on {peak_date.strftime('%Y-%m-%d')}")

        # Get date range: ±6 days around peak (12-13 images total)
        date_range = field_data['Date'].dt.date  # NOTE(review): unused
        peak_date_only = peak_date.date() if hasattr(peak_date, 'date') else peak_date

        days_before = 6
        days_after = 6
        start_date = peak_date_only - timedelta(days=days_before)
        end_date = peak_date_only + timedelta(days=days_after)

        # Find available TIF files in date range; filenames must parse as
        # YYYY-MM-DD, anything else is skipped silently
        tif_folder_path = Path(tif_folder)
        available_dates = []
        for tif_file in sorted(tif_folder_path.glob("*.tif")):
            date_str = tif_file.stem  # YYYY-MM-DD
            try:
                tif_date = datetime.strptime(date_str, "%Y-%m-%d").date()
                if start_date <= tif_date <= end_date:
                    available_dates.append((tif_date, tif_file))
            except ValueError:
                pass

        if len(available_dates) == 0:
            print(f" Warning: No TIF files found in {start_date} to {end_date}")
            return False

        print(f" Found {len(available_dates)} RGB images in date range")

        # Load RGB images (load failures are dropped, not fatal)
        rgb_images = []
        rgb_dates = []
        for tif_date, tif_file in available_dates:
            rgb = load_rgb_image(str(tif_file), field_id)
            if rgb is not None:
                rgb_images.append(rgb)
                rgb_dates.append(tif_date)

        if len(rgb_images) == 0:
            print(f" ✗ No RGB images loaded")
            return False

        print(f" Loaded {len(rgb_images)} RGB images")

        # Create figure with probability plot + RGB grid
        n_images = len(rgb_images)
        n_cols = min(5, n_images)  # Max 5 columns
        n_rows = (n_images + n_cols - 1) // n_cols  # Calculate rows needed

        fig = plt.figure(figsize=(18, 12))

        # Probability plot (top, spanning all columns of the grid)
        ax_prob = plt.subplot(n_rows + 1, n_cols, (1, n_cols))
        dates_arr = field_data['Date'].values
        ax_prob.plot(dates_arr, imm_probs, '-', color='orange', linewidth=2.5, label='Imminent Probability', alpha=0.8)
        ax_prob.axhline(y=0.5, color='red', linestyle='--', linewidth=1.5, alpha=0.5, label='Threshold (0.5)')
        ax_prob.axvline(x=peak_date, color='darkred', linestyle=':', linewidth=2, alpha=0.7, label='Peak')
        ax_prob.fill_between(dates_arr, 0.5, 1.0, alpha=0.08, color='red')
        ax_prob.set_ylim(-0.05, 1.05)
        ax_prob.set_ylabel('Probability', fontsize=11, fontweight='bold')
        ax_prob.set_xlabel('Date', fontsize=11, fontweight='bold')
        ax_prob.set_title(f'Field {field_id} - Model 307 Harvest Probability', fontsize=12, fontweight='bold')
        ax_prob.grid(True, alpha=0.3)
        ax_prob.legend(loc='upper right', fontsize=9)
        ax_prob.xaxis.set_major_locator(mdates.MonthLocator(interval=1))
        ax_prob.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m-%d'))
        plt.setp(ax_prob.xaxis.get_majorticklabels(), rotation=45, ha='right')

        # Annotate peak with its probability and date
        ax_prob.annotate(f'{peak_prob:.2f}\n{peak_date_only}',
                         xy=(peak_date, peak_prob),
                         xytext=(20, 20), textcoords='offset points',
                         bbox=dict(boxstyle='round,pad=0.5', fc='yellow', alpha=0.8),
                         arrowprops=dict(arrowstyle='->', lw=1.5, color='darkred'))

        # RGB images in grid (below probability plot)
        for i, (rgb, date) in enumerate(zip(rgb_images, rgb_dates)):
            ax = plt.subplot(n_rows + 1, n_cols, n_cols + i + 1)
            ax.imshow(rgb, extent=[0, 100, 0, 100])

            # Highlight peak date (red bold title)
            date_label = date.strftime('%m-%d')
            is_peak = date == peak_date_only
            color = 'darkred' if is_peak else 'black'
            weight = 'bold' if is_peak else 'normal'
            size = 11 if is_peak else 9

            ax.set_title(date_label, fontsize=size, fontweight=weight, color=color)
            ax.set_xticks([])
            ax.set_yticks([])

        plt.suptitle(f'Field {field_id} RGB Imagery: {len(rgb_images)} Days Around Peak Harvest Probability\nPeak: {peak_prob:.2f} on {peak_date_only} | Green = Growing | Brown/Bare = Harvested',
                     fontsize=13, fontweight='bold', y=0.995)
        plt.tight_layout()

        # Save
        output_file = Path(output_dir) / f"field_{field_id}_rgb_validation.png"
        plt.savefig(output_file, dpi=100, bbox_inches='tight')
        print(f" ✓ Saved: {output_file}")
        plt.close()

        return True

    except Exception as e:
        # Broad catch by design: one bad field should not abort the batch
        print(f" ✗ Field {field_id}: Error - {e}")
        import traceback
        traceback.print_exc()
        return False
|
||||
|
||||
|
||||
def main():
    """CLI entry point: resolve the field list, load data and model, plot each field."""
    print("="*80)
    print("BATCH RGB VISUALIZATION TOOL")
    print("Visual check: RGB imagery around predicted harvest date")
    print("="*80)

    argv = sys.argv

    # No arguments at all: print usage and stop.
    if len(argv) < 2:
        print("\nUsage:")
        print(" python batch_plot_fields_rgb.py field1,field2,field3")
        print(" python batch_plot_fields_rgb.py --file fields.csv")
        print("\nExample:")
        print(" python batch_plot_fields_rgb.py 10125,88,97,440")
        return

    # Resolve the field list either from a CSV or a comma-separated argument.
    if argv[1] == "--file":
        if len(argv) < 3:
            print("ERROR: --file requires a CSV filename")
            return
        csv_file = argv[2]
        print(f"\n[1/4] Loading fields from CSV: {csv_file}")
        try:
            field_table = pd.read_csv(csv_file)
            fields_to_plot = field_table['field'].astype(str).str.strip().tolist()
            print(f" ✓ Loaded {len(fields_to_plot)} fields")
        except Exception as e:
            print(f" ✗ Error reading CSV: {e}")
            return
    else:
        fields_to_plot = [token.strip() for token in argv[1].split(',')]
        print(f"\n[1/4] Processing {len(fields_to_plot)} field(s): {', '.join(fields_to_plot)}")

    # Load the CI time series exported by the R pipeline.
    print("\n[2/4] Loading CI data...")
    try:
        ci_data = pd.read_csv("ci_data_for_python.csv")
        ci_data['Date'] = pd.to_datetime(ci_data['Date'])
        ci_data['field'] = ci_data['field'].astype(str).str.strip()
        print(f" ✓ Loaded {len(ci_data)} observations for {ci_data['field'].nunique()} fields")
    except Exception as e:
        print(f" ✗ Error loading CI data: {e}")
        return

    # Load model, config and scalers from the current directory.
    print("\n[3/4] Loading model...")
    try:
        model, config, scalers = load_model_and_config(Path("."))
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        model.eval()
        print(f" ✓ Model loaded on {device}")
    except Exception as e:
        print(f" ✗ Error loading model: {e}")
        return

    # Plot each requested field; count the ones that produced a PNG.
    print("\n[4/4] Generating RGB validation plots...")
    success_count = sum(
        1 for fid in fields_to_plot
        if plot_field_rgb_validation(fid, ci_data, model, config, scalers, device)
    )

    # Summary
    print("\n" + "="*80)
    print(f"SUMMARY: {success_count}/{len(fields_to_plot)} fields processed successfully")
    print(f"Output directory: validation_plots_rgb/")
    print("="*80)
    print("\nInspect the PNG files to verify predictions:")
    print(" ✓ Green imagery BEFORE peak date (field growing)")
    print(" ✓ Brown/Bare imagery AT/AFTER peak date (harvested)")
    print(" ✓ Peak date marked with red title")
|
||||
|
|
@ -1,485 +0,0 @@
|
|||
#!/usr/bin/env python
|
||||
"""
|
||||
RGB Visualization Tool for Harvest Date Validation
|
||||
|
||||
Creates 3x3 temporal grids showing satellite imagery around registered and predicted harvest dates.
|
||||
Extracts RGB from 8-band Planet scope data and clips to field boundaries from GeoJSON.
|
||||
|
||||
Functions:
|
||||
- load_field_boundaries(): Load field geometries from GeoJSON
|
||||
- find_closest_tiff(): Find available TIFF file closest to target date
|
||||
- load_and_clip_tiff_rgb(): Load TIFF, extract RGB, clip to field boundary
|
||||
- create_temporal_grid(): Create 3x3 grid (4 pre-harvest, 1 near, 2-3 post-harvest)
|
||||
- generate_rgb_grids(): Main orchestration function
|
||||
|
||||
Usage:
|
||||
from rgb_visualization import generate_rgb_grids
|
||||
generate_rgb_grids(field_data, field_id, registered_harvest_dates, predicted_harvest_dates, output_dir, tiff_dir, geojson_path)
|
||||
"""
|
||||
|
||||
import json
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
from pathlib import Path
|
||||
from datetime import datetime, timedelta
|
||||
import matplotlib.pyplot as plt
|
||||
import matplotlib.patches as patches
|
||||
from matplotlib.colors import Normalize
|
||||
|
||||
import warnings
|
||||
warnings.filterwarnings('ignore')
|
||||
|
||||
try:
|
||||
import rasterio
|
||||
from rasterio.mask import mask
|
||||
import shapely.geometry as shgeom
|
||||
except ImportError:
|
||||
print("Warning: rasterio not available. RGB visualization will be skipped.")
|
||||
rasterio = None
|
||||
|
||||
|
||||
def load_field_boundaries(geojson_path, field_id):
    """
    Load a single field boundary from a GeoJSON file.

    Args:
        geojson_path (Path): Path to pivot.geojson
        field_id (str): Field identifier (e.g., "13973")

    Returns:
        tuple: (GeoJSON feature dict, shapely Polygon) for the first feature
        whose 'field' or 'sub_field' property matches field_id, or
        (None, None) when no match exists or the file cannot be read.
        For MultiPolygon geometries only the first polygon's exterior ring
        is used; for Polygon geometries, the exterior ring.
    """
    try:
        with open(geojson_path) as fh:
            collection = json.load(fh)

        wanted = str(field_id)
        for feature in collection.get('features', []):
            props = feature.get('properties', {})
            # A field may be keyed by either 'field' or 'sub_field'.
            if wanted not in (str(props.get('field', '')), str(props.get('sub_field', ''))):
                continue

            geometry = feature.get('geometry')
            if not geometry:
                continue
            coordinates = geometry.get('coordinates', [])
            if not coordinates:
                continue

            geom_type = geometry.get('type', '')
            if geom_type == 'MultiPolygon':
                # First polygon of the multipolygon, exterior ring only
                return feature, shgeom.Polygon(coordinates[0][0])
            if geom_type == 'Polygon':
                # Exterior ring
                return feature, shgeom.Polygon(coordinates[0])
            # Other geometry types: keep scanning remaining features.

        print(f" ⚠ Field {field_id} not found in GeoJSON")
        return None, None
    except Exception as e:
        print(f" ✗ Error loading GeoJSON: {e}")
        return None, None
|
||||
|
||||
|
||||
def find_closest_tiff(target_date, tiff_dir, days_window=60, field_boundary=None):
    """
    Find the available TIFF file whose date is closest to target_date.
    Skips obviously empty files (< 12 MB) without reading pixel data.

    TIFF files are named: YYYY-MM-DD.tif

    Args:
        target_date (pd.Timestamp): Target date to find TIFF near
        tiff_dir (Path): Directory containing TIFF files
        days_window (int): Max days to search before/after target
        field_boundary (shapely.Polygon): Unused, kept for compatibility

    Returns:
        Path: Path to closest TIFF file or None if not found
        int: Days difference from target (negative=before, positive=after)
    """
    target_date = pd.Timestamp(target_date)
    tiff_dir = Path(tiff_dir)

    available_tiffs = list(tiff_dir.glob('*.tif'))
    if not available_tiffs:
        return None, None

    # Parse dates from filenames, skip obviously empty files by size.
    tiff_dates = []
    min_size_mb = 12.0  # Empty files are ~11.56 MB, valid files are ~12.6-13.0 MB

    for tiff_path in available_tiffs:
        try:
            # Quick size check to skip obviously empty files
            file_size_mb = tiff_path.stat().st_size / (1024 * 1024)
            if file_size_mb < min_size_mb:
                continue

            date_str = tiff_path.stem  # Remove .tif extension
            tiff_date = pd.Timestamp(date_str)
            days_diff = (tiff_date - target_date).days
            if abs(days_diff) <= days_window:
                tiff_dates.append((tiff_path, tiff_date, days_diff))
        except (OSError, ValueError):
            # OSError: file vanished / unreadable stat.
            # ValueError: filename stem is not a parseable date.
            # (Was a bare `except:`, which also swallowed KeyboardInterrupt.)
            continue

    if not tiff_dates:
        return None, None

    # Return closest by absolute day distance
    closest = min(tiff_dates, key=lambda x: abs(x[2]))
    return closest[0], closest[2]
|
||||
|
||||
|
||||
def load_and_clip_tiff_rgb(tiff_path, field_boundary, rgb_bands=(1, 2, 3)):
    """
    Load TIFF and extract RGB bands clipped to field boundary.

    For merged_final_tif files (cloud-masked and filtered):
    - Band 1: Red
    - Band 2: Green
    - Band 3: Blue
    - Band 4: NIR
    - Band 5: CI

    For older merged_tif_8b files (raw Planet Scope 8-band):
    - Bands 1=Coastal Blue, 2=Blue, 3=Green, 4=Red, 5=Red Edge, 6=NIR, 7=SWIR1, 8=SWIR2
    - RGB would be bands 4,3,2 = Red, Green, Blue

    This function auto-detects the format based on band count and descriptions.

    Args:
        tiff_path (Path): Path to TIFF file
        field_boundary (shapely.Polygon): Field boundary for clipping
        rgb_bands (tuple): Band indices for RGB (1-indexed, defaults to 1,2,3 for merged_final_tif)

    Returns:
        np.ndarray: RGB data (height, width, 3) with values 0-1
        or None if error occurs
    """
    # Bail out if rasterio failed to import or there is nothing to clip to.
    if rasterio is None or field_boundary is None:
        return None

    try:
        with rasterio.open(tiff_path) as src:
            # Need at least 3 bands to build an RGB composite.
            if src.count < 3:
                print(f"  ⚠ TIFF has only {src.count} bands (need at least 3 for RGB)")
                return None

            # Auto-detect format based on band count and descriptions
            if src.count == 5 and hasattr(src, 'descriptions') and src.descriptions:
                # merged_final_tif format: Red, Green, Blue, NIR, CI
                bands_to_read = (1, 2, 3)
            elif src.count == 9:
                # merged_tif_8b format: use bands 4, 3, 2 for Red, Green, Blue
                # NOTE(review): the comment says "8-band" but the check is for
                # 9 bands - possibly an extra alpha/UDM band; confirm against
                # the actual merged_tif_8b products.
                bands_to_read = (4, 3, 2)
            else:
                # Default: try to use first 3 bands or specified bands
                bands_to_read = rgb_bands
                print(f"  ℹ Unknown TIFF format ({src.count} bands), using bands {bands_to_read}")

            # Mask and read bands (crop=True reads only the clipped window, not full resolution)
            # NOTE(review): `mask` is presumably rasterio.mask.mask imported
            # earlier in this file (import not visible in this chunk) - verify.
            geom = shgeom.mapping(field_boundary)
            try:
                # Read RGB bands at once, then mask (more efficient than masking 3 times)
                masked_data, _ = mask(src, [geom], crop=True, indexes=list(bands_to_read))

                # masked_data shape is (3, height, width) - one layer per band
                rgb_data = []
                for i, band_idx in enumerate(bands_to_read):
                    band_data = masked_data[i]  # Extract the i-th band from masked stack

                    # Better debug output that handles NaN values
                    valid_data = band_data[~np.isnan(band_data)]
                    if len(valid_data) > 0:
                        print(f"   DEBUG: Band {band_idx} valid data range: {valid_data.min():.4f} - {valid_data.max():.4f} ({len(valid_data)} valid pixels)")
                    else:
                        print(f"   DEBUG: Band {band_idx} - all NaN!")
                    rgb_data.append(band_data)

                # Stack RGB into (height, width, 3)
                rgb = np.stack(rgb_data, axis=-1)

                # Data is already normalized 0-1 from the merged_final_tif files
                # Just ensure it's float32 and clipped
                rgb = rgb.astype(np.float32)
                rgb = np.clip(rgb, 0, 1)

                print(f"   DEBUG: Final RGB range: {rgb.min():.4f} - {rgb.max():.4f}")

                return rgb
            except ValueError as e:
                # rasterio.mask raises ValueError when the geometry does not
                # overlap the raster extent.
                print(f"  ⚠ Error clipping to field boundary: {e}")
                return None
    except Exception as e:
        # Best-effort: unreadable/corrupt TIFFs are reported and skipped.
        print(f"  ✗ Error loading TIFF {tiff_path.name}: {e}")
        return None
|
||||
|
||||
|
||||
def create_temporal_rgb_grid(harvest_date, field_data, field_id, tiff_dir, field_boundary,
                             title, output_dir, harvest_type='registered', model_name=None, harvest_index=None):
    """
    Create a 3-row x 5-column temporal grid around the harvest date
    (15 images at 7-day intervals).

    Layout:
    Row 1: T-56d, T-49d, T-42d, T-35d, T-28d   (pre-harvest)
    Row 2: T-21d, T-14d, T-7d,  T~0d,  T+7d    (near harvest)
    Row 3: T+14d, T+21d, T+28d, T+35d, T+42d   (post-harvest progression)

    Args:
        harvest_date (pd.Timestamp): Target harvest date
        field_data (pd.DataFrame): Field data with Date column (unused here,
            kept for interface compatibility with callers)
        field_id (str): Field identifier
        tiff_dir (Path): Directory with TIFF files
        field_boundary (shapely.Polygon): Field boundary
        title (str): Plot title
        output_dir (Path): Output directory
        harvest_type (str): 'registered' or 'predicted'
        model_name (str): Model name for predicted harvests (e.g., 'Original', 'Long-Season')
        harvest_index (int): Index of harvest within same model (for multiple harvests)

    Returns:
        Path: Path to saved PNG or None if failed
    """
    harvest_date = pd.Timestamp(harvest_date)

    # Target dates: exactly 15 images at 7-day intervals, T-56d .. T+42d,
    # matching the 15 grid cells below. (A 16th T+56d entry used to be
    # loaded here but was never displayed by the 15-cell grid - removed
    # to avoid a wasted TIFF read.)
    target_dates = [harvest_date + timedelta(days=offset) for offset in range(-56, 43, 7)]

    # Find TIFFs for each date
    rgb_images = []
    days_offsets = []
    actual_dates = []  # Store actual dates of TIFFs found

    for target in target_dates:
        tiff_path, days_diff = find_closest_tiff(target, tiff_dir, days_window=60, field_boundary=field_boundary)

        if tiff_path is None:
            # Keep the lists index-aligned with target_dates.
            rgb_images.append(None)
            days_offsets.append(None)
            actual_dates.append(None)
            print(f"  ⚠ No TIFF found within 60 days of {target.strftime('%Y-%m-%d')} with sufficient data")
            continue

        # Extract date from filename: YYYY-MM-DD.tif
        tiff_date = pd.to_datetime(tiff_path.stem)

        rgb = load_and_clip_tiff_rgb(tiff_path, field_boundary)
        rgb_images.append(rgb)
        days_offsets.append(days_diff)
        actual_dates.append(tiff_date)

        if rgb is not None:
            print(f"  ✓ Loaded {tiff_path.name} ({days_diff:+d}d from target)")
        else:
            print(f"  ⚠ Loaded {tiff_path.name} but RGB data is None")

    # Create 3x5 grid plot (15 images)
    fig, axes = plt.subplots(3, 5, figsize=(25, 15))
    fig.suptitle(f'{title}\nField {field_id} - {harvest_type.upper()} Harvest: {harvest_date.strftime("%Y-%m-%d")}',
                 fontsize=16, fontweight='bold')

    # Grid positions (5 columns, 3 rows = 15 images), index-aligned with
    # target_dates above.
    positions = [
        ('T-56d', 0, 0), ('T-49d', 0, 1), ('T-42d', 0, 2), ('T-35d', 0, 3), ('T-28d', 0, 4),
        ('T-21d', 1, 0), ('T-14d', 1, 1), ('T-7d', 1, 2), ('T~0d', 1, 3), ('T+7d', 1, 4),
        ('T+14d', 2, 0), ('T+21d', 2, 1), ('T+28d', 2, 2), ('T+35d', 2, 3), ('T+42d', 2, 4),
    ]

    for idx, (label, row, col) in enumerate(positions):  # All 15 images
        ax = axes[row, col]

        if idx < len(rgb_images) and rgb_images[idx] is not None:
            rgb_data = rgb_images[idx]
            # Debug: check data range
            data_min, data_max = np.nanmin(rgb_data), np.nanmax(rgb_data)
            print(f"   DEBUG: {label} RGB range: {data_min:.4f} - {data_max:.4f}, shape: {rgb_data.shape}")

            # Display with explicit vmin/vmax to handle normalized 0-1 data
            ax.imshow(rgb_data, vmin=0, vmax=1)

            # Build title: label + offset + actual date
            offset_str = f"{days_offsets[idx]:+d}d" if days_offsets[idx] is not None else "?"
            date_str = actual_dates[idx].strftime('%Y-%m-%d') if actual_dates[idx] is not None else "No Date"
            ax.set_title(f'{label}\n{offset_str}\n{date_str}', fontsize=10, fontweight='bold')

            # Add red box around harvest date (T~0d at row=1, col=3)
            if label == 'T~0d':
                for spine in ax.spines.values():
                    spine.set_edgecolor('red')
                    spine.set_linewidth(4)
        else:
            ax.text(0.5, 0.5, 'No Data', ha='center', va='center', fontsize=12, color='gray')
            ax.set_title(label, fontsize=10)

            # Add red box for T~0d even if no data
            if label == 'T~0d':
                for spine in ax.spines.values():
                    spine.set_edgecolor('red')
                    spine.set_linewidth(4)

        ax.set_xticks([])
        ax.set_yticks([])

    plt.tight_layout()

    # Save figure with detailed naming: field_ID_harvestdate_model_harvesttype.png
    harvest_date_str = harvest_date.strftime('%Y%m%d')

    if harvest_type == 'registered':
        filename = f'field_{field_id}_{harvest_date_str}_registered_harvest_rgb.png'
    else:
        # For predicted: include model name and harvest index if multiple
        if harvest_index is not None and harvest_index > 0:
            filename = f'field_{field_id}_{harvest_date_str}_{model_name}_harvest{harvest_index}_rgb.png'
        else:
            filename = f'field_{field_id}_{harvest_date_str}_{model_name}_harvest_rgb.png'

    output_path = Path(output_dir) / filename
    plt.savefig(output_path, dpi=100, bbox_inches='tight')
    plt.close()

    # Bug fix: the previous f-string had no placeholder and printed a
    # literal "(unknown)" - report the actual saved filename.
    print(f"  ✓ Saved: {filename}")
    return output_path
|
||||
|
||||
|
||||
def generate_rgb_grids(field_data, field_id, registered_harvest_dates, predicted_harvest_dates,
                       output_dir, tiff_dir, geojson_path):
    """
    Main orchestration function for RGB visualization.

    Creates temporal RGB grids (via create_temporal_rgb_grid) for:
    1. Registered harvest dates (if available)
    2. Predicted harvest dates (if available)

    Args:
        field_data (pd.DataFrame): Field data with Date, CI columns
        field_id (str): Field identifier
        registered_harvest_dates (list): List of registered harvest dates (pd.Timestamp)
        predicted_harvest_dates (list): List of predicted harvest dates (dict or pd.Timestamp)
        output_dir (Path): Output directory for plots
        tiff_dir (Path): Directory containing TIFF files
        geojson_path (Path): Path to pivot.geojson

    Returns:
        dict: Summary of generated plots with keys 'registered' and 'predicted',
        each mapping to a list of saved PNG paths.
    """
    # rasterio is optional at import time; without it no imagery can be read.
    if rasterio is None:
        print("  ⚠ Rasterio not available - skipping RGB visualization")
        return {'registered': [], 'predicted': []}

    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)
    tiff_dir = Path(tiff_dir)
    geojson_path = Path(geojson_path)

    # Validate required inputs before doing any work.
    if not tiff_dir.exists():
        print(f"  ✗ TIFF directory not found: {tiff_dir}")
        return {'registered': [], 'predicted': []}

    if not geojson_path.exists():
        print(f"  ✗ GeoJSON not found: {geojson_path}")
        return {'registered': [], 'predicted': []}

    # Load field boundary
    print(f"  Loading field boundary for {field_id}...")
    feature, field_boundary = load_field_boundaries(geojson_path, field_id)

    if field_boundary is None:
        print(f"  ✗ Could not load field boundary for {field_id}")
        return {'registered': [], 'predicted': []}

    results = {'registered': [], 'predicted': []}

    # Process registered harvest dates
    if registered_harvest_dates and len(registered_harvest_dates) > 0:
        print(f"  Processing {len(registered_harvest_dates)} registered harvest dates...")
        for i, harvest_date in enumerate(registered_harvest_dates):
            # Skip missing dates; note the [i+1/total] counter still counts them.
            if pd.isna(harvest_date):
                continue

            print(f"    [{i+1}/{len(registered_harvest_dates)}] {harvest_date.strftime('%Y-%m-%d')}")
            output_path = create_temporal_rgb_grid(
                harvest_date, field_data, field_id, tiff_dir, field_boundary,
                title='Registered Harvest Validation',
                output_dir=output_dir,
                harvest_type='registered',
                model_name=None,
                harvest_index=i
            )
            if output_path:
                results['registered'].append(output_path)

    # Process predicted harvest dates - grouped by model
    if predicted_harvest_dates and len(predicted_harvest_dates) > 0:
        print(f"  Processing {len(predicted_harvest_dates)} predicted harvest dates...")

        # Group by model to track index per model (harvest_index drives the
        # "_harvestN_" suffix in the output filename).
        harvest_by_model = {}
        for harvest_info in predicted_harvest_dates:
            # Handle both dict and Timestamp formats
            if isinstance(harvest_info, dict):
                harvest_date = harvest_info.get('harvest_date')
                model_name = harvest_info.get('model_name', 'predicted')
            else:
                harvest_date = harvest_info
                model_name = 'predicted'

            if model_name not in harvest_by_model:
                harvest_by_model[model_name] = []
            harvest_by_model[model_name].append(harvest_date)

        # Process each model's harvests.
        # NOTE(review): overall_index only advances for non-NaN dates while the
        # denominator counts every entry, so the printed [n/total] can end
        # below total when NaNs are present - confirm this is acceptable.
        overall_index = 1
        for model_name, harvest_dates in harvest_by_model.items():
            for model_harvest_idx, harvest_date in enumerate(harvest_dates):
                if pd.isna(harvest_date):
                    continue

                print(f"    [{overall_index}/{len(predicted_harvest_dates)}] {harvest_date.strftime('%Y-%m-%d')} ({model_name})")
                output_path = create_temporal_rgb_grid(
                    harvest_date, field_data, field_id, tiff_dir, field_boundary,
                    title=f'Predicted Harvest Validation ({model_name})',
                    output_dir=output_dir,
                    harvest_type='predicted',
                    model_name=model_name,
                    harvest_index=model_harvest_idx
                )
                if output_path:
                    results['predicted'].append(output_path)
                overall_index += 1

    return results
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # This module has no standalone behavior; when run directly, just print
    # usage hints pointing callers at generate_rgb_grids().
    usage_messages = (
        "RGB Visualization Tool",
        "This module is intended to be imported and called from compare_307_models_production.py",
        "\nExample:",
        "  from rgb_visualization import generate_rgb_grids",
        "  generate_rgb_grids(field_data, field_id, registered_dates, predicted_dates, output_dir, tiff_dir, geojson_path)",
    )
    for message in usage_messages:
        print(message)
|
||||
|
|
@ -0,0 +1,175 @@
|
|||
"""
|
||||
BATCH TEST: Sample multiple fields to understand model output distribution
|
||||
Purpose: Determine optimal threshold and consecutive_days parameters
|
||||
|
||||
This runs the model on 10 random fields and summarizes the results,
|
||||
helping us decide what parameters to use in the production script.
|
||||
"""
|
||||
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
import torch
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from harvest_date_pred_utils import (
|
||||
load_model_and_config,
|
||||
extract_features,
|
||||
)
|
||||
|
||||
|
||||
def test_field(ci_data, field_id, model, config, scalers, device):
    """
    Test a single field and return summary statistics.

    Args:
        ci_data (pd.DataFrame): Full CI table with 'field' and 'Date' columns.
        field_id (str): Field identifier to evaluate.
        model (torch.nn.Module): Loaded harvest-detection model.
        config (dict): Model config; config['data']['ci_column'] names the CI
            column and config['features'] lists feature names.
        scalers (list): Fitted per-feature scalers, index-aligned with features.
        device (torch.device): Device to run inference on.

    Returns:
        dict or None: Probability summary (max/mean/median and longest
        consecutive run above several thresholds), or None when the field has
        no data or any processing step fails.
    """
    field_data = ci_data[ci_data['field'] == field_id].sort_values('Date').reset_index(drop=True)
    if len(field_data) == 0:
        return None

    try:
        ci_column = config['data']['ci_column']
        features = extract_features(field_data, config['features'], ci_column=ci_column)

        # Apply scalers (best-effort: a failing scaler leaves that column
        # unscaled). Narrowed from a bare `except:`, which also swallowed
        # KeyboardInterrupt/SystemExit.
        for fi, scaler in enumerate(scalers):
            try:
                features[:, fi] = scaler.transform(features[:, fi].reshape(-1, 1)).flatten()
            except Exception:
                pass

        # Run model on the whole sequence as a single batch
        features_tensor = torch.FloatTensor(features).unsqueeze(0).to(device)
        with torch.no_grad():
            output = model(features_tensor)

        # Some model variants return a tuple; the detection head is output[1].
        if isinstance(output, tuple):
            detected_probs = output[1].cpu().numpy().flatten()
        else:
            detected_probs = output.cpu().numpy().flatten()

        # Analyze the probability curve
        max_prob = detected_probs.max()
        mean_prob = detected_probs.mean()
        median_prob = np.median(detected_probs)

        # Longest consecutive run of days above each candidate threshold
        consecutive_above = {}
        for thresh in [0.2, 0.3, 0.4, 0.5]:
            above = (detected_probs > thresh).astype(int)
            # Zero-pad both ends so runs touching the edges are detected
            changes = np.diff(np.concatenate(([0], above, [0])))
            starts = np.where(changes == 1)[0]
            ends = np.where(changes == -1)[0]
            runs = ends - starts if len(starts) > 0 else []
            consecutive_above[thresh] = np.max(runs) if len(runs) > 0 else 0

        return {
            'field': field_id,
            'max_prob': max_prob,
            'mean_prob': mean_prob,
            'median_prob': median_prob,
            'consecutive_0.2': consecutive_above[0.2],
            'consecutive_0.3': consecutive_above[0.3],
            'consecutive_0.4': consecutive_above[0.4],
            'consecutive_0.5': consecutive_above[0.5],
            'num_days': len(field_data),
        }
    except Exception:
        # Batch sampling tool: any per-field failure just drops the field
        # (previously bound the exception to an unused `e`).
        return None
|
||||
|
||||
|
||||
def main():
    """
    Sample random fields, run the model on each, and print summary
    statistics plus a threshold/consecutive-days recommendation.

    CLI: python <script> [project_name] [num_samples]
    """
    project_name = sys.argv[1] if len(sys.argv) > 1 else "angata"
    num_samples = int(sys.argv[2]) if len(sys.argv) > 2 else 10

    # Load data
    base_storage = Path("../laravel_app/storage/app") / project_name / "Data"
    ci_data_dir = base_storage / "extracted_ci" / "ci_data_for_python"
    CI_DATA_FILE = ci_data_dir / "ci_data_for_python.csv"

    if not CI_DATA_FILE.exists():
        print(f"ERROR: {CI_DATA_FILE} not found")
        return

    print(f"Loading CI data from {CI_DATA_FILE}...")
    ci_data = pd.read_csv(CI_DATA_FILE, dtype={'field': str})
    ci_data['Date'] = pd.to_datetime(ci_data['Date'])

    # Get random sample of fields (fixed seed for reproducibility)
    all_fields = sorted(ci_data['field'].unique())
    np.random.seed(42)
    sample_fields = np.random.choice(all_fields, size=min(num_samples, len(all_fields)), replace=False)
    print(f"Testing {len(sample_fields)} random fields...")

    # Load model (load_model_and_config is imported at module level; the
    # previous redundant in-function import was removed)
    print("Loading model...")
    model, config, scalers = load_model_and_config(Path("."))
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Test each field
    results = []
    for idx, field_id in enumerate(sample_fields, 1):
        result = test_field(ci_data, field_id, model, config, scalers, device)
        if result:
            results.append(result)
            print(f"  [{idx:2d}/{len(sample_fields)}] Field {field_id:>6s}: "
                  f"max={result['max_prob']:.3f} mean={result['mean_prob']:.4f} "
                  f"consec@0.3={result['consecutive_0.3']:2d} consec@0.2={result['consecutive_0.2']:2d}")

    # Guard: pd.DataFrame([]) has no columns, so the summary below would
    # raise KeyError if every field failed.
    if not results:
        print("\nNo fields produced usable results - nothing to summarize.")
        return

    # Summary statistics
    results_df = pd.DataFrame(results)

    print(f"\n{'='*80}")
    print(f"SUMMARY STATISTICS ({len(results)} fields tested):")
    print(f"{'='*80}")
    print(f"\nMax probability (per field):")
    print(f"  Mean: {results_df['max_prob'].mean():.4f}")
    print(f"  Median: {results_df['max_prob'].median():.4f}")
    print(f"  Min: {results_df['max_prob'].min():.4f}")
    print(f"  Max: {results_df['max_prob'].max():.4f}")

    print(f"\nMean probability (per field):")
    print(f"  Mean: {results_df['mean_prob'].mean():.4f}")
    print(f"  Median: {results_df['mean_prob'].median():.4f}")

    print(f"\nConsecutive days above threshold=0.3:")
    print(f"  Mean: {results_df['consecutive_0.3'].mean():.1f}")
    print(f"  Median: {results_df['consecutive_0.3'].median():.1f}")
    print(f"  Min: {results_df['consecutive_0.3'].min():.0f}")
    print(f"  Max: {results_df['consecutive_0.3'].max():.0f}")

    print(f"\nConsecutive days above threshold=0.2:")
    print(f"  Mean: {results_df['consecutive_0.2'].mean():.1f}")
    print(f"  Median: {results_df['consecutive_0.2'].median():.1f}")
    print(f"  Min: {results_df['consecutive_0.2'].min():.0f}")
    print(f"  Max: {results_df['consecutive_0.2'].max():.0f}")

    print(f"\n{'='*80}")
    print(f"RECOMMENDATION:")
    print(f"{'='*80}")

    # Calculate recommendations based on data: prefer the strictest setting
    # that the median field still satisfies.
    median_consec_0_3 = results_df['consecutive_0.3'].median()
    median_consec_0_2 = results_df['consecutive_0.2'].median()

    if median_consec_0_3 >= 3:
        print(f"✓ Threshold=0.3 with consecutive_days=3 should work")
        print(f"  (median consecutive days: {median_consec_0_3:.0f})")
    elif median_consec_0_3 >= 2:
        print(f"✓ Threshold=0.3 with consecutive_days=2 recommended")
        print(f"  (median consecutive days: {median_consec_0_3:.0f})")
    elif median_consec_0_2 >= 3:
        print(f"✓ Threshold=0.2 with consecutive_days=3 recommended")
        print(f"  (median consecutive days: {median_consec_0_2:.0f})")
    else:
        print(f"✓ Threshold=0.2 with consecutive_days=2 recommended")
        print(f"  (median consecutive days @ 0.2: {median_consec_0_2:.0f})")

    print(f"\nCurrent production settings: threshold=0.45, consecutive_days=2")
    print(f"  → These are likely TOO STRICT (only 289 fields detected in batch run)")
    print(f"\nSuggested adjustment:")
    print(f"  → Lower threshold to 0.2-0.3")
    print(f"  → Reduce consecutive_days to 1-2")
    print(f"  → Re-run batch to get ~1000+ fields detected")
|
||||
|
||||
|
||||
# Script entry point: run the batch sampling when executed directly.
if __name__ == "__main__":
    main()
|
||||
|
|
@ -0,0 +1,65 @@
|
|||
"""
|
||||
Test DOY reset logic for harvest detection.
|
||||
Verify that DOY resets to 1, 2, 3, ... after harvest is detected.
|
||||
"""
|
||||
|
||||
import sys
|
||||
from pathlib import Path
|
||||
sys.path.insert(0, str(Path.cwd()))
|
||||
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
from harvest_date_pred_utils import extract_features, load_model_and_config
|
||||
import torch
|
||||
|
||||
# Load sample data. Consistency fix: sibling scripts all read the field
# column as str (dtype={'field': str}); without it pandas may parse the
# column as int and the `== '779'` filter below would match nothing.
ci_data = pd.read_csv('../laravel_app/storage/app/angata/Data/extracted_ci/ci_data_for_python/ci_data_for_python.csv',
                      dtype={'field': str})
ci_data['Date'] = pd.to_datetime(ci_data['Date'])

# Get field 779 data
field_779 = ci_data[ci_data['field'] == '779'].reset_index(drop=True)
print(f"Field 779: {len(field_779)} days of data")
print(f"Date range: {field_779['Date'].min().date()} to {field_779['Date'].max().date()}\n")

# Load model config (only config/feature names are needed here)
model, config, scalers = load_model_and_config(Path.cwd())

# Test 1: First season (season_anchor_day = 0)
print("=" * 80)
print("TEST 1: First season (season_anchor_day=0, lookback_start=0)")
print("=" * 80)
window = field_779.iloc[0:20].copy().reset_index(drop=True)
features = extract_features(window, config['features'], ci_column='FitData',
                            season_anchor_day=0, lookback_start=0)

# Locate the DOY_normalized feature column by name
feature_names = config['features']
doy_idx = feature_names.index('DOY_normalized') if 'DOY_normalized' in feature_names else -1
if doy_idx >= 0:
    # Denormalize: assumes DOY is normalized by a 450-day season scale -
    # TODO confirm against extract_features' normalization constant.
    doy_values = (features[:, doy_idx] * 450).astype(int)
    print(f"Window size: {len(window)} days")
    print(f"DOY values: {doy_values[:10]}")
    print("Expected: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]")
    assert list(doy_values[:10]) == [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], "DOY not incrementing correctly!"
    print("✓ DOY incrementing correctly for first season\n")

# Test 2: After harvest detected at day 100, next season starts
print("=" * 80)
print("TEST 2: After harvest at day 100, new season starts (season_anchor_day=101, lookback_start=101)")
print("=" * 80)
harvest_day = 100
window = field_779.iloc[harvest_day:harvest_day+20].copy().reset_index(drop=True)
features = extract_features(window, config['features'], ci_column='FitData',
                            season_anchor_day=harvest_day+1, lookback_start=harvest_day+1)

if doy_idx >= 0:
    doy_values = (features[:, doy_idx] * 450).astype(int)  # Denormalize (same 450-day scale)
    print(f"Window size: {len(window)} days (starting at day {harvest_day})")
    print(f"DOY values: {doy_values[:10]}")
    print("Expected: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10] (fresh season)")
    assert list(doy_values[:10]) == [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], "DOY not reset after harvest!"
    print("✓ DOY reset correctly for new season\n")

print("=" * 80)
print("ALL TESTS PASSED! DOY logic is correct.")
print("=" * 80)
|
||||
|
|
@ -0,0 +1,73 @@
|
|||
#!/usr/bin/env python3
|
||||
"""
|
||||
Quick test: Verify feature extraction works
|
||||
"""
|
||||
|
||||
import sys
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
from pathlib import Path
|
||||
|
||||
sys.path.insert(0, str(Path(__file__).parent))
|
||||
|
||||
from harvest_date_pred_utils import extract_features, load_model_and_config
|
||||
|
||||
# Project-local paths: the exported per-day CI table for the test project.
project_name = "angata"
base_storage = Path("../laravel_app/storage/app") / project_name / "Data"
CI_DATA_FILE = base_storage / "extracted_ci" / "ci_data_for_python" / "ci_data_for_python.csv"

print("="*80)
print("DEBUG: Feature Extraction Test")
print("="*80)

# Load model config (we only need config['features'] here)
print("\n[1] Loading model config...")
model, config, scalers = load_model_and_config(Path("."))
print(f"  Config features: {config['features']}")
print(f"  Number of features: {len(config['features'])}")

# Load CI data (field IDs read as str so string comparisons below work)
print("\n[2] Loading CI data...")
ci_data = pd.read_csv(CI_DATA_FILE, dtype={'field': str})
ci_data['Date'] = pd.to_datetime(ci_data['Date'])
print(f"  Columns: {ci_data.columns.tolist()}")
print(f"  Total rows: {len(ci_data)}")

# Test on a single field
# NOTE(review): this script reads the CI series from a 'value' column,
# while sibling scripts use config['data']['ci_column'] (e.g. 'FitData') -
# confirm which column the exported CSV actually carries.
test_field = "1"
field_data = ci_data[ci_data['field'] == test_field].sort_values('Date').reset_index(drop=True)
print(f"\n[3] Testing on field {test_field}...")
print(f"  Data points: {len(field_data)}")
print(f"  Date range: {field_data['Date'].min().date()} to {field_data['Date'].max().date()}")
print(f"  Columns in field data: {field_data.columns.tolist()}")
print(f"  Sample values:")
print(field_data[['Date', 'value']].head())

# Test feature extraction on first 50 days
print(f"\n[4] Extracting features for first 50 days...")
try:
    subset = field_data.iloc[:50].copy()
    features = extract_features(subset, config['features'], ci_column='value')
    print(f"  ✓ Success!")
    print(f"  Feature shape: {features.shape}")
    print(f"  Expected shape: (50, {len(config['features'])})")
    print(f"  Feature values sample (first 5 days):")
    for i in range(min(5, features.shape[0])):
        print(f"    Day {i}: {features[i]}")
except Exception as e:
    print(f"  ✗ Error: {e}")
    import traceback
    traceback.print_exc()

# Growing windows: production re-extracts features on an expanding window
# as data arrives daily, so extraction must work at every window size.
print("\n[5] Testing on growing windows...")
try:
    for window_size in [10, 20, 30, 50]:
        window_data = field_data.iloc[:window_size].copy()
        features = extract_features(window_data, config['features'], ci_column='value')
        print(f"  Window size {window_size}: shape={features.shape}, min={features.min():.4f}, max={features.max():.4f}")
except Exception as e:
    print(f"  ✗ Error: {e}")
    import traceback
    traceback.print_exc()

print("\n✓ Feature extraction test complete")
|
||||
|
|
@ -0,0 +1,141 @@
|
|||
#!/usr/bin/env python3
"""
Test ONLY the growing window method (what production actually uses)
Never run model on full sequence - only on expanding windows

This matches real deployment where data arrives daily
"""

import sys
import pandas as pd
import numpy as np
import torch
from pathlib import Path

# Make the repository root (three levels up) importable so the shared
# harvest utilities resolve regardless of the current working directory.
sys.path.insert(0, str(Path(__file__).parent.parent.parent))

from harvest_date_pred_utils import (
    load_model_and_config,
    extract_features,
)

# One-off debug script: project and test field are hard-coded below.
project_name = "angata"

# Find root by walking up until we find laravel_app
script_dir = Path(__file__).parent
root = script_dir
while root != root.parent:  # Stop at filesystem root
    if (root / "laravel_app").exists():
        break
    root = root.parent

# Input CSV produced by the CI-extraction step; model artifacts live in python_app.
base_storage = root / "laravel_app" / "storage" / "app" / project_name / "Data"
CI_DATA_FILE = base_storage / "extracted_ci" / "ci_data_for_python" / "ci_data_for_python.csv"
MODEL_DIR = root / "python_app"

print("="*80)
print("GROWING WINDOW METHOD ONLY (Real Production Simulation)")
print("="*80)

# Load model
print("\n[1] Loading model...")
model, config, scalers = load_model_and_config(MODEL_DIR)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load CI data ('field' kept as string so identifiers are not coerced to ints).
print("\n[2] Loading CI data...")
ci_data = pd.read_csv(CI_DATA_FILE, dtype={'field': str})
ci_data['Date'] = pd.to_datetime(ci_data['Date'])

# Test on field 779
test_field = "779"
field_data = ci_data[ci_data['field'] == test_field].sort_values('Date').reset_index(drop=True)

print(f"\n[3] Field {test_field}: {len(field_data)} data points")
print(f" Date range: {field_data['Date'].min().date()} to {field_data['Date'].max().date()}")

# Simulate growing window (real production)
print(f"\n[4] Simulating growing window (expanding daily)...")

# Detection state: a harvest is declared after `consecutive_days` windows in a
# row whose last-timestep detected-probability exceeds `threshold`.
harvest_dates = []          # (harvest_date, absolute index, last prob) per detection
current_pos = 0             # start index of the current season's window
consecutive_above = 0
threshold = 0.3
consecutive_days = 2
model_runs = 0              # number of forward passes, reported at the end

while current_pos < len(field_data):
    consecutive_above = 0
    found_harvest = False

    # window_end is an ABSOLUTE end index (exclusive) into field_data.
    for window_end in range(current_pos + 1, len(field_data) + 1):
        # Expand window: current_pos to window_end
        window_data = field_data.iloc[current_pos:window_end].copy().reset_index(drop=True)

        try:
            # Extract features for THIS window only
            features = extract_features(window_data, config['features'], ci_column='value')

            # Normalize each feature column with its fitted scaler.
            features_scaled = features.copy().astype(float)
            for fi, scaler in enumerate(scalers):
                features_scaled[:, fi] = scaler.transform(features[:, fi].reshape(-1, 1)).flatten()

            # Run model on expanding window (two heads: imminent + detected).
            with torch.no_grad():
                x_tensor = torch.tensor(features_scaled, dtype=torch.float32).unsqueeze(0).to(device)
                imminent_probs, detected_probs = model(x_tensor)

            model_runs += 1

            # Check LAST timestep only - it represents "today" in the simulation.
            last_prob = detected_probs[0, -1].item()
            last_date = window_data.iloc[-1]['Date'].date()

            # Print every 50th window to track progress
            if window_end % 50 == 0 or window_end < 10:
                print(f" Window [{current_pos:3d}:{window_end:3d}] ({last_date}): prob={last_prob:.4f}", end="")
                if last_prob > threshold:
                    print(" ✓ ABOVE", end="")
                print()

            # Check threshold
            if last_prob > threshold:
                consecutive_above += 1
            else:
                consecutive_above = 0

            # Harvest detected
            if consecutive_above >= consecutive_days:
                # NOTE(review): window_end already counts from index 0 (the
                # range starts at current_pos + 1), so adding current_pos here
                # looks like it double-counts once a second season starts -
                # verify against run_phase1_growing_window before trusting
                # second-and-later harvest indices.
                harvest_idx = current_pos + window_end - consecutive_days
                harvest_date = field_data.iloc[harvest_idx]['Date']
                harvest_dates.append((harvest_date, harvest_idx, last_prob))
                print(f"\n ✓ HARVEST DETECTED at {harvest_date.date()} (index {harvest_idx})")
                print(f" {consecutive_days} consecutive days above {threshold}")

                # Jump past this harvest
                current_pos = harvest_idx + 1
                found_harvest = True
                break

        except Exception as e:
            print(f" ERROR at window [{current_pos}:{window_end}]: {e}")
            continue

    if not found_harvest:
        break

print(f"\n[5] Results:")
print(f" Total model runs: {model_runs}")
print(f" Harvests found: {len(harvest_dates)}")

if harvest_dates:
    print(f"\n Harvest dates:")
    for date, idx, prob in harvest_dates:
        print(f" {date.date()}: index {idx}, last_prob={prob:.4f}")
else:
    print(f"\n No harvests detected")

print(f"\n[6] Analysis:")
print(f" Model runs per day: {model_runs / len(field_data):.2f}x")
print(f" Expected: ~{len(field_data):.0f} runs (one per day)")
|
||||
|
|
@ -0,0 +1,272 @@
|
|||
"""
|
||||
COMPARISON TEST: Growing Window vs Single Run approach
|
||||
Purpose: Compare harvest dates detected by two different methods
|
||||
|
||||
Method 1 (Growing Window - Current):
|
||||
- Day 1: Run model on [day1]
|
||||
- Day 2: Run model on [day1:2]
|
||||
- Day 3: Run model on [day1:3]
|
||||
- ... up to day 477
|
||||
- This matches real-time production where data arrives daily
|
||||
- Takes ~477 model runs per field (SLOW)
|
||||
|
||||
Method 2 (Single Run - Proposed):
|
||||
- Run model ONCE on full [day1:477] sequence
|
||||
- Use these probabilities to scan for harvests
|
||||
- This is 477x faster but assumes different LSTM context
|
||||
- May produce different harvest dates
|
||||
|
||||
Question: Do these methods detect similar harvest dates or different ones?
|
||||
"""
|
||||
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
import torch
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from harvest_date_pred_utils import (
|
||||
load_model_and_config,
|
||||
extract_features,
|
||||
)
|
||||
import time
|
||||
|
||||
|
||||
def method_growing_window(field_data, model, config, scalers, ci_column, device, threshold=0.3, consecutive_days=2):
    """Original method: expanding window, run model multiple times.

    Simulates daily data arrival: for each new day the model is re-run on
    the window field_data.iloc[current_pos:window_end]. A harvest is
    declared once the last timestep's detected-probability exceeds
    ``threshold`` on ``consecutive_days`` consecutive windows.

    Args:
        field_data: per-field DataFrame sorted by 'Date' (one row per day).
        model: trained torch model; may return (imminent, detected) tensors
            or a single tensor.
        config: model config dict; config['features'] lists feature names.
        scalers: per-feature fitted scalers, applied column-wise.
        ci_column: column name handed to extract_features.
        device: torch device for inference.
        threshold: detected-probability threshold for a "harvest day".
        consecutive_days: consecutive days required above the threshold.

    Returns:
        List of (harvest_date, harvest_idx, last_prob) tuples.
    """
    harvest_dates = []
    current_pos = 0

    while current_pos < len(field_data):
        consecutive_above_threshold = 0

        for window_end in range(current_pos + 1, len(field_data) + 1):
            window_data = field_data.iloc[current_pos:window_end].copy().reset_index(drop=True)

            try:
                features = extract_features(window_data, config['features'], ci_column=ci_column)

                # Apply scalers; a failing scaler leaves that column raw
                # (best effort, mirroring the production pipeline).
                for fi, scaler in enumerate(scalers):
                    try:
                        features[:, fi] = scaler.transform(features[:, fi].reshape(-1, 1)).flatten()
                    except Exception:
                        pass

                # Run model on the expanding window.
                with torch.no_grad():
                    x_tensor = torch.tensor(features, dtype=torch.float32).unsqueeze(0).to(device)
                    output = model(x_tensor)

                if isinstance(output, tuple):
                    _, detected_probs = output
                    detected_probs = detected_probs.squeeze(0).cpu().numpy()
                else:
                    detected_probs = output.squeeze(0).cpu().numpy()

                # Only the LAST timestep matters - it is "today" in the simulation.
                last_prob = detected_probs[-1]

                if last_prob > threshold:
                    consecutive_above_threshold += 1
                else:
                    consecutive_above_threshold = 0

                # Harvest detected
                if consecutive_above_threshold >= consecutive_days:
                    # BUGFIX: window_end is already an ABSOLUTE index (the
                    # range starts at current_pos + 1), so the old formula
                    # `current_pos + window_end - consecutive_days - 1`
                    # double-added current_pos (wrong for every harvest after
                    # the first) and the stray -1 shifted the date one day
                    # early. The last day of the run is window_end - 1, so
                    # the run started at window_end - consecutive_days -
                    # identical to method_single_run's
                    # `pos - consecutive_days + 1` convention, which is the
                    # whole point of this comparison.
                    harvest_idx = window_end - consecutive_days
                    harvest_date = field_data.iloc[harvest_idx]['Date']
                    harvest_dates.append((harvest_date, harvest_idx, last_prob))

                    # Resume scanning on the day after the detected harvest
                    # (same convention as method_single_run).
                    current_pos = harvest_idx + 1
                    break

            except Exception:
                continue
        else:
            # Inner loop exhausted the data without a detection: done.
            break

    return harvest_dates
|
||||
|
||||
|
||||
def method_single_run(field_data, model, config, scalers, ci_column, device, threshold=0.3, consecutive_days=2):
    """Proposed method: run model once on full sequence.

    Extracts features and runs one forward pass over the whole time series,
    then scans the resulting per-day detected-probabilities for runs of
    ``consecutive_days`` days above ``threshold``.

    Args:
        field_data: per-field DataFrame sorted by 'Date' (one row per day).
        model: trained torch model; may return (imminent, detected) tensors
            or a single tensor.
        config: model config dict; config['features'] lists feature names.
        scalers: per-feature fitted scalers, applied column-wise.
        ci_column: column name handed to extract_features.
        device: torch device for inference.
        threshold: detected-probability threshold for a "harvest day".
        consecutive_days: consecutive days required above the threshold.

    Returns:
        List of (harvest_date, harvest_idx, prob) tuples; empty on failure.
    """
    harvest_dates = []
    current_pos = 0

    try:
        # Extract features ONCE for full dataset
        features = extract_features(field_data, config['features'], ci_column=ci_column)

        # Apply scalers; a failing scaler leaves that column raw.
        for fi, scaler in enumerate(scalers):
            try:
                features[:, fi] = scaler.transform(features[:, fi].reshape(-1, 1)).flatten()
            except Exception:
                pass

        # Run model ONCE to get all probabilities
        with torch.no_grad():
            x_tensor = torch.tensor(features, dtype=torch.float32).unsqueeze(0).to(device)
            output = model(x_tensor)

        if isinstance(output, tuple):
            _, detected_probs = output
            detected_probs = detected_probs.squeeze(0).cpu().numpy()
        else:
            detected_probs = output.squeeze(0).cpu().numpy()

        # Scan forward looking for harvests
        while current_pos < len(field_data):
            consecutive_above_threshold = 0

            for pos in range(current_pos, len(field_data)):
                prob = detected_probs[pos]

                if prob > threshold:
                    consecutive_above_threshold += 1
                else:
                    consecutive_above_threshold = 0

                # Harvest detected
                if consecutive_above_threshold >= consecutive_days:
                    harvest_idx = pos - consecutive_days + 1
                    harvest_date = field_data.iloc[harvest_idx]['Date']
                    harvest_dates.append((harvest_date, harvest_idx, prob))

                    # Move anchor point past this harvest
                    current_pos = harvest_idx + 1
                    break
            else:
                # No harvest found in remaining data
                break

    except Exception as e:
        # BUGFIX: this was a silent `pass`, which made a crashed run look
        # identical to a genuine "0 harvests" result in this diagnostic
        # script. Report the failure; still return what was found so far.
        print(f"    method_single_run failed: {e}")

    return harvest_dates
|
||||
|
||||
|
||||
def compare_field(field_id, ci_data, model, config, scalers, device, threshold=0.3):
    """Run both detection methods on one field and time each of them.

    Returns None when the field has fewer than 10 days of data; otherwise
    a dict with both harvest lists, wall-clock timings, and the speedup.
    """
    field_data = (
        ci_data[ci_data['field'] == field_id]
        .sort_values('Date')
        .reset_index(drop=True)
    )

    num_days = len(field_data)
    if num_days < 10:
        # Too little data to make a meaningful comparison.
        return None

    ci_column = config['data']['ci_column']

    # Method 1: the slow per-day expanding-window simulation.
    print(f"\n Growing Window method...", end=" ", flush=True)
    t0 = time.time()
    growing_harvests = method_growing_window(field_data, model, config, scalers, ci_column, device, threshold)
    time_growing = time.time() - t0
    print(f"({time_growing:.2f}s, {num_days} model runs)")

    # Method 2: one forward pass over the whole sequence.
    print(f" Single Run method...", end=" ", flush=True)
    t0 = time.time()
    single_harvests = method_single_run(field_data, model, config, scalers, ci_column, device, threshold)
    time_single = time.time() - t0
    print(f"({time_single:.2f}s, 1 model run)")

    return {
        'field': field_id,
        'num_days': num_days,
        'growing_harvests': growing_harvests,
        'single_harvests': single_harvests,
        'time_growing': time_growing,
        'time_single': time_single,
        'speedup': time_growing / time_single if time_single > 0 else 0,
    }
|
||||
|
||||
|
||||
def main():
    """CLI entry point for the method-comparison experiment.

    argv[1] = project name (default "angata"), argv[2] = number of sample
    fields (default 3), argv[3] = detection threshold (default 0.3).
    Prints per-field harvests from both methods plus a summary verdict.
    """
    project_name = sys.argv[1] if len(sys.argv) > 1 else "angata"
    num_samples = int(sys.argv[2]) if len(sys.argv) > 2 else 3
    threshold = float(sys.argv[3]) if len(sys.argv) > 3 else 0.3

    # Load data
    base_storage = Path("../laravel_app/storage/app") / project_name / "Data"
    ci_data_dir = base_storage / "extracted_ci" / "ci_data_for_python"
    CI_DATA_FILE = ci_data_dir / "ci_data_for_python.csv"

    if not CI_DATA_FILE.exists():
        print(f"ERROR: {CI_DATA_FILE} not found")
        return

    print(f"Loading CI data...")
    ci_data = pd.read_csv(CI_DATA_FILE, dtype={'field': str})
    ci_data['Date'] = pd.to_datetime(ci_data['Date'])

    # Get sample of fields (seeded so reruns pick the same fields).
    all_fields = sorted(ci_data['field'].unique())
    np.random.seed(42)
    sample_fields = np.random.choice(all_fields, size=min(num_samples, len(all_fields)), replace=False)

    # Load model
    print(f"Loading model...")
    model, config, scalers = load_model_and_config(Path("."))
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    print(f"\n{'='*80}")
    print(f"COMPARING METHODS (threshold={threshold}, consecutive_days=2)")
    print(f"{'='*80}")

    all_results = []
    for idx, field_id in enumerate(sample_fields, 1):
        print(f"\n[{idx}/{len(sample_fields)}] Field {field_id}")
        result = compare_field(field_id, ci_data, model, config, scalers, device, threshold)

        if result:
            all_results.append(result)

            # NOTE(review): the tuple unpack below rebinds `idx`, shadowing
            # the enumerate counter of the outer loop (harmless here because
            # `idx` is not reused afterwards, but worth renaming).
            print(f"\n Growing Window found {len(result['growing_harvests'])} harvests:")
            for date, idx, prob in result['growing_harvests']:
                print(f" - {date.date()}: prob={prob:.4f}")

            print(f"\n Single Run found {len(result['single_harvests'])} harvests:")
            for date, idx, prob in result['single_harvests']:
                print(f" - {date.date()}: prob={prob:.4f}")

            print(f"\n ⏱ Speed comparison:")
            print(f" Growing Window: {result['time_growing']:.2f}s ({result['num_days']} days processed)")
            print(f" Single Run: {result['time_single']:.2f}s (1 run)")
            print(f" Speedup: {result['speedup']:.0f}x faster")

            # Compare harvests (date lists must match exactly, in order).
            growing_dates = [d for d, _, _ in result['growing_harvests']]
            single_dates = [d for d, _, _ in result['single_harvests']]

            if growing_dates == single_dates:
                print(f"\n ✓ IDENTICAL: Both methods found the same harvest dates")
            else:
                print(f"\n ✗ DIFFERENT: Methods found different harvest dates")
                print(f" Growing only: {[d for d in growing_dates if d not in single_dates]}")
                print(f" Single only: {[d for d in single_dates if d not in growing_dates]}")

    # Summary
    print(f"\n{'='*80}")
    print(f"SUMMARY ({len(all_results)} fields tested)")
    print(f"{'='*80}")

    identical = sum(1 for r in all_results if [d for d, _, _ in r['growing_harvests']] == [d for d, _, _ in r['single_harvests']])
    different = len(all_results) - identical

    print(f"\nIdentical results: {identical}/{len(all_results)}")
    print(f"Different results: {different}/{len(all_results)}")

    if all_results:
        avg_speedup = np.mean([r['speedup'] for r in all_results])
        print(f"\nAverage speedup: {avg_speedup:.0f}x")
        print(f"\nConclusion:")
        if identical == len(all_results):
            print(f" ✓ Methods are EQUIVALENT - Single run approach is safe to use")
            print(f" Recommend switching to single run for {avg_speedup:.0f}x faster execution")
        else:
            print(f" ✗ Methods produce DIFFERENT results - Need to understand why")
            print(f" Growing window is slower but may be more accurate for live deployment")


if __name__ == "__main__":
    main()
|
||||
|
|
@ -0,0 +1,123 @@
|
|||
#!/usr/bin/env python3
"""
Complete test: Feature extraction + Model inference + Phase 1 detection
"""

import sys
import pandas as pd
import numpy as np
import torch
from pathlib import Path

# Make the script's own directory importable for the shared utilities.
sys.path.insert(0, str(Path(__file__).parent))

from harvest_date_pred_utils import (
    load_model_and_config,
    extract_features,
    run_phase1_growing_window
)

# Fixed project for this debug run; path is relative to python_app.
project_name = "angata"
base_storage = Path("../laravel_app/storage/app") / project_name / "Data"
CI_DATA_FILE = base_storage / "extracted_ci" / "ci_data_for_python" / "ci_data_for_python.csv"

print("="*80)
print("DEBUG: Model Inference + Phase 1 Detection")
print("="*80)

# Load model
print("\n[1] Loading model...")
model, config, scalers = load_model_and_config(Path("."))
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f" Device: {device}")
print(f" Scalers type: {type(scalers)}")
print(f" Number of scalers: {len(scalers) if isinstance(scalers, list) else 'N/A (dict/object)'}")

# Load CI data ('field' kept as string identifiers).
print("\n[2] Loading CI data...")
ci_data = pd.read_csv(CI_DATA_FILE, dtype={'field': str})
ci_data['Date'] = pd.to_datetime(ci_data['Date'])

# Test on a known field (field 1)
test_field = "1"
field_data = ci_data[ci_data['field'] == test_field].sort_values('Date').reset_index(drop=True)
print(f"\n[3] Testing on field {test_field}...")
print(f" Data points: {len(field_data)}")

# Test with first 100 days
subset_100 = field_data.iloc[:100].copy().reset_index(drop=True)
print(f"\n[4] Testing model inference on first 100 days...")

try:
    features = extract_features(subset_100, config['features'], ci_column='value')
    print(f" Features shape: {features.shape}")
    print(f" Features dtype: {features.dtype}")

    # Apply scalers column-by-column; any scaler failure aborts the test.
    features_scaled = features.copy().astype(float)
    print(f" Applying {len(scalers)} scalers...")

    for fi, scaler in enumerate(scalers):
        try:
            col_data = features[:, fi].reshape(-1, 1)
            scaled_col = scaler.transform(col_data)
            features_scaled[:, fi] = scaled_col.flatten()
            if fi < 3:  # Show first 3 scalers
                print(f" Scaler {fi}: transformed {features[0, fi]:.4f} → {features_scaled[0, fi]:.4f}")
        except Exception as e:
            print(f" ERROR in scaler {fi}: {e}")
            raise

    # Run model (batch dimension added via unsqueeze).
    print(f"\n Running model inference...")
    x_tensor = torch.tensor(features_scaled, dtype=torch.float32).unsqueeze(0).to(device)
    print(f" Tensor shape: {x_tensor.shape}")

    with torch.no_grad():
        imminent_probs, detected_probs = model(x_tensor)

    print(f" Imminent probs shape: {imminent_probs.shape}")
    print(f" Detected probs shape: {detected_probs.shape}")
    print(f" Detected probs dtype: {detected_probs.dtype}")

    # Analyze detected probs
    detected_np = detected_probs[0].cpu().numpy()  # Get first (only) batch
    print(f"\n Detected head analysis:")
    print(f" Min: {detected_np.min():.4f}")
    print(f" Max: {detected_np.max():.4f}")
    print(f" Mean: {detected_np.mean():.4f}")
    print(f" Median: {np.median(detected_np):.4f}")
    print(f" > 0.1: {(detected_np > 0.1).sum()} days")
    print(f" > 0.3: {(detected_np > 0.3).sum()} days")
    print(f" > 0.5: {(detected_np > 0.5).sum()} days")

    # Show top 5 peaks (argsort ascending, so take the tail and reverse).
    top_indices = np.argsort(detected_np)[-5:][::-1]
    print(f"\n Top 5 detected peaks:")
    for idx in top_indices:
        date = subset_100.iloc[idx]['Date'].date()
        prob = detected_np[idx]
        print(f" Day {idx} ({date}): {prob:.4f}")

except Exception as e:
    print(f" ERROR: {e}")
    import traceback
    traceback.print_exc()
    sys.exit(1)

# Test Phase 1 growing window
print(f"\n[5] Testing Phase 1 growing window (threshold=0.3, consecutive=2)...")
try:
    phase1_results = run_phase1_growing_window(
        subset_100, model, config, scalers, 'value', device,
        threshold=0.3, consecutive_days=2
    )
    print(f" ✓ Phase 1 found {len(phase1_results)} harvest(s):")
    for harvest_date, harvest_idx in phase1_results:
        print(f" {harvest_date.date()}: index {harvest_idx}")
except Exception as e:
    print(f" ERROR: {e}")
    import traceback
    traceback.print_exc()

print("\n✓ Model inference test complete")
|
||||
|
|
@ -0,0 +1,276 @@
|
|||
"""
|
||||
TEST SCRIPT: Inspect raw model output distributions for a single field
|
||||
Purpose: Diagnose why thresholding is failing and harvest dates are wrong
|
||||
|
||||
This shows:
|
||||
1. Distribution of harvest probability scores (0-1 range)
|
||||
2. How often consecutive_days=3 is actually achievable
|
||||
3. Actual season boundaries detected
|
||||
4. Recommendations for threshold adjustment
|
||||
|
||||
Usage:
|
||||
python test_model_output_distribution.py angata [field_id]
|
||||
|
||||
Examples:
|
||||
python test_model_output_distribution.py angata 1
|
||||
python test_model_output_distribution.py angata 10042
|
||||
"""
|
||||
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
import torch
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from harvest_date_pred_utils import (
|
||||
load_model_and_config,
|
||||
extract_features,
|
||||
)
|
||||
|
||||
|
||||
def analyze_single_field(ci_data, field_id, model, config, scalers, device):
    """Analyze raw model outputs for a single field.

    Prints (nothing is returned): probability statistics, feasibility of
    each threshold / consecutive-day combination, top peaks, a full daily
    timeline, valley (season-boundary) candidates, and which harvest date
    each threshold/consecutive combination would detect.

    Args:
        ci_data: CI DataFrame with at least 'field', 'Date' and 'FitData'
            columns ('field' values are strings).
        field_id: field identifier to analyze (string).
        model: trained torch model; may return a (imminent, detected) tuple
            or a single tensor.
        config: model config; reads config['data']['ci_column'] and
            config['features'].
        scalers: per-feature fitted scalers.
        device: torch device for inference.
    """

    # Filter to field
    field_data = ci_data[ci_data['field'] == field_id].sort_values('Date').reset_index(drop=True)

    if len(field_data) == 0:
        print(f"ERROR: No data for field {field_id}")
        return

    print(f"\n{'='*80}")
    print(f"FIELD: {field_id}")
    print(f"{'='*80}")
    print(f"Date range: {field_data['Date'].min()} to {field_data['Date'].max()}")
    print(f"Total days: {len(field_data)}")

    # Extract features (same as main pipeline)
    try:
        ci_column = config['data']['ci_column']
        features = extract_features(field_data, config['features'], ci_column=ci_column)
        if features is None or len(features) == 0:
            print(f"ERROR: extract_features returned empty for field {field_id}")
            return
    except Exception as e:
        print(f"ERROR extracting features: {e}")
        import traceback
        traceback.print_exc()
        return

    # Apply scalers (CRITICAL - same as production code)
    try:
        for fi, scaler in enumerate(scalers):
            try:
                # Best effort: a failed scaler leaves that column unscaled.
                features[:, fi] = scaler.transform(features[:, fi].reshape(-1, 1)).flatten()
            except Exception as e:
                print(f" Warning: Scaler {fi} failed: {e}")
                pass
    except Exception as e:
        print(f"ERROR applying scalers: {e}")
        import traceback
        traceback.print_exc()
        return

    # Run model once over the whole sequence.
    features_tensor = torch.FloatTensor(features).unsqueeze(0).to(device)
    with torch.no_grad():
        output = model(features_tensor)

    # Convert to numpy (handle different output formats)
    # Model has TWO heads: imminent_probs, detected_probs
    if isinstance(output, tuple):
        imminent_probs = output[0].cpu().numpy().flatten()
        detected_probs = output[1].cpu().numpy().flatten()
        probs = detected_probs  # Use DETECTED head (current harvest), not imminent (future)
        print(f"Model outputs TWO heads: imminent + detected")
        print(f" Using DETECTED head for harvest detection (sparse spikes at harvest)")
    else:
        # Fallback for single output
        probs = output.cpu().numpy().flatten()
        print(f"Model output: single head")

    print(f"\nModel output shape: {probs.shape}")
    print(f"Output range: {probs.min():.4f} to {probs.max():.4f}")

    # Statistics
    print(f"\nHarvest probability statistics:")
    print(f" Mean: {probs.mean():.4f}")
    print(f" Median: {np.median(probs):.4f}")
    print(f" Std Dev: {probs.std():.4f}")
    print(f" Min: {probs.min():.4f}")
    print(f" Max: {probs.max():.4f}")

    # Distribution by threshold
    print(f"\nDistribution by threshold:")
    thresholds = [0.2, 0.3, 0.4, 0.45, 0.5, 0.6, 0.7]
    for thresh in thresholds:
        count = np.sum(probs > thresh)
        pct = 100 * count / len(probs)
        print(f" > {thresh}: {count:4d} days ({pct:5.1f}%)")

    # Consecutive days analysis (key metric)
    print(f"\nConsecutive days above threshold (Phase 1 requirement):")
    for thresh in [0.3, 0.4, 0.45, 0.5]:
        above = (probs > thresh).astype(int)
        # Find consecutive runs: zero-pad both ends so diff marks every
        # 0->1 transition as a run start and every 1->0 as a run end.
        changes = np.diff(np.concatenate(([0], above, [0])))
        starts = np.where(changes == 1)[0]
        ends = np.where(changes == -1)[0]
        runs = ends - starts

        if len(runs) > 0:
            max_run = np.max(runs)
            print(f" Threshold {thresh}: max consecutive = {max_run} days (need 3 for Phase 1)")
            if len(runs) > 5:
                print(f" {len(runs)} separate runs detected (seasons?)")
        else:
            print(f" Threshold {thresh}: no runs detected")

    # Show top predictions (argsort ascending: take tail, then reverse).
    print(f"\nTop 10 harvest probability peaks:")
    top_indices = np.argsort(probs)[-10:][::-1]
    for rank, idx in enumerate(top_indices, 1):
        date = field_data.iloc[idx]['Date']
        prob = probs[idx]
        print(f" {rank:2d}. Day {idx:4d} ({date}): {prob:.4f}")

    # Show timeline.  NOTE(review): the step is len(probs)//20 (~20 samples
    # total), not literally every 10 days as the header string says.
    print(f"\nTimeline of probabilities (sampling every 10 days):")
    for idx in range(0, len(probs), max(1, len(probs) // 20)):
        date_str = field_data.iloc[idx]['Date'].strftime("%Y-%m-%d")
        # NOTE(review): assumes the CSV has a 'FitData' column - confirm
        # against the extraction schema (features use ci_column instead).
        ci_value = field_data.iloc[idx]['FitData']
        prob = probs[idx]
        bar = '-' * int(prob * 35)
        try:
            print(f" {date_str} CI={ci_value:.4f} Prob={prob:.4f} {bar}")
        except UnicodeEncodeError:
            print(f" {date_str} CI={ci_value:.4f} Prob={prob:.4f}")

    # FULL DAILY PROBABILITIES WITH CI VALUES
    print(f"\n{'='*80}")
    print(f"FULL DAILY PROBABILITIES WITH CI VALUES ({len(probs)} days):")
    print(f"{'='*80}")
    print(f"{'Day':>4} {'Date':<12} {'CI':>8} {'Probability':>12} {'Visual':<40}")
    print(f"{'-'*100}")

    for idx in range(len(probs)):
        date_str = field_data.iloc[idx]['Date'].strftime("%Y-%m-%d")
        ci_value = field_data.iloc[idx]['FitData']
        prob = probs[idx]
        bar = '-' * int(prob * 35)  # Use dashes instead of █ for Unicode safety
        try:
            print(f"{idx:4d} {date_str} {ci_value:8.4f} {prob:12.4f} {bar}")
        except UnicodeEncodeError:
            # Fallback for Windows encoding issues
            print(f"{idx:4d} {date_str} {ci_value:8.4f} {prob:12.4f}")

    print(f"{'-'*100}")

    # Find valleys (days with low probabilities that could indicate season boundaries)
    print(f"\nDays with LOWEST probabilities (potential season boundaries):")
    valleys_threshold = 0.5  # Days below this might be season breaks
    valley_indices = np.where(probs < valleys_threshold)[0]

    if len(valley_indices) > 0:
        print(f" Found {len(valley_indices)} days below {valleys_threshold}")
        # Get valleys sorted by probability
        valley_data = [(idx, probs[idx], field_data.iloc[idx]['Date']) for idx in valley_indices]
        valley_data.sort(key=lambda x: x[1])  # Sort by probability (lowest first)

        print(f"\n Bottom 20 lowest-probability days:")
        for rank, (idx, prob, date) in enumerate(valley_data[:20], 1):
            print(f" {rank:2d}. Day {idx:3d} ({date}): {prob:.4f}")
    else:
        print(f" None - all days above {valleys_threshold}")

    # Identify likely harvest dates by finding local minima (valleys between growing periods)
    print(f"\nLikely season boundaries (local minima approach):")
    # Find indices where probability suddenly drops (derivative)
    if len(probs) > 7:
        # 7-day centered rolling mean smooths noise before differencing.
        smoothed = pd.Series(probs).rolling(window=7, center=True).mean()
        derivatives = smoothed.diff().fillna(0)

        # Find big drops (where derivative is very negative)
        drops = np.where(derivatives < -0.2)[0]  # Significant downward moves

        if len(drops) > 0:
            print(f" Found {len(drops)} significant drops (prob falling by 0.2+):")
            for idx in drops[:10]:  # Show first 10
                date = field_data.iloc[idx]['Date']
                before = probs[max(0, idx-1)]
                after = probs[idx]
                print(f" Day {idx:3d} ({date}): {before:.4f} → {after:.4f}")
        else:
            print(f" No significant drops detected (probabilities don't dip much)")

    # Show which harvest dates would be detected at different thresholds
    print(f"\nHarvest detection (first day where prob > threshold for N consecutive days):")
    for thresh in [0.2, 0.3, 0.4, 0.5, 0.6]:
        for consec in [1, 2, 3]:
            above = (probs > thresh).astype(int)
            changes = np.diff(np.concatenate(([0], above, [0])))
            starts = np.where(changes == 1)[0]

            if len(starts) > 0:
                # For each harvest start, find where it would trigger with consecutive_days
                detected_harvests = []
                for start_idx in starts:
                    # Check if we have enough consecutive days
                    if start_idx + consec - 1 < len(probs):
                        if all(probs[start_idx:start_idx + consec] > thresh):
                            harvest_date = field_data.iloc[start_idx]['Date']
                            detected_harvests.append((start_idx, harvest_date))

                if detected_harvests:
                    first_idx, first_date = detected_harvests[0]
                    print(f" Threshold={thresh}, consecutive={consec}: {first_date} (day {first_idx})")
                else:
                    print(f" Threshold={thresh}, consecutive={consec}: None detected")
|
||||
|
||||
|
||||
def main():
    """CLI entry point: load the CI data and model, then dump the raw
    output distribution for one field.

    argv[1] = project name (default "angata"); argv[2] = field id
    (defaults to the first field present in the CSV).
    """
    project_name = sys.argv[1] if len(sys.argv) > 1 else "angata"
    chosen_field = sys.argv[2] if len(sys.argv) > 2 else None

    # Resolve the CI csv produced by the extraction pipeline.
    csv_path = (
        Path("../laravel_app/storage/app") / project_name / "Data"
        / "extracted_ci" / "ci_data_for_python" / "ci_data_for_python.csv"
    )

    if not csv_path.exists():
        print(f"ERROR: {csv_path} not found")
        return

    print(f"Loading CI data from {csv_path}...")
    ci_data = pd.read_csv(csv_path, dtype={'field': str})
    ci_data['Date'] = pd.to_datetime(ci_data['Date'])

    if chosen_field is None:
        # No field given: default to the first one, and list up to ten
        # candidates so the user can re-run with a specific id.
        fields = sorted(ci_data['field'].unique())[:10]
        chosen_field = fields[0]
        print(f"No field specified. Testing first field: {chosen_field}")
        print(f"Available fields: {', '.join(fields)}")

    # Load model artifacts from the current directory.
    print(f"\nLoading model...")
    from harvest_date_pred_utils import load_model_and_config
    model, config, scalers = load_model_and_config(Path("."))
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Device: {device}")

    # Run the full diagnostic dump for the chosen field.
    analyze_single_field(ci_data, str(chosen_field), model, config, scalers, device)

    print(f"\n{'='*80}")
    print("INTERPRETATION:")
    print(" If max consecutive < 3 for threshold=0.5:")
    print(" → Lower threshold to 0.3-0.4, or reduce consecutive_days to 1-2")
    print(" If multiple runs detected but harvest_date == first date:")
    print(" → Season detection is failing (check extract_features)")
    print(" If peaks are scattered randomly:")
    print(" → Model may need retraining or data validation")
    print("="*80)


if __name__ == "__main__":
    main()
|
||||
|
|
@ -0,0 +1,178 @@
|
|||
#!/usr/bin/env python3
"""
Debug script: Test if script 22 logic is working
Tests the two-step refinement on a single known field
"""

import sys
import time
import pandas as pd
import numpy as np
import torch
from pathlib import Path

# Make the workspace root importable so harvest_date_pred_utils resolves.
sys.path.insert(0, str(Path(__file__).parent.parent.parent))

from harvest_date_pred_utils import (
    load_model_and_config,
    extract_features,
    run_phase1_growing_window,
)

# Project whose CI data is used for this debug run.
project_name = "angata"

# Find the workspace root by looking for laravel_app folder
script_dir = Path(__file__).parent
root = script_dir
while root != root.parent:
    if (root / "laravel_app").exists():
        break
    root = root.parent

# Input CSV produced by the extraction pipeline; model artifacts live in python_app.
base_storage = root / "laravel_app" / "storage" / "app" / project_name / "Data"
CI_DATA_FILE = base_storage / "extracted_ci" / "ci_data_for_python" / "ci_data_for_python.csv"
MODEL_DIR = root / "python_app"

print("="*80)
print("DEBUG: Script 22 Two-Step Refinement Logic")
print("="*80)

# [1] Load the trained model, its config and the fitted per-feature scalers.
print("\n[1] Loading model...")
model, config, scalers = load_model_and_config(MODEL_DIR)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f" Device: {device}")
print(f" Model features: {config['features']}")

# [2] Load CI data; field ids are read as strings so leading zeros survive.
print("\n[2] Loading CI data...")
ci_data = pd.read_csv(CI_DATA_FILE, dtype={'field': str})
ci_data['Date'] = pd.to_datetime(ci_data['Date'])
print(f" Total rows: {len(ci_data)}")
print(f" Fields: {ci_data['field'].nunique()}")
print(f" Date range: {ci_data['Date'].min().date()} to {ci_data['Date'].max().date()}")

# [3] Restrict to a single known field, sorted chronologically.
# Test on a known field (field 779 from our previous tests)
test_field = "779"
field_data = ci_data[ci_data['field'] == test_field].sort_values('Date').reset_index(drop=True)

print(f"\n[3] Testing on field {test_field}...")
print(f" Data points: {len(field_data)}")
# NOTE(review): if the field has no rows, the date-range print below may fail
# on NaT before the empty-data guard runs — confirm intended ordering.
print(f" Date range: {field_data['Date'].min().date()} to {field_data['Date'].max().date()}")

if len(field_data) == 0:
    print(f" ERROR: No data for field {test_field}")
    sys.exit(1)

# [4] Extract the model's input features for the whole field series.
print(f"\n[4] Extracting features for field {test_field}...")
try:
    features = extract_features(field_data.reset_index(drop=True), config['features'], ci_column='value')
    print(f" Features shape: {features.shape}")
    print(f" Features dtype: {features.dtype}")
except Exception as e:
    print(f" ERROR: Could not extract features: {e}")
    sys.exit(1)

# [5] Run the growing-window Phase 1 detection with production-like settings.
print(f"\n[5] Running Phase 1 GROWING WINDOW method (threshold=0.5, consecutive=3)...")
print(f" This simulates real production: expanding windows, checking each day")
print(f" Expected: ~477 model runs for 477 days (SLOW)")

import time  # NOTE(review): redundant — time is already imported at the top
start_time = time.time()

# Add instrumentation to see how many model runs are happening
# NOTE(review): original_run is kept but never called; instrumented_run below
# re-implements the growing-window loop instead of wrapping it.
original_run = run_phase1_growing_window
||||
def instrumented_run(*args, **kwargs):
    """Instrumented copy of run_phase1_growing_window that counts model runs.

    Positional args mirror the original function:
    (field_data, model, config, scalers, ci_column, device).
    Keyword args: threshold (default 0.3) and consecutive_days (default 2).

    Returns:
        list[tuple]: (harvest_date, harvest_index) pairs, where harvest_index
        is the absolute row index in field_data of the FIRST day of the
        consecutive above-threshold run.
    """
    field_data = args[0]
    model = args[1]
    config = args[2]
    scalers = args[3]
    ci_column = args[4]
    device = args[5]
    threshold = kwargs.get('threshold', 0.3)
    consecutive_days = kwargs.get('consecutive_days', 2)

    harvest_dates = []
    current_pos = 0
    model_runs = 0

    print(f" Starting growing window loop...")

    while current_pos < len(field_data):
        consecutive_above_threshold = 0

        # window_end is an ABSOLUTE index: the window covers rows
        # [current_pos, window_end), growing one day at a time.
        for window_end in range(current_pos + 1, len(field_data) + 1):
            window_data = field_data.iloc[current_pos:window_end].copy().reset_index(drop=True)

            try:
                features = extract_features(window_data, config['features'], ci_column=ci_column)

                # Scale each feature column with its fitted scaler.
                features_scaled = features.copy().astype(float)
                for fi, scaler in enumerate(scalers):
                    try:
                        features_scaled[:, fi] = scaler.transform(features[:, fi].reshape(-1, 1)).flatten()
                    except Exception as e:
                        raise ValueError(f"Scaler {fi} failed: {e}")

                with torch.no_grad():
                    x_tensor = torch.tensor(features_scaled, dtype=torch.float32).unsqueeze(0).to(device)
                    imminent_probs, detected_probs = model(x_tensor)

                model_runs += 1
                # Only the probability of the newest day matters for the
                # consecutive-days criterion.
                last_prob = detected_probs[0, -1].item()

                if last_prob > threshold:
                    consecutive_above_threshold += 1
                else:
                    consecutive_above_threshold = 0

                if consecutive_above_threshold >= consecutive_days:
                    # BUGFIX: window_end is already absolute, so the first day
                    # of the above-threshold run is window_end - consecutive_days.
                    # The previous code added current_pos again, producing wrong
                    # indices (or IndexError) for every harvest after the first.
                    harvest_idx = window_end - consecutive_days
                    harvest_date = field_data.iloc[harvest_idx]['Date']
                    harvest_dates.append((harvest_date, harvest_idx))
                    # Resume scanning just past the detected harvest start.
                    current_pos = harvest_idx + 1
                    break

            except Exception as e:
                # Best-effort: a window whose features cannot be extracted or
                # scored is simply skipped (mirrors the production loop).
                pass
        else:
            # Inner loop exhausted the data without a detection: nothing more
            # to find, so stop the outer scan.
            break

    print(f" Model runs performed: {model_runs}")
    return harvest_dates
||||
# Run the instrumented Phase 1 detection with production settings
# (threshold=0.5, 3 consecutive above-threshold days).
phase1_results = instrumented_run(
    field_data.reset_index(drop=True),
    model, config, scalers, 'value', device,
    threshold=0.5,
    consecutive_days=3
)

elapsed = time.time() - start_time
print(f"\n Time elapsed: {elapsed:.2f}s")

if phase1_results:
    print(f" ✓ Phase 1 detected {len(phase1_results)} harvest(s):")

    # Get probabilities for display by running model once on full field
    with torch.no_grad():
        # `features` was extracted earlier for the full field series.
        X = features.reshape(1, -1, len(config['features']))
        # NOTE(review): zeros_like preserves X's dtype — if extract_features
        # returns an integer array the scaled values would truncate; confirm
        # it returns floats.
        X_normalized = np.zeros_like(X)
        for fi, scaler in enumerate(scalers):
            X_normalized[0, :, fi] = scaler.transform(X[0, :, fi].reshape(-1, 1)).flatten()
        X_tensor = torch.from_numpy(X_normalized).float().to(device)
        _, detected_probs = model(X_tensor)
        detected_np = detected_probs[0].cpu().numpy()

    # Report each detection with its full-series probability at that index.
    for harvest_date, harvest_idx in phase1_results:
        prob = detected_np[harvest_idx] if harvest_idx < len(detected_np) else 0.0
        print(f" {harvest_date.date()}: index {harvest_idx}, probability={prob:.4f}")
else:
    print(f" ✗ Phase 1: No harvest detected")
|
@ -1,29 +0,0 @@
|
|||
# Merge all CI RDS files into a single CSV
library(tidyverse)

# Paths
ci_data_dir <- "r_app/experiments/ci_graph_exploration/CI_data"
output_csv <- "python_app/lstm_ci_data_combined.csv"

# Find all RDS files
rds_files <- list.files(ci_data_dir, pattern = "\\.rds$", full.names = TRUE)
print(paste("Found", length(rds_files), "RDS files"))

# Load one RDS file and tag its rows with the client name derived from the
# filename.
load_one <- function(file) {
  filename <- basename(file)
  client_name <- sub("\\.rds$", "", filename)  # Extract client name from filename
  print(paste("Loading:", filename, "- Client:", client_name))
  data <- readRDS(file)
  data$client <- client_name
  data
}

# PERF FIX: bind_rows() inside a loop re-copies the accumulated frame on every
# iteration (quadratic). Loading everything into a list and binding once is
# linear and produces the identical result.
combined_data <- bind_rows(lapply(rds_files, load_one))

print(paste("Total rows:", nrow(combined_data)))
print(paste("Columns:", paste(names(combined_data), collapse = ", ")))

# Write to CSV
write.csv(combined_data, output_csv, row.names = FALSE)
print(paste("✓ Saved to:", output_csv))
||||
791
python_app/rgb_visualization.py
Normal file
791
python_app/rgb_visualization.py
Normal file
|
|
@ -0,0 +1,791 @@
|
|||
#!/usr/bin/env python
|
||||
"""
|
||||
RGB Visualization Tool for Harvest Date Validation
|
||||
|
||||
Creates 3x3 temporal grids showing satellite imagery around registered and predicted harvest dates.
|
||||
Extracts RGB from 8-band Planet scope data and clips to field boundaries from GeoJSON.
|
||||
|
||||
Functions:
|
||||
- load_field_boundaries(): Load field geometries from GeoJSON
|
||||
- find_closest_tiff(): Find available TIFF file closest to target date
|
||||
- load_and_clip_tiff_rgb(): Load TIFF, extract RGB, clip to field boundary
|
||||
- create_temporal_grid(): Create 3x3 grid (4 pre-harvest, 1 near, 2-3 post-harvest)
|
||||
- generate_rgb_grids(): Main orchestration function
|
||||
|
||||
Usage:
|
||||
from rgb_visualization import generate_rgb_grids
|
||||
generate_rgb_grids(field_data, field_id, registered_harvest_dates, predicted_harvest_dates, output_dir, tiff_dir, geojson_path)
|
||||
"""
|
||||
|
||||
import json
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
from pathlib import Path
|
||||
from datetime import datetime, timedelta
|
||||
import matplotlib
|
||||
matplotlib.use('Agg') # Use non-interactive backend to avoid display hangs
|
||||
import matplotlib.pyplot as plt
|
||||
import matplotlib.patches as patches
|
||||
from matplotlib.colors import Normalize
|
||||
|
||||
import warnings
|
||||
warnings.filterwarnings('ignore')
|
||||
|
||||
try:
|
||||
import rasterio
|
||||
from rasterio.mask import mask
|
||||
import shapely.geometry as shgeom
|
||||
except ImportError:
|
||||
print("Warning: rasterio not available. RGB visualization will be skipped.")
|
||||
rasterio = None
|
||||
|
||||
|
||||
def load_field_boundaries(geojson_path, field_id):
    """
    Load a single field boundary from a GeoJSON file.

    A feature matches when its 'field' or 'sub_field' property equals
    field_id. For MultiPolygon geometries only the FIRST polygon's exterior
    ring is used; for Polygon geometries the exterior ring is used.

    Args:
        geojson_path (Path): Path to pivot.geojson
        field_id (str): Field identifier (e.g., "13973")

    Returns:
        tuple: (feature dict, shapely.geometry.Polygon) on success, or
        (None, None) when the field is missing or the file is unreadable.
    """
    try:
        # GeoJSON is UTF-8 by spec (RFC 7946); don't rely on the platform
        # default encoding (fixes mojibake/decode errors on Windows).
        with open(geojson_path, encoding="utf-8") as f:
            geojson_data = json.load(f)

        for feature in geojson_data.get('features', []):
            props = feature.get('properties', {})
            # Try matching on 'field' or 'sub_field'
            if str(props.get('field', '')) != str(field_id) and \
               str(props.get('sub_field', '')) != str(field_id):
                continue

            geometry = feature.get('geometry')
            if not geometry:
                continue

            geom_type = geometry.get('type', '')
            coordinates = geometry.get('coordinates', [])
            if not coordinates:
                continue

            if geom_type == 'MultiPolygon':
                # coordinates[polygon][ring][point]: use the first polygon's
                # exterior ring only.
                return feature, shgeom.Polygon(coordinates[0][0])
            if geom_type == 'Polygon':
                # coordinates[ring][point]: ring 0 is the exterior ring.
                return feature, shgeom.Polygon(coordinates[0])
            # Other geometry types (Point, LineString, ...) are not usable
            # as field boundaries; keep scanning.

        print(f" ⚠ Field {field_id} not found in GeoJSON")
        return None, None
    except Exception as e:
        # Missing file, malformed JSON, or invalid geometry: report and
        # degrade to "no boundary" so callers can skip the field.
        print(f" ✗ Error loading GeoJSON: {e}")
        return None, None
||||
def find_overlapping_tiles(target_date, tiff_dir, field_boundary, days_window=60, exclude_dates=None, debug=False):
    """
    Find tile files with actual data (not cloud-masked) for target_date or nearest date.

    Searches dates by increasing distance from the target until finding tiles
    that overlap the field boundary, never looking further than ``days_window``
    days away. Dates in ``exclude_dates`` are skipped to keep temporal
    diversity in the grids.

    Args:
        target_date (pd.Timestamp): Target date to find tiles near
        tiff_dir (Path): Directory containing per-date subdirectories of TIFFs
        field_boundary (shapely.Polygon): Field boundary for overlap detection
        days_window (int): Max days to search before/after target
        exclude_dates (iterable): Dates to skip (avoid repetition)
        debug (bool): Enable detailed debugging output

    Returns:
        tuple: (list of tile paths, actual_date, days_diff), or
        ([], None, None) when nothing usable is found.
    """
    target_date = pd.Timestamp(target_date)
    tiff_dir = Path(tiff_dir)
    # Normalize to a set of Timestamps: callers pass lists or sets.
    exclude_dates = {pd.Timestamp(d) for d in (exclude_dates or [])}

    if not tiff_dir.exists():
        if debug:
            print(f" [DEBUG] TIFF dir does not exist: {tiff_dir}")
        return [], None, None

    # Build a map of available dates. Directory names look like
    # 'YYYY-MM-DD' or 'YYYY-MM-DD_suffix'.
    available_dates = {}
    date_parse_errors = 0

    for date_dir in tiff_dir.iterdir():
        if not date_dir.is_dir():
            continue
        try:
            dir_name = date_dir.name
            date_str = dir_name.split('_')[0]
            tile_date = pd.Timestamp(date_str)

            # Include ALL tiles regardless of size: small tiles may still
            # contain valid data for specific fields.
            tile_files = list(date_dir.glob('*.tif'))

            if tile_files:
                available_dates[tile_date] = (tile_files, dir_name)
        except Exception as e:
            date_parse_errors += 1
            if debug:
                print(f" [DEBUG] Failed to parse date from {date_dir.name}: {e}")

    if debug:
        print(f" [DEBUG] Found {len(available_dates)} dates with tile files, {date_parse_errors} parse errors")
        print(f" [DEBUG] Date range: {min(available_dates.keys()).strftime('%Y-%m-%d') if available_dates else 'N/A'} to {max(available_dates.keys()).strftime('%Y-%m-%d') if available_dates else 'N/A'}")

    if not available_dates:
        return [], None, None

    # Search dates by increasing distance from target, looking for tiles with actual data
    sorted_dates = sorted(available_dates.keys(), key=lambda d: abs((d - target_date).days))

    for search_date in sorted_dates:
        days_diff = (search_date - target_date).days

        # BUGFIX: days_window was previously accepted but never applied, so
        # every search scanned the whole archive and the callers' windowed
        # retry logic was dead code. Dates are sorted by distance, so once we
        # pass the window nothing closer remains — stop.
        if abs(days_diff) > days_window:
            break

        # Skip if this date was recently used (avoid temporal repetition)
        if search_date in exclude_dates:
            continue

        tiles, dir_name = available_dates[search_date]

        # Check which tiles at this date actually overlap the field.
        overlapping_tiles = []
        tile_check_errors = 0

        for tile_path in tiles:
            try:
                with rasterio.open(tile_path) as src:
                    tile_bounds = src.bounds
                    tile_geom = shgeom.box(*tile_bounds)

                    # Debug first tile
                    if debug and not overlapping_tiles and tile_check_errors == 0:
                        print(f" [DEBUG] First tile check for {tile_path.name}:")
                        print(f" Tile bounds: {tile_bounds}")
                        print(f" Tile CRS: {src.crs}")
                        print(f" Field bounds: {field_boundary.bounds}")
                        print(f" Field geom type: {field_boundary.geom_type}")
                        print(f" Intersects: {tile_geom.intersects(field_boundary)}")

                    if tile_geom.intersects(field_boundary):
                        overlapping_tiles.append(tile_path)
            except Exception as e:
                tile_check_errors += 1
                if debug:
                    print(f" [DEBUG] Error checking tile {tile_path.name}: {e}")

        if debug:
            print(f" [DEBUG] Date {search_date.strftime('%Y-%m-%d')}: {len(tiles)} tiles, {len(overlapping_tiles)} overlap field, {tile_check_errors} errors")

        if overlapping_tiles:
            # Found overlapping tiles, return them
            print(f" [FIND_TILES] Target: {target_date.strftime('%Y-%m-%d')}, Using: {search_date.strftime('%Y-%m-%d')} ({days_diff:+d}d), Tiles: {[Path(t).name for t in overlapping_tiles]}")
            return overlapping_tiles, search_date, days_diff

    # No overlapping tiles found within the window
    if debug:
        print(f" [DEBUG] No overlapping tiles found for {target_date.strftime('%Y-%m-%d')} within {len(sorted_dates)} searched dates")

    return [], None, None
||||
def load_and_clip_tiff_rgb(tiff_path, field_boundary, rgb_bands=(1, 2, 3)):
    """
    Load TIFF and extract RGB bands clipped to field boundary.

    For merged_final_tif files (cloud-masked and filtered):
    - Band 1: Red, Band 2: Green, Band 3: Blue, Band 4: NIR, Band 5: CI

    Args:
        tiff_path (Path): Path to TIFF file
        field_boundary (shapely.Polygon): Field boundary for clipping
        rgb_bands (tuple): 1-indexed band numbers read as (R, G, B);
            defaults to (1, 2, 3) for merged_final_tif files.

    Returns:
        np.ndarray: RGB data (height, width, 3) with values 0-1,
        or None on any error / empty result.
    """
    # Cheap pure guards first; checking field_boundary before rasterio keeps
    # the None-boundary path independent of the optional dependency.
    if field_boundary is None or rasterio is None:
        return None

    try:
        with rasterio.open(tiff_path) as src:
            # BUGFIX: rgb_bands was previously accepted but ignored (bands
            # 1,2,3 were hard-coded). Honor the parameter; the default keeps
            # the old behavior.
            bands_to_read = tuple(rgb_bands)
            if src.count < max(bands_to_read):
                return None

            # Mask and read bands - extract ONLY the specific field polygon
            geom = shgeom.mapping(field_boundary)
            try:
                masked_data, _ = mask(src, [geom], crop=True, indexes=list(bands_to_read))
                rgb = np.stack([masked_data[i] for i in range(len(bands_to_read))], axis=-1)
            except (ValueError, RuntimeError) as e:
                # Mask failed - field doesn't overlap this tile or geometry issue
                print(f" MASK ERROR on {Path(tiff_path).name}: {str(e)[:50]}")
                return None

            rgb = rgb.astype(np.float32)

            # Normalize to 0-1: infer the source bit depth from the max value.
            max_val = np.nanmax(rgb)
            if max_val > 0:
                if max_val <= 255:        # 8-bit data
                    rgb = rgb / 255.0
                elif max_val <= 65535:    # 16-bit data
                    rgb = rgb / 65535.0
                else:                     # unknown range: scale by the max
                    rgb = rgb / max_val

            rgb = np.clip(rgb, 0, 1)

            # All-NaN means nothing but cloud/invalid pixels — no image.
            if np.all(np.isnan(rgb)):
                return None

            # Replace any remaining NaN with 0 (cloud/invalid pixels)
            return np.nan_to_num(rgb, nan=0.0)
    except Exception:
        # Any other I/O or geometry failure: treat as "no image available".
        return None
||||
def load_and_composite_tiles_rgb(tile_paths, field_boundary):
    """
    Composite the RGB clips from several overlapping tiles into one image.

    Each tile is loaded and clipped via load_and_clip_tiff_rgb(). Same-shaped
    clips are merged pixel-wise with a max so valid data from any tile wins;
    differently-shaped clips fall back to the largest one. A percentile-based
    contrast stretch is then applied so dim scenes remain visible.

    Args:
        tile_paths (list[Path]): Tile file paths covering the field
        field_boundary (shapely.Polygon): Field boundary for clipping

    Returns:
        np.ndarray: float32 RGB image (height, width, 3) in [0, 1],
        or None when no tile yields usable data.
    """
    if not tile_paths or field_boundary is None or rasterio is None:
        return None

    try:
        # Clip every tile and keep only the ones that produced an image.
        clips = [c for c in (load_and_clip_tiff_rgb(p, field_boundary) for p in tile_paths)
                 if c is not None]
        if not clips:
            return None

        if len(clips) == 1:
            mosaic = clips[0]
        else:
            try:
                # Same-shaped clips: per-pixel max keeps the brightest
                # (i.e. valid, non-masked) data from any tile.
                mosaic = np.max(np.stack(clips, axis=0), axis=0)
            except ValueError:
                # Shapes differ (tiles masked to different field areas):
                # fall back to the largest, most complete clip.
                mosaic = max(clips, key=lambda a: a.size)

        mosaic = mosaic.astype(np.float32)

        # Contrast stretch over non-zero pixels, clamped to the 2nd/98th
        # percentiles to stay robust against outliers.
        lit_pixels = mosaic[mosaic > 0]
        if len(lit_pixels) > 0:
            lo = np.percentile(lit_pixels, 2)
            hi = np.percentile(lit_pixels, 98)
            if hi > lo:
                mosaic = np.clip((mosaic - lo) / (hi - lo), 0, 1)

        return mosaic.astype(np.float32)

    except Exception:
        return None
||||
def has_valid_rgb_data(rgb_data, threshold=0.05):
    """
    Check if an RGB image contains actual data (not black/empty).

    Valid means the max value exceeds ``threshold`` AND there is some
    variation (max - min > 0.01), i.e. the clip is not a flat fill.

    Args:
        rgb_data (np.ndarray | None): RGB image, expected in [0, 1], or None
        threshold (float): Minimum max-value for the image to count as lit

    Returns:
        bool: True when the image looks like real data.
    """
    if rgb_data is None:
        return False

    try:
        data_max = np.nanmax(rgb_data)
        data_min = np.nanmin(rgb_data)
        # Bright enough AND not a constant fill.
        return data_max > threshold and (data_max - data_min) > 0.01
    except Exception:
        # BUGFIX: was a bare `except:`, which also swallowed
        # KeyboardInterrupt/SystemExit. Any array error still means "no data".
        return False
||||
def create_temporal_rgb_grid(harvest_date, field_data, field_id, tiff_dir, field_boundary,
                             title, output_dir, harvest_type='registered', model_name=None, harvest_index=None):
    """
    Create a 5x3 temporal grid of RGB clips around a harvest date (15 images).

    Grid position 8 (row 2, col 4) is the ANCHOR: the valid image closest to
    the harvest date. Positions 7..0 are filled walking backwards in time from
    the anchor and positions 9..14 forwards, using flexible spacing
    (7, 10, 15, ... days) so cloudy gaps are skipped.

    Args:
        harvest_date (pd.Timestamp): Target harvest date
        field_data (pd.DataFrame): Field data with Date column
            (currently unused here; kept for interface compatibility)
        field_id (str): Field identifier
        tiff_dir (Path): Directory with per-date TIFF subdirectories
        field_boundary (shapely.Polygon): Field boundary
        title (str): Plot title
        output_dir (Path): Output directory
        harvest_type (str): 'registered' or 'predicted'
        model_name (str): Model name for predicted harvests (e.g., 'Original')
        harvest_index (int): Index of harvest within same model (for multiples)

    Returns:
        Path: Path to saved PNG or None if saving failed
    """
    harvest_date = pd.Timestamp(harvest_date)

    # Pre-allocate the 15 grid slots.
    rgb_images = [None] * 15
    days_offsets = [None] * 15
    actual_dates = [None] * 15
    used_dates = set()  # dates already placed in the grid (efficient lookups)

    # STEP 0: Debug - list available TIFF dates. Folder names are parsed the
    # same way find_overlapping_tiles does ('YYYY-MM-DD' or 'YYYY-MM-DD_suffix'),
    # so suffixed folders are no longer silently dropped from this listing.
    print(f" [STEP 0] Checking available TIFF dates in {tiff_dir}...")
    available_dates = []
    if tiff_dir.exists():
        for date_folder in sorted(tiff_dir.iterdir()):
            if date_folder.is_dir():
                try:
                    date_obj = datetime.strptime(date_folder.name.split('_')[0], '%Y-%m-%d').date()
                    available_dates.append(date_obj)
                except ValueError:
                    # Not a date-named folder; ignore.
                    pass
    print(f" Found {len(available_dates)} dates with data: {available_dates[:5]}... (showing first 5)")

    anchor_idx = 8  # grid slot for the harvest/anchor image
    failed_anchor_dates = []  # anchor candidates whose image had no valid data

    def _install_anchor(tile_paths, date, diff, label):
        """Load/validate an anchor candidate; record it at position 8 on success."""
        rgb = load_and_composite_tiles_rgb(tile_paths, field_boundary)
        if rgb is None or not has_valid_rgb_data(rgb):
            return False
        rgb_images[anchor_idx] = rgb
        days_offsets[anchor_idx] = 0  # the anchor is the temporal reference point
        actual_dates[anchor_idx] = date
        used_dates.add(date)
        print(f" ✓ ANCHOR FOUND ({label}): {date.strftime('%Y-%m-%d')} ({diff:+d}d from predicted harvest)")
        return True

    # STEP 1: Find the anchor image closest to the harvest date. Try a tight
    # ±14 day window first, then fall back to ±60 days.
    print(f" [STEP 1] Finding anchor (closest to harvest {harvest_date.strftime('%Y-%m-%d')}, searching ±14 days)...")
    anchor_tile_paths, anchor_date, anchor_days_diff = find_overlapping_tiles(
        harvest_date, tiff_dir, field_boundary, days_window=14, exclude_dates=[], debug=False
    )

    if anchor_tile_paths and anchor_date:
        if not _install_anchor(anchor_tile_paths, anchor_date, anchor_days_diff, '±14d'):
            failed_anchor_dates.append(anchor_date)
            print(f" ⚠ Found date {anchor_date.strftime('%Y-%m-%d')} within ±14d, but image has no valid data")
            print(f" [RETRY] Expanding anchor search to ±60 days (excluding failed dates)...")
            anchor_tile_paths, anchor_date, anchor_days_diff = find_overlapping_tiles(
                harvest_date, tiff_dir, field_boundary, days_window=60, exclude_dates=failed_anchor_dates, debug=False
            )
            if anchor_tile_paths and anchor_date:
                if not _install_anchor(anchor_tile_paths, anchor_date, anchor_days_diff, '±60d'):
                    failed_anchor_dates.append(anchor_date)
                    print(f" ✗ No valid anchor found even within ±60 days")
            else:
                print(f" ✗ No tiles found for any date within ±60 days")
    else:
        print(f" ⚠ No tiles found within ±14 days, expanding search...")
        anchor_tile_paths, anchor_date, anchor_days_diff = find_overlapping_tiles(
            harvest_date, tiff_dir, field_boundary, days_window=60, exclude_dates=[], debug=False
        )
        if anchor_tile_paths and anchor_date:
            if not _install_anchor(anchor_tile_paths, anchor_date, anchor_days_diff, '±60d'):
                print(f" ✗ No valid anchor found even within ±60 days")
        else:
            print(f" ✗ No tiles found for any date within ±60 days")

    # Progressive spacing used when walking away from the anchor: try a
    # ~weekly cadence first, then widen the jump when imagery is missing.
    search_offsets = [7, 10, 15, 20, 30, 40, 60, 90, 120]

    def _next_valid_image(from_date, direction, limit_date=None):
        """Find the nearest valid image stepping `direction` (+1/-1) in time.

        Tries each offset in search_offsets from `from_date`. Returns
        (actual_date, rgb) or None when nothing usable is found.
        """
        for days_offset in search_offsets:
            candidate = from_date + timedelta(days=direction * days_offset)
            if limit_date is not None and candidate > limit_date:
                break
            tile_paths, actual_date, _ = find_overlapping_tiles(
                candidate, tiff_dir, field_boundary, days_window=60, exclude_dates=used_dates, debug=False
            )
            if tile_paths and actual_date:
                rgb = load_and_composite_tiles_rgb(tile_paths, field_boundary)
                if rgb is not None and has_valid_rgb_data(rgb):
                    return actual_date, rgb
        return None

    def _store(slot, actual_date, rgb):
        """Record a collected image at the given grid slot."""
        rgb_images[slot] = rgb
        actual_dates[slot] = actual_date
        days_offsets[slot] = (actual_date - anchor_date).days
        used_dates.add(actual_date)

    # BUGFIX: previously an absent anchor (anchor_date is None) crashed below
    # with AttributeError/TypeError; now the collection steps are skipped.
    if anchor_date is not None:
        # STEP 2: Collect images BEFORE the anchor, walking backwards.
        print(f" [STEP 2] Collecting images BEFORE anchor (going backwards, flexible spacing)...")
        before_positions = [7, 6, 5, 4, 3, 2, 1, 0]  # filled nearest-to-anchor first
        pos_idx = 0
        last_found_date = anchor_date

        # NOTE(review): the year >= 2024 floor is hard-coded — presumably the
        # start of the imagery archive; confirm before reusing elsewhere.
        while pos_idx < len(before_positions) and last_found_date.year >= 2024:
            found = _next_valid_image(last_found_date, -1)
            if found is None:
                break  # no earlier imagery reachable with any offset
            actual_date, rgb = found
            _store(before_positions[pos_idx], actual_date, rgb)
            last_found_date = actual_date  # keep walking backwards from here
            print(f" ✓ Before[{pos_idx}]: {actual_date.strftime('%Y-%m-%d')} ({(actual_date - anchor_date).days:+d}d from anchor) - RGB: {np.nanmin(rgb):.4f}-{np.nanmax(rgb):.4f}")
            pos_idx += 1

        # STEP 3: Collect images AFTER the anchor, walking forwards.
        print(f" [STEP 3] Collecting images AFTER anchor (going forwards, flexible spacing)...")
        after_positions = [9, 10, 11, 12, 13, 14]  # filled in chronological order
        pos_idx = 0
        last_found_date = anchor_date
        max_search_date = anchor_date + timedelta(days=200)  # hard forward limit

        while pos_idx < len(after_positions) and last_found_date < max_search_date:
            found = _next_valid_image(last_found_date, +1, limit_date=max_search_date)
            if found is None:
                break  # no later imagery reachable with any offset
            actual_date, rgb = found
            _store(after_positions[pos_idx], actual_date, rgb)
            last_found_date = actual_date  # keep walking forwards from here
            print(f" ✓ After[{pos_idx}]: {actual_date.strftime('%Y-%m-%d')} ({(actual_date - anchor_date).days:+d}d from anchor) - RGB: {np.nanmin(rgb):.4f}-{np.nanmax(rgb):.4f}")
            pos_idx += 1

    # Create 5x3 grid plot (15 images)
    fig, axes = plt.subplots(3, 5, figsize=(25, 15))

    # Build title with anchor offset information
    anchor_offset_from_harvest = (actual_dates[8] - harvest_date).days if actual_dates[8] is not None else None
    if anchor_offset_from_harvest is not None and anchor_offset_from_harvest != 0:
        anchor_info = f"(Anchor: {actual_dates[8].strftime('%Y-%m-%d')}, {anchor_offset_from_harvest:+d}d from predicted harvest)"
    else:
        anchor_info = f"(Exact match with anchor: {actual_dates[8].strftime('%Y-%m-%d')})" if actual_dates[8] is not None else ""

    fig.suptitle(f'{title}\nField {field_id} - {harvest_type.upper()} Harvest: {harvest_date.strftime("%Y-%m-%d")} {anchor_info}',
                 fontsize=16, fontweight='bold')

    # Grid labels (5 columns x 3 rows = 15 slots); index 8 is HARVEST/anchor.
    positions = [
        ('T-56d', 0, 0), ('T-49d', 0, 1), ('T-42d', 0, 2), ('T-35d', 0, 3), ('T-28d', 0, 4),
        ('T-21d', 1, 0), ('T-14d', 1, 1), ('T-7d', 1, 2), ('HARVEST', 1, 3), ('T+7d', 1, 4),
        ('T+14d', 2, 0), ('T+21d', 2, 1), ('T+28d', 2, 2), ('T+35d', 2, 3), ('T+42d', 2, 4),
    ]

    for idx, (label, row, col) in enumerate(positions):
        ax = axes[row, col]

        if idx < len(rgb_images) and rgb_images[idx] is not None:
            rgb_data = rgb_images[idx]
            # Debug: report overall and per-band value ranges.
            data_min = np.nanmin(rgb_data)
            data_max = np.nanmax(rgb_data)
            data_mean = np.nanmean(rgb_data)
            data_std = np.nanstd(rgb_data)
            r_min, r_max, r_mean = np.nanmin(rgb_data[:, :, 0]), np.nanmax(rgb_data[:, :, 0]), np.nanmean(rgb_data[:, :, 0])
            g_min, g_max, g_mean = np.nanmin(rgb_data[:, :, 1]), np.nanmax(rgb_data[:, :, 1]), np.nanmean(rgb_data[:, :, 1])
            b_min, b_max, b_mean = np.nanmin(rgb_data[:, :, 2]), np.nanmax(rgb_data[:, :, 2]), np.nanmean(rgb_data[:, :, 2])
            print(f" DEBUG VALID {label} ({actual_dates[idx].strftime('%Y-%m-%d')}): RGB overall {data_min:.4f}-{data_max:.4f} (mean={data_mean:.4f}, std={data_std:.4f})")
            print(f" R: {r_min:.4f}-{r_max:.4f} (μ={r_mean:.4f}), G: {g_min:.4f}-{g_max:.4f} (μ={g_mean:.4f}), B: {b_min:.4f}-{b_max:.4f} (μ={b_mean:.4f})")

            # Display with explicit vmin/vmax to handle normalized 0-1 data
            ax.imshow(rgb_data, vmin=0, vmax=1)

            # Subplot title: show offsets from BOTH the anchor and the harvest.
            if days_offsets[idx] is not None:
                offset_from_anchor = days_offsets[idx]
                offset_from_harvest = (actual_dates[idx] - harvest_date).days
                if idx == 8:  # ANCHOR/HARVEST position
                    if offset_from_harvest == 0:
                        offset_str = f"HARVEST\n(Image: {actual_dates[idx].strftime('%Y-%m-%d')})"
                    else:
                        offset_str = f"HARVEST\n(Image: {actual_dates[idx].strftime('%Y-%m-%d')}, {offset_from_harvest:+d}d from predicted)"
                else:
                    offset_str = f"{offset_from_anchor:+d}d from anchor\n{offset_from_harvest:+d}d from harvest\n{actual_dates[idx].strftime('%Y-%m-%d')}"
            else:
                offset_str = "No Data"

            ax.set_title(offset_str, fontsize=9, fontweight='bold')
        else:
            ax.text(0.5, 0.5, 'No Data', ha='center', va='center', fontsize=12, color='gray')
            ax.set_title('No Data', fontsize=10)
            print(f" DEBUG EMPTY {label}: No image data collected")

        # Red border marks the anchor slot, with or without data.
        if idx == 8:
            for spine in ax.spines.values():
                spine.set_edgecolor('red')
                spine.set_linewidth(4)

        ax.set_xticks([])
        ax.set_yticks([])

    plt.tight_layout()

    # Save figure: field_<id>_<harvestdate>_<model|registered>[_harvestN]_rgb.png
    harvest_date_str = harvest_date.strftime('%Y%m%d')
    if harvest_type == 'registered':
        filename = f'field_{field_id}_{harvest_date_str}_registered_harvest_rgb.png'
    elif harvest_index is not None and harvest_index > 0:
        # Multiple predicted harvests from the same model get an index suffix.
        filename = f'field_{field_id}_{harvest_date_str}_{model_name}_harvest{harvest_index}_rgb.png'
    else:
        filename = f'field_{field_id}_{harvest_date_str}_{model_name}_harvest_rgb.png'

    output_path = Path(output_dir) / filename
    try:
        plt.savefig(output_path, dpi=100, format='png')
        plt.close()
        # BUGFIX: previously printed the literal text "(unknown)" instead of
        # the actual output path.
        print(f" ✓ Saved: {output_path}")
        return output_path
    except Exception as e:
        plt.close()
        print(f" ✗ Error saving PNG: {e}")
        return None
||||
def generate_rgb_grids(field_data, field_id, registered_harvest_dates, predicted_harvest_dates,
|
||||
output_dir, tiff_dir, geojson_path):
|
||||
"""
|
||||
Main orchestration function for RGB visualization.
|
||||
|
||||
Creates 3x3 grids for:
|
||||
1. Registered harvest dates (if available)
|
||||
2. Predicted harvest dates (if available)
|
||||
|
||||
Args:
|
||||
field_data (pd.DataFrame): Field data with Date, CI columns
|
||||
field_id (str): Field identifier
|
||||
registered_harvest_dates (list): List of registered harvest dates (pd.Timestamp)
|
||||
predicted_harvest_dates (list): List of predicted harvest dates (dict or pd.Timestamp)
|
||||
output_dir (Path): Output directory for plots
|
||||
tiff_dir (Path): Directory containing TIFF files
|
||||
geojson_path (Path): Path to pivot.geojson
|
||||
|
||||
Returns:
|
||||
dict: Summary of generated plots with keys 'registered' and 'predicted'
|
||||
"""
|
||||
if rasterio is None:
|
||||
print(" ⚠ Rasterio not available - skipping RGB visualization")
|
||||
return {'registered': [], 'predicted': []}
|
||||
|
||||
output_dir = Path(output_dir)
|
||||
output_dir.mkdir(parents=True, exist_ok=True)
|
||||
tiff_dir = Path(tiff_dir)
|
||||
geojson_path = Path(geojson_path)
|
||||
|
||||
if not tiff_dir.exists():
|
||||
print(f" ✗ TIFF directory not found: {tiff_dir}")
|
||||
return {'registered': [], 'predicted': []}
|
||||
|
||||
if not geojson_path.exists():
|
||||
print(f" ✗ GeoJSON not found: {geojson_path}")
|
||||
return {'registered': [], 'predicted': []}
|
||||
|
||||
# Load field boundary
|
||||
print(f" Loading field boundary for {field_id}...")
|
||||
feature, field_boundary = load_field_boundaries(geojson_path, field_id)
|
||||
|
||||
if field_boundary is None:
|
||||
print(f" ✗ Could not load field boundary for {field_id}")
|
||||
return {'registered': [], 'predicted': []}
|
||||
|
||||
results = {'registered': [], 'predicted': []}
|
||||
|
||||
# Process registered harvest dates
|
||||
if registered_harvest_dates and len(registered_harvest_dates) > 0:
|
||||
print(f" Processing {len(registered_harvest_dates)} registered harvest dates...")
|
||||
for i, harvest_date in enumerate(registered_harvest_dates):
|
||||
if pd.isna(harvest_date):
|
||||
continue
|
||||
|
||||
print(f" [{i+1}/{len(registered_harvest_dates)}] {harvest_date.strftime('%Y-%m-%d')}")
|
||||
output_path = create_temporal_rgb_grid(
|
||||
harvest_date, field_data, field_id, tiff_dir, field_boundary,
|
||||
title='Registered Harvest Validation',
|
||||
output_dir=output_dir,
|
||||
harvest_type='registered',
|
||||
model_name=None,
|
||||
harvest_index=i
|
||||
)
|
||||
if output_path:
|
||||
results['registered'].append(output_path)
|
||||
|
||||
# Process predicted harvest dates - grouped by model
|
||||
if predicted_harvest_dates and len(predicted_harvest_dates) > 0:
|
||||
print(f" Processing {len(predicted_harvest_dates)} predicted harvest dates...")
|
||||
|
||||
# Group by model to track index per model
|
||||
harvest_by_model = {}
|
||||
for harvest_info in predicted_harvest_dates:
|
||||
# Handle both dict and Timestamp formats
|
||||
if isinstance(harvest_info, dict):
|
||||
harvest_date = harvest_info.get('harvest_date')
|
||||
model_name = harvest_info.get('model_name', 'predicted')
|
||||
else:
|
||||
harvest_date = harvest_info
|
||||
model_name = 'predicted'
|
||||
|
||||
if model_name not in harvest_by_model:
|
||||
harvest_by_model[model_name] = []
|
||||
harvest_by_model[model_name].append(harvest_date)
|
||||
|
||||
# Process each model's harvests
|
||||
overall_index = 1
|
||||
for model_name, harvest_dates in harvest_by_model.items():
|
||||
for model_harvest_idx, harvest_date in enumerate(harvest_dates):
|
||||
if pd.isna(harvest_date):
|
||||
continue
|
||||
|
||||
print(f" [{overall_index}/{len(predicted_harvest_dates)}] {harvest_date.strftime('%Y-%m-%d')} ({model_name})")
|
||||
output_path = create_temporal_rgb_grid(
|
||||
harvest_date, field_data, field_id, tiff_dir, field_boundary,
|
||||
title=f'Predicted Harvest Validation ({model_name})',
|
||||
output_dir=output_dir,
|
||||
harvest_type='predicted',
|
||||
model_name=model_name,
|
||||
harvest_index=model_harvest_idx
|
||||
)
|
||||
if output_path:
|
||||
results['predicted'].append(output_path)
|
||||
overall_index += 1
|
||||
|
||||
return results
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
# Example usage
|
||||
print("RGB Visualization Tool")
|
||||
print("This module is intended to be imported and called from compare_307_models_production.py")
|
||||
print("\nExample:")
|
||||
print(" from rgb_visualization import generate_rgb_grids")
|
||||
print(" generate_rgb_grids(field_data, field_id, registered_dates, predicted_dates, output_dir, tiff_dir, geojson_path)")
|
||||
|
|
@ -6,14 +6,33 @@
|
|||
#' 2. Create master 5×5 grid covering all TIFFs
|
||||
#' 3. Split each daily TIFF into 25 tiles using the master grid
|
||||
#' 4. Save tiles in date-specific folders: daily_tiles/[DATE]/[DATE]_[TILE_ID].tif
|
||||
#' & "C:\Program Files\R\R-4.4.3\bin\x64\Rscript.exe" r_app/10_create_master_grid_and_split_tiffs.R 2026-01-13 2026-01-18
|
||||
|
||||
|
||||
library(terra)
|
||||
library(sf)
|
||||
|
||||
# ============================================================================
|
||||
# CONFIGURATION
|
||||
# CONFIGURATION & COMMAND-LINE ARGUMENTS
|
||||
# ============================================================================
|
||||
|
||||
# Parse command-line arguments for date filtering
|
||||
args <- commandArgs(trailingOnly = TRUE)
|
||||
|
||||
# Example: Rscript 10_create_master_grid_and_split_tiffs.R 2026-01-13 2026-01-17
|
||||
start_date <- NULL
|
||||
end_date <- NULL
|
||||
|
||||
if (length(args) >= 1) {
|
||||
start_date <- as.Date(args[1])
|
||||
cat("Filtering: start date =", as.character(start_date), "\n")
|
||||
}
|
||||
|
||||
if (length(args) >= 2) {
|
||||
end_date <- as.Date(args[2])
|
||||
cat("Filtering: end date =", as.character(end_date), "\n")
|
||||
}
|
||||
|
||||
PROJECT <- "angata"
|
||||
TIFF_FOLDER <- file.path("laravel_app", "storage", "app", PROJECT, "merged_tif_8b")
|
||||
|
||||
|
|
@ -40,31 +59,50 @@ cat("Grid subfolder: daily_tiles_split/", GRID_SIZE_LABEL, "/\n", sep = "")
|
|||
cat("\n[PART 1] Creating Master Grid\n")
|
||||
|
||||
# Load field boundaries for overlap checking
|
||||
cat("\n[1] Loading field boundaries from GeoJSON...\n")
|
||||
cat("\n[1] Checking for existing master grid...\n")
|
||||
|
||||
if (!file.exists(GEOJSON_PATH)) {
|
||||
stop("GeoJSON file not found at: ", GEOJSON_PATH, "\n",
|
||||
"Please ensure ", PROJECT, " has a pivot.geojson file.")
|
||||
}
|
||||
# Check if master grid already exists
|
||||
MASTER_GRID_PATH <- file.path(OUTPUT_FOLDER, paste0("master_grid_", GRID_SIZE_LABEL, ".geojson"))
|
||||
|
||||
field_boundaries_sf <- st_read(GEOJSON_PATH, quiet = TRUE)
|
||||
field_boundaries_vect <- terra::vect(GEOJSON_PATH)
|
||||
if (file.exists(MASTER_GRID_PATH)) {
|
||||
cat(" ✓ Found existing master grid at:\n ", MASTER_GRID_PATH, "\n", sep = "")
|
||||
master_grid_sf <- st_read(MASTER_GRID_PATH, quiet = TRUE)
|
||||
field_boundaries_sf <- NULL # No need to load pivot.geojson
|
||||
field_boundaries_vect <- NULL
|
||||
|
||||
cat(" ✓ Loaded ", nrow(field_boundaries_sf), " field(s)\n", sep = "")
|
||||
cat(" ✓ Loaded grid with ", nrow(master_grid_sf), " tiles\n", sep = "")
|
||||
|
||||
# Try to find a name column (could be 'name', 'field', 'field_name', etc.)
|
||||
field_names <- NA
|
||||
if ("name" %in% names(field_boundaries_sf)) {
|
||||
field_names <- field_boundaries_sf$name
|
||||
} else if ("field" %in% names(field_boundaries_sf)) {
|
||||
field_names <- field_boundaries_sf$field
|
||||
} else if ("field_name" %in% names(field_boundaries_sf)) {
|
||||
field_names <- field_boundaries_sf$field_name
|
||||
} else {
|
||||
field_names <- 1:nrow(field_boundaries_sf) # Fall back to indices
|
||||
# No existing grid - need to create one from pivot.geojson
|
||||
cat(" No existing grid found. Creating new one from pivot.geojson...\n")
|
||||
|
||||
if (!file.exists(GEOJSON_PATH)) {
|
||||
stop("GeoJSON file not found at: ", GEOJSON_PATH, "\n",
|
||||
"Please ensure ", PROJECT, " has a pivot.geojson file, or run this script ",
|
||||
"from the same directory as a previous successful run (grid already exists).")
|
||||
}
|
||||
|
||||
field_boundaries_sf <- st_read(GEOJSON_PATH, quiet = TRUE)
|
||||
field_boundaries_vect <- terra::vect(GEOJSON_PATH)
|
||||
|
||||
cat(" ✓ Loaded ", nrow(field_boundaries_sf), " field(s) from GeoJSON\n", sep = "")
|
||||
}
|
||||
|
||||
cat(" Fields: ", paste(field_names, collapse = ", "), "\n", sep = "")
|
||||
# Try to find a name column (only if field_boundaries_sf exists)
|
||||
if (!is.null(field_boundaries_sf)) {
|
||||
field_names <- NA
|
||||
if ("name" %in% names(field_boundaries_sf)) {
|
||||
field_names <- field_boundaries_sf$name
|
||||
} else if ("field" %in% names(field_boundaries_sf)) {
|
||||
field_names <- field_boundaries_sf$field
|
||||
} else if ("field_name" %in% names(field_boundaries_sf)) {
|
||||
field_names <- field_boundaries_sf$field_name
|
||||
} else {
|
||||
field_names <- 1:nrow(field_boundaries_sf) # Fall back to indices
|
||||
}
|
||||
|
||||
cat(" Fields: ", paste(field_names, collapse = ", "), "\n", sep = "")
|
||||
}
|
||||
|
||||
# Helper function: Check if a tile overlaps with any field (simple bbox overlap)
|
||||
tile_overlaps_fields <- function(tile_extent, field_geoms) {
|
||||
|
|
@ -105,6 +143,27 @@ cat("\n[2] Checking TIFF extents...\n")
|
|||
tiff_files <- list.files(TIFF_FOLDER, pattern = "\\.tif$", full.names = FALSE)
|
||||
tiff_files <- sort(tiff_files)
|
||||
|
||||
# Filter by date range if specified
|
||||
if (!is.null(start_date) || !is.null(end_date)) {
|
||||
cat("\nApplying date filter...\n")
|
||||
|
||||
file_dates <- as.Date(sub("\\.tif$", "", tiff_files))
|
||||
|
||||
if (!is.null(start_date) && !is.null(end_date)) {
|
||||
keep_idx <- file_dates >= start_date & file_dates <= end_date
|
||||
cat(" Date range: ", as.character(start_date), " to ", as.character(end_date), "\n", sep = "")
|
||||
} else if (!is.null(start_date)) {
|
||||
keep_idx <- file_dates >= start_date
|
||||
cat(" From: ", as.character(start_date), "\n", sep = "")
|
||||
} else {
|
||||
keep_idx <- file_dates <= end_date
|
||||
cat(" Until: ", as.character(end_date), "\n", sep = "")
|
||||
}
|
||||
|
||||
tiff_files <- tiff_files[keep_idx]
|
||||
cat(" ✓ Filtered to ", length(tiff_files), " file(s)\n", sep = "")
|
||||
}
|
||||
|
||||
if (length(tiff_files) == 0) {
|
||||
stop("No TIFF files found in ", TIFF_FOLDER)
|
||||
}
|
||||
|
|
@ -242,26 +301,33 @@ if (file.exists(master_grid_file)) {
|
|||
|
||||
cat("\n[PART 2] Creating Filtered Grid (only overlapping tiles)\n")
|
||||
|
||||
cat("\n[7] Filtering master grid to only overlapping tiles...\n")
|
||||
# If grid was loaded from file, it's already filtered. Skip filtering.
|
||||
if (!file.exists(MASTER_GRID_PATH)) {
|
||||
cat("\n[7] Filtering master grid to only overlapping tiles...\n")
|
||||
|
||||
# Check which tiles overlap with any field
|
||||
overlapping_tile_indices <- c()
|
||||
for (tile_idx in 1:nrow(master_grid_sf)) {
|
||||
# Check which tiles overlap with any field
|
||||
overlapping_tile_indices <- c()
|
||||
for (tile_idx in 1:nrow(master_grid_sf)) {
|
||||
tile_geom <- master_grid_sf[tile_idx, ]
|
||||
|
||||
# Check overlap with any field
|
||||
if (tile_overlaps_fields(st_bbox(tile_geom$geometry), field_boundaries_sf$geometry)) {
|
||||
overlapping_tile_indices <- c(overlapping_tile_indices, tile_idx)
|
||||
}
|
||||
}
|
||||
|
||||
cat(" Found ", length(overlapping_tile_indices), " overlapping tiles out of ", N_TILES, "\n", sep = "")
|
||||
cat(" Reduction: ", N_TILES - length(overlapping_tile_indices), " empty tiles will NOT be created\n", sep = "")
|
||||
|
||||
# Create filtered grid with only overlapping tiles
|
||||
filtered_grid_sf <- master_grid_sf[overlapping_tile_indices, ]
|
||||
filtered_grid_sf$tile_id <- sprintf("%02d", overlapping_tile_indices)
|
||||
} else {
|
||||
cat("\n[7] Using pre-filtered grid (already loaded from file)...\n")
|
||||
# Grid was already loaded - it's already filtered
|
||||
filtered_grid_sf <- master_grid_sf
|
||||
}
|
||||
|
||||
cat(" Found ", length(overlapping_tile_indices), " overlapping tiles out of ", N_TILES, "\n", sep = "")
|
||||
cat(" Reduction: ", N_TILES - length(overlapping_tile_indices), " empty tiles will NOT be created\n", sep = "")
|
||||
|
||||
# Create filtered grid with only overlapping tiles
|
||||
filtered_grid_sf <- master_grid_sf[overlapping_tile_indices, ]
|
||||
filtered_grid_sf$tile_id <- sprintf("%02d", overlapping_tile_indices)
|
||||
|
||||
# Convert to SpatVector for makeTiles
|
||||
filtered_grid_vect <- terra::vect(filtered_grid_sf)
|
||||
|
||||
|
|
@ -314,7 +380,7 @@ for (file_idx in seq_along(tiff_files)) {
|
|||
dir.create(date_output_folder, recursive = TRUE, showWarnings = FALSE)
|
||||
}
|
||||
|
||||
cat(" Creating ", length(overlapping_tile_indices), " tiles...\n", sep = "")
|
||||
cat(" Creating ", nrow(filtered_grid_sf), " tiles...\n", sep = "")
|
||||
|
||||
# Use makeTiles with FILTERED grid (only overlapping tiles)
|
||||
tiles_list <- terra::makeTiles(
|
||||
|
|
|
|||
|
|
@ -13,13 +13,13 @@
|
|||
#
|
||||
# Examples:
|
||||
# # Angata 8-band data (with UDM cloud masking)
|
||||
# & 'C:\Program Files\R\R-4.4.3\bin\x64\Rscript' r_app/02_ci_extraction.R 2026-01-02 7 angata merged_tif_8b
|
||||
# & 'C:\Program Files\R\R-4.4.3\bin\x64\Rscript' r_app/20_ci_extraction.R 2026-01-02 7 angata merged_tif_8b
|
||||
#
|
||||
# # Aura 4-band data
|
||||
# Rscript 02_ci_extraction.R 2025-11-26 7 aura merged_tif
|
||||
# Rscript 20_ci_extraction.R 2025-11-26 7 aura merged_tif
|
||||
#
|
||||
# # Auto-detects and uses tiles if available:
|
||||
# Rscript 02_ci_extraction.R 2026-01-02 7 angata (uses tiles if daily_tiles_split/ exists)
|
||||
# Rscript 20_ci_extraction.R 2026-01-02 7 angata (uses tiles if daily_tiles_split/ exists)
|
||||
|
||||
# 1. Load required packages
|
||||
# -----------------------
|
||||
|
|
|
|||
|
|
@ -8,7 +8,7 @@
|
|||
#
|
||||
# Usage: Rscript interpolate_growth_model.R [project_dir]
|
||||
# - project_dir: Project directory name (e.g., "chemba")
|
||||
# & 'C:\Program Files\R\R-4.4.3\bin\x64\Rscript' r_app/03_interpolate_growth_model.R angata
|
||||
# & 'C:\Program Files\R\R-4.4.3\bin\x64\Rscript' r_app/30_interpolate_growth_model.R angata
|
||||
|
||||
# 1. Load required packages
|
||||
# -----------------------
|
||||
|
|
|
|||
|
|
@ -14,8 +14,8 @@
|
|||
# - tile_size: Tile size in km (default: 5, only used if use_tiles=TRUE)
|
||||
#
|
||||
# Examples:
|
||||
# Rscript 04_mosaic_creation.R 2025-12-21 7 angata
|
||||
# Rscript 04_mosaic_creation.R 2025-12-21 7 angata week_51.tif TRUE 5 [tile-based]
|
||||
|
||||
# & "C:\Program Files\R\R-4.4.3\bin\x64\Rscript.exe" r_app/40_mosaic_creation.R 2026-01-12 7 angata
|
||||
#
|
||||
|
||||
# 1. Load required packages
|
||||
|
|
@ -54,14 +54,12 @@ main <- function() {
|
|||
if (is.na(end_date)) {
|
||||
message("Invalid end_date provided. Using current date.")
|
||||
end_date <- Sys.Date()
|
||||
end_date <- "2026-01-01" # Default date for testing
|
||||
}
|
||||
} else if (exists("end_date_str", envir = .GlobalEnv)) {
|
||||
end_date <- as.Date(get("end_date_str", envir = .GlobalEnv))
|
||||
} else {
|
||||
# Default to current date if no argument is provided
|
||||
end_date <- Sys.Date()
|
||||
end_date <- "2026-01-01" # Default date for testing
|
||||
message("No end_date provided. Using current date: ", format(end_date))
|
||||
}
|
||||
|
||||
|
|
@ -142,15 +140,35 @@ main <- function() {
|
|||
if (use_tile_mosaic) {
|
||||
# TILE-BASED APPROACH: Create per-tile weekly MAX mosaics
|
||||
# This is used for projects like Angata with large ROIs requiring spatial partitioning
|
||||
# Input data comes from merged_final_tif/{grid_size}/{DATE}/{DATE}_XX.tif (5-band tiles from script 20)
|
||||
tryCatch({
|
||||
safe_log("Starting per-tile mosaic creation (tile-based approach)...")
|
||||
|
||||
# Set output directory for per-tile mosaics
|
||||
tile_output_base <- file.path(laravel_storage, "weekly_tile_max")
|
||||
# Detect grid size from merged_final_tif folder structure
|
||||
# Expected: merged_final_tif/5x5/ or merged_final_tif/10x10/ etc.
|
||||
merged_final_base <- file.path(laravel_storage, "merged_final_tif")
|
||||
grid_subfolders <- list.dirs(merged_final_base, full.names = FALSE, recursive = FALSE)
|
||||
# Look for grid size patterns like "5x5", "10x10", "20x20"
|
||||
grid_patterns <- grep("^\\d+x\\d+$", grid_subfolders, value = TRUE)
|
||||
|
||||
if (length(grid_patterns) == 0) {
|
||||
stop("No grid size subfolder found in merged_final_tif/ (expected: 5x5, 10x10, etc.)")
|
||||
}
|
||||
|
||||
grid_size <- grid_patterns[1] # Use first grid size found
|
||||
safe_log(paste("Detected grid size:", grid_size))
|
||||
|
||||
# Point to the grid-specific merged_final_tif directory
|
||||
merged_final_with_grid <- file.path(merged_final_base, grid_size)
|
||||
|
||||
# Set output directory for per-tile mosaics, organized by grid size
|
||||
# Output: weekly_tile_max/{grid_size}/week_WW_YYYY_TT.tif
|
||||
tile_output_base <- file.path(laravel_storage, "weekly_tile_max", grid_size)
|
||||
dir.create(tile_output_base, recursive = TRUE, showWarnings = FALSE)
|
||||
|
||||
created_tile_files <- create_weekly_mosaic_from_tiles(
|
||||
dates = dates,
|
||||
merged_final_dir = merged_final,
|
||||
merged_final_dir = merged_final_with_grid,
|
||||
tile_output_dir = tile_output_base,
|
||||
field_boundaries = field_boundaries
|
||||
)
|
||||
|
|
|
|||
|
|
@ -354,18 +354,17 @@ create_mosaic <- function(tif_files, cloud_coverage_stats, field_boundaries = NU
|
|||
}
|
||||
|
||||
# Get filenames of best-coverage images
|
||||
# Match by extracting tile ID from both cloud stats and TIF filenames
|
||||
# Match by full filename from cloud stats to TIF files
|
||||
rasters_to_use <- character()
|
||||
for (idx in best_coverage) {
|
||||
# Extract tile ID from cloud_coverage_stats filename (e.g., "tile_18" → 18)
|
||||
# Get the full filename from cloud coverage stats
|
||||
cc_filename <- cloud_coverage_stats$filename[idx]
|
||||
cc_tile_id <- gsub(".*_([0-9]+).*", "\\1", cc_filename)
|
||||
|
||||
# Find matching TIF file by matching tile ID
|
||||
# Find matching TIF file by full filename
|
||||
matching_tif <- NULL
|
||||
for (tif_file in tif_files) {
|
||||
tif_tile_id <- gsub(".*_([0-9]+)\\.tif", "\\1", basename(tif_file))
|
||||
if (tif_tile_id == cc_tile_id) {
|
||||
tif_basename <- basename(tif_file)
|
||||
if (tif_basename == cc_filename) {
|
||||
matching_tif <- tif_file
|
||||
break
|
||||
}
|
||||
|
|
@ -373,6 +372,8 @@ create_mosaic <- function(tif_files, cloud_coverage_stats, field_boundaries = NU
|
|||
|
||||
if (!is.null(matching_tif)) {
|
||||
rasters_to_use <- c(rasters_to_use, matching_tif)
|
||||
} else {
|
||||
safe_log(paste("Warning: Could not find TIF file matching cloud stats entry:", cc_filename), "WARNING")
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -420,17 +421,37 @@ create_mosaic <- function(tif_files, cloud_coverage_stats, field_boundaries = NU
|
|||
mosaic <- tryCatch({
|
||||
safe_log(paste("Creating max composite from", length(all_rasters), "images to fill clouds"))
|
||||
|
||||
# Get extent from field boundaries if available, otherwise use raster intersection
|
||||
# Check if all rasters have identical grids (extent and resolution)
|
||||
# This is likely for per-tile mosaics from the same tiling scheme
|
||||
reference_raster <- all_rasters[[1]]
|
||||
ref_ext <- terra::ext(reference_raster)
|
||||
ref_res <- terra::res(reference_raster)
|
||||
|
||||
grids_match <- all(sapply(all_rasters[-1], function(r) {
|
||||
isTRUE(all.equal(terra::ext(r), ref_ext, tolerance = 1e-6)) &&
|
||||
isTRUE(all.equal(terra::res(r), ref_res, tolerance = 1e-6))
|
||||
}))
|
||||
|
||||
if (grids_match) {
|
||||
# All rasters have matching grids - no cropping/resampling needed!
|
||||
safe_log("All rasters have identical grids - stacking directly for max composite")
|
||||
raster_collection <- terra::sprc(all_rasters)
|
||||
max_mosaic <- terra::mosaic(raster_collection, fun = "max")
|
||||
} else {
|
||||
# Grids don't match - need to crop and resample
|
||||
safe_log("Rasters have different grids - cropping and resampling to common extent")
|
||||
|
||||
# Get extent from field boundaries if available, otherwise use raster union
|
||||
if (!is.null(field_boundaries)) {
|
||||
crop_extent <- terra::ext(field_boundaries)
|
||||
safe_log("Using field boundaries extent for consistent area across all dates")
|
||||
} else {
|
||||
# Fallback: use intersection of all raster extents
|
||||
# Use union of all extents (covers all data)
|
||||
crop_extent <- terra::ext(all_rasters[[1]])
|
||||
for (i in 2:length(all_rasters)) {
|
||||
crop_extent <- terra::intersect(crop_extent, terra::ext(all_rasters[[i]]))
|
||||
crop_extent <- terra::union(crop_extent, terra::ext(all_rasters[[i]]))
|
||||
}
|
||||
safe_log("Using raster intersection extent")
|
||||
safe_log("Using raster union extent")
|
||||
}
|
||||
|
||||
# Crop all rasters to common extent
|
||||
|
|
@ -439,22 +460,20 @@ create_mosaic <- function(tif_files, cloud_coverage_stats, field_boundaries = NU
|
|||
})
|
||||
|
||||
# Resample all cropped rasters to match the first one's grid
|
||||
# This handles pixel grid misalignment from Python's dynamic extent adjustment
|
||||
reference_grid <- cropped_rasters[[1]]
|
||||
safe_log("Resampling rasters to common grid for stacking")
|
||||
|
||||
aligned_rasters <- lapply(cropped_rasters, function(r) {
|
||||
if (identical(terra::ext(r), terra::ext(reference_grid)) &&
|
||||
identical(terra::res(r), terra::res(reference_grid))) {
|
||||
if (isTRUE(all.equal(terra::ext(r), terra::ext(reference_grid), tolerance = 1e-6)) &&
|
||||
isTRUE(all.equal(terra::res(r), terra::res(reference_grid), tolerance = 1e-6))) {
|
||||
return(r) # Already aligned
|
||||
}
|
||||
terra::resample(r, reference_grid, method = "near")
|
||||
})
|
||||
|
||||
# Create max composite using mosaic on aligned rasters
|
||||
# Resample ensures all rasters have matching grids (no resolution mismatch)
|
||||
raster_collection <- terra::sprc(aligned_rasters)
|
||||
max_mosaic <- terra::mosaic(raster_collection, fun = "max")
|
||||
}
|
||||
|
||||
max_mosaic
|
||||
}, error = function(e) {
|
||||
|
|
@ -686,6 +705,16 @@ create_weekly_mosaic_from_tiles <- function(dates, merged_final_dir, tile_output
|
|||
next
|
||||
}
|
||||
|
||||
# DEBUG: Check mosaic content before saving
|
||||
safe_log(paste(" DEBUG: Mosaic tile", tile_id, "dimensions:", nrow(tile_mosaic), "x", ncol(tile_mosaic)))
|
||||
safe_log(paste(" DEBUG: Mosaic tile", tile_id, "bands:", terra::nlyr(tile_mosaic)))
|
||||
|
||||
# Check first band values
|
||||
band1 <- tile_mosaic[[1]]
|
||||
band1_min <- terra::global(band1, fun = "min", na.rm = TRUE)$min
|
||||
band1_max <- terra::global(band1, fun = "max", na.rm = TRUE)$max
|
||||
safe_log(paste(" DEBUG: Band 1 MIN=", round(band1_min, 2), "MAX=", round(band1_max, 2)))
|
||||
|
||||
# Step 2c: Save this tile's weekly MAX mosaic
|
||||
# Filename format: week_WW_YYYY_TT.tif (e.g., week_02_2026_01.tif for week 2, 2026, tile 1)
|
||||
tile_filename <- paste0("week_", sprintf("%02d", dates$week), "_", dates$year, "_",
|
||||
|
|
@ -763,7 +792,7 @@ count_cloud_coverage_for_tile <- function(tile_files, field_boundaries = NULL) {
|
|||
missing_pct <- round(100 - ((total_notna / total_pixels) * 100))
|
||||
|
||||
aggregated_results[[idx]] <- data.frame(
|
||||
filename = paste0("tile_", sprintf("%02d", as.integer(gsub(".*_([0-9]+)\\.tif", "\\1", basename(tile_file))))),
|
||||
filename = basename(tile_file), # Keep full filename: 2026-01-07_03.tif
|
||||
notNA = total_notna,
|
||||
total_pixels = total_pixels,
|
||||
missing_pixels_percentage = missing_pct,
|
||||
|
|
|
|||
File diff suppressed because it is too large
Load diff
258
r_app/80_report_building_utils.R
Normal file
258
r_app/80_report_building_utils.R
Normal file
|
|
@ -0,0 +1,258 @@
|
|||
# 80_REPORT_BUILDING_UTILS.R
|
||||
# ============================================================================
|
||||
# UTILITY FUNCTIONS FOR REPORT GENERATION AND EXCEL/CSV EXPORT
|
||||
#
|
||||
# This file contains reusable functions for:
|
||||
# - Field analysis summary generation
|
||||
# - Excel/CSV/RDS export functionality
|
||||
# - Farm-level KPI aggregation and summary
|
||||
# - Tile-based KPI extraction (alternative calculation method)
|
||||
#
|
||||
# Used by: 80_calculate_kpis.R, run_full_pipeline.R, other reporting scripts
|
||||
# ============================================================================
|
||||
|
||||
# ============================================================================
|
||||
# SUMMARY GENERATION
|
||||
# ============================================================================
|
||||
|
||||
generate_field_analysis_summary <- function(field_df) {
|
||||
message("Generating summary statistics...")
|
||||
|
||||
total_acreage <- sum(field_df$Acreage, na.rm = TRUE)
|
||||
|
||||
germination_acreage <- sum(field_df$Acreage[field_df$Phase == "Germination"], na.rm = TRUE)
|
||||
tillering_acreage <- sum(field_df$Acreage[field_df$Phase == "Tillering"], na.rm = TRUE)
|
||||
grand_growth_acreage <- sum(field_df$Acreage[field_df$Phase == "Grand Growth"], na.rm = TRUE)
|
||||
maturation_acreage <- sum(field_df$Acreage[field_df$Phase == "Maturation"], na.rm = TRUE)
|
||||
unknown_phase_acreage <- sum(field_df$Acreage[field_df$Phase == "Unknown"], na.rm = TRUE)
|
||||
|
||||
harvest_ready_acreage <- sum(field_df$Acreage[field_df$Status_trigger == "harvest_ready"], na.rm = TRUE)
|
||||
stress_acreage <- sum(field_df$Acreage[field_df$Status_trigger == "stress_detected_whole_field"], na.rm = TRUE)
|
||||
recovery_acreage <- sum(field_df$Acreage[field_df$Status_trigger == "strong_recovery"], na.rm = TRUE)
|
||||
growth_on_track_acreage <- sum(field_df$Acreage[field_df$Status_trigger == "growth_on_track"], na.rm = TRUE)
|
||||
germination_complete_acreage <- sum(field_df$Acreage[field_df$Status_trigger == "germination_complete"], na.rm = TRUE)
|
||||
germination_started_acreage <- sum(field_df$Acreage[field_df$Status_trigger == "germination_started"], na.rm = TRUE)
|
||||
no_trigger_acreage <- sum(field_df$Acreage[is.na(field_df$Status_trigger)], na.rm = TRUE)
|
||||
|
||||
clear_fields <- sum(field_df$Cloud_category == "Clear view", na.rm = TRUE)
|
||||
partial_fields <- sum(field_df$Cloud_category == "Partial coverage", na.rm = TRUE)
|
||||
no_image_fields <- sum(field_df$Cloud_category == "No image available", na.rm = TRUE)
|
||||
total_fields <- nrow(field_df)
|
||||
|
||||
clear_acreage <- sum(field_df$Acreage[field_df$Cloud_category == "Clear view"], na.rm = TRUE)
|
||||
partial_acreage <- sum(field_df$Acreage[field_df$Cloud_category == "Partial coverage"], na.rm = TRUE)
|
||||
no_image_acreage <- sum(field_df$Acreage[field_df$Cloud_category == "No image available"], na.rm = TRUE)
|
||||
|
||||
summary_df <- data.frame(
|
||||
Category = c(
|
||||
"--- PHASE DISTRIBUTION ---",
|
||||
"Germination",
|
||||
"Tillering",
|
||||
"Grand Growth",
|
||||
"Maturation",
|
||||
"Unknown phase",
|
||||
"--- STATUS TRIGGERS ---",
|
||||
"Harvest ready",
|
||||
"Stress detected",
|
||||
"Strong recovery",
|
||||
"Growth on track",
|
||||
"Germination complete",
|
||||
"Germination started",
|
||||
"No trigger",
|
||||
"--- CLOUD COVERAGE (FIELDS) ---",
|
||||
"Clear view",
|
||||
"Partial coverage",
|
||||
"No image available",
|
||||
"--- CLOUD COVERAGE (ACREAGE) ---",
|
||||
"Clear view",
|
||||
"Partial coverage",
|
||||
"No image available",
|
||||
"--- TOTAL ---",
|
||||
"Total Acreage"
|
||||
),
|
||||
Acreage = c(
|
||||
NA,
|
||||
round(germination_acreage, 2),
|
||||
round(tillering_acreage, 2),
|
||||
round(grand_growth_acreage, 2),
|
||||
round(maturation_acreage, 2),
|
||||
round(unknown_phase_acreage, 2),
|
||||
NA,
|
||||
round(harvest_ready_acreage, 2),
|
||||
round(stress_acreage, 2),
|
||||
round(recovery_acreage, 2),
|
||||
round(growth_on_track_acreage, 2),
|
||||
round(germination_complete_acreage, 2),
|
||||
round(germination_started_acreage, 2),
|
||||
round(no_trigger_acreage, 2),
|
||||
NA,
|
||||
paste0(clear_fields, " fields"),
|
||||
paste0(partial_fields, " fields"),
|
||||
paste0(no_image_fields, " fields"),
|
||||
NA,
|
||||
round(clear_acreage, 2),
|
||||
round(partial_acreage, 2),
|
||||
round(no_image_acreage, 2),
|
||||
NA,
|
||||
round(total_acreage, 2)
|
||||
),
|
||||
stringsAsFactors = FALSE
|
||||
)
|
||||
|
||||
return(summary_df)
|
||||
}
|
||||
|
||||
# ============================================================================
|
||||
# EXPORT FUNCTIONS
|
||||
# ============================================================================
|
||||
|
||||
export_field_analysis_excel <- function(field_df, summary_df, project_dir, current_week, year, reports_dir) {
|
||||
message("Exporting per-field analysis to Excel, CSV, and RDS...")
|
||||
|
||||
field_df_rounded <- field_df %>%
|
||||
mutate(across(where(is.numeric), ~ round(., 2)))
|
||||
|
||||
# Handle NULL summary_df
|
||||
summary_df_rounded <- if (!is.null(summary_df)) {
|
||||
summary_df %>%
|
||||
mutate(across(where(is.numeric), ~ round(., 2)))
|
||||
} else {
|
||||
NULL
|
||||
}
|
||||
|
||||
output_subdir <- file.path(reports_dir, "kpis", "field_analysis")
|
||||
if (!dir.exists(output_subdir)) {
|
||||
dir.create(output_subdir, recursive = TRUE)
|
||||
}
|
||||
|
||||
excel_filename <- paste0(project_dir, "_field_analysis_week", sprintf("%02d_%d", current_week, year), ".xlsx")
|
||||
excel_path <- file.path(output_subdir, excel_filename)
|
||||
excel_path <- normalizePath(excel_path, winslash = "\\", mustWork = FALSE)
|
||||
|
||||
# Build sheets list dynamically
|
||||
sheets <- list(
|
||||
"Field Data" = field_df_rounded
|
||||
)
|
||||
if (!is.null(summary_df_rounded)) {
|
||||
sheets[["Summary"]] <- summary_df_rounded
|
||||
}
|
||||
|
||||
write_xlsx(sheets, excel_path)
|
||||
message(paste("✓ Field analysis Excel exported to:", excel_path))
|
||||
|
||||
kpi_data <- list(
|
||||
field_analysis = field_df_rounded,
|
||||
field_analysis_summary = summary_df_rounded,
|
||||
metadata = list(
|
||||
current_week = current_week,
|
||||
year = year,
|
||||
project = project_dir,
|
||||
created_at = Sys.time()
|
||||
)
|
||||
)
|
||||
|
||||
rds_filename <- paste0(project_dir, "_kpi_summary_tables_week", sprintf("%02d_%d", current_week, year), ".rds")
|
||||
rds_path <- file.path(reports_dir, "kpis", rds_filename)
|
||||
|
||||
saveRDS(kpi_data, rds_path)
|
||||
message(paste("✓ Field analysis RDS exported to:", rds_path))
|
||||
|
||||
csv_filename <- paste0(project_dir, "_field_analysis_week", sprintf("%02d_%d", current_week, year), ".csv")
|
||||
csv_path <- file.path(output_subdir, csv_filename)
|
||||
write_csv(field_df_rounded, csv_path)
|
||||
message(paste("✓ Field analysis CSV exported to:", csv_path))
|
||||
|
||||
return(list(excel = excel_path, rds = rds_path, csv = csv_path))
|
||||
}
|
||||
|
||||
# ============================================================================
|
||||
# TILE-BASED KPI EXTRACTION (Alternative calculation method)
|
||||
# ============================================================================
|
||||
|
||||
# [COMMENTED OUT / UNUSED - kept for reference]
|
||||
# These functions provide tile-based extraction as an alternative to field_statistics approach
|
||||
# Currently replaced by calculate_field_statistics() in 80_weekly_stats_utils.R
|
||||
# Uncomment if parallel processing of tiles is needed in future
|
||||
|
||||
# calculate_field_kpis_from_tiles <- function(tile_dir, week_num, year, field_boundaries_sf, tile_grid) {
|
||||
# message("Calculating field-level KPI statistics from tiles...")
|
||||
#
|
||||
# tile_pattern <- sprintf("week_%02d_%d_([0-9]{2})\\.tif", week_num, year)
|
||||
# tile_files <- list.files(tile_dir, pattern = tile_pattern, full.names = TRUE)
|
||||
#
|
||||
# if (length(tile_files) == 0) {
|
||||
# message("No tiles found for week", week_num, year)
|
||||
# return(NULL)
|
||||
# }
|
||||
#
|
||||
# message(paste("Processing", length(tile_files), "tiles in parallel..."))
|
||||
#
|
||||
# field_kpi_list <- furrr::future_map(
|
||||
# tile_files,
|
||||
# ~ process_single_kpi_tile(
|
||||
# tile_file = .,
|
||||
# field_boundaries_sf = field_boundaries_sf,
|
||||
# tile_grid = tile_grid
|
||||
# ),
|
||||
# .progress = TRUE,
|
||||
# .options = furrr::furrr_options(seed = TRUE)
|
||||
# )
|
||||
#
|
||||
# field_kpi_stats <- dplyr::bind_rows(field_kpi_list)
|
||||
#
|
||||
# if (nrow(field_kpi_stats) == 0) {
|
||||
# message(" No KPI data extracted from tiles")
|
||||
# return(NULL)
|
||||
# }
|
||||
#
|
||||
# message(paste(" Extracted KPI stats for", length(unique(field_kpi_stats$field)), "unique fields"))
|
||||
# return(field_kpi_stats)
|
||||
# }
|
||||
|
||||
# process_single_kpi_tile <- function(tile_file, field_boundaries_sf, tile_grid) {
|
||||
# # Helper function for calculate_field_kpis_from_tiles
|
||||
# tryCatch({
|
||||
# tile_basename <- basename(tile_file)
|
||||
# tile_raster <- terra::rast(tile_file)
|
||||
# ci_band <- tile_raster[[1]]
|
||||
#
|
||||
# field_bbox <- sf::st_bbox(field_boundaries_sf)
|
||||
# ci_cropped <- terra::crop(ci_band, terra::ext(field_bbox), snap = "out")
|
||||
#
|
||||
# extracted_vals <- terra::extract(ci_cropped, field_boundaries_sf, fun = "mean", na.rm = TRUE)
|
||||
#
|
||||
# tile_results <- data.frame()
|
||||
# tile_id_match <- as.numeric(sub(".*_(\\d{2})\\.tif$", "\\1", tile_basename))
|
||||
#
|
||||
# for (field_idx in seq_len(nrow(field_boundaries_sf))) {
|
||||
# field_id <- field_boundaries_sf$field[field_idx]
|
||||
# mean_ci <- extracted_vals[field_idx, 2]
|
||||
#
|
||||
# if (is.na(mean_ci)) {
|
||||
# next
|
||||
# }
|
||||
#
|
||||
# tile_results <- rbind(tile_results, data.frame(
|
||||
# field = field_id,
|
||||
# tile_id = tile_id_match,
|
||||
# tile_file = tile_basename,
|
||||
# mean_ci = round(mean_ci, 4),
|
||||
# stringsAsFactors = FALSE
|
||||
# ))
|
||||
# }
|
||||
#
|
||||
# return(tile_results)
|
||||
#
|
||||
# }, error = function(e) {
|
||||
# message(paste(" Warning: Error processing tile", basename(tile_file), ":", e$message))
|
||||
# return(data.frame())
|
||||
# })
|
||||
# }
|
||||
|
||||
# calculate_and_export_farm_kpis <- function(report_date, project_dir, field_boundaries_sf,
|
||||
# harvesting_data, cumulative_CI_vals_dir,
|
||||
# weekly_CI_mosaic, reports_dir, current_week, year,
|
||||
# tile_grid, use_tile_mosaic = FALSE, tile_grid_size = "5x5") {
|
||||
# # Farm-level KPI calculation using tile-based extraction (alternative approach)
|
||||
# # [Implementation kept as reference for alternative calculation method]
|
||||
# }
|
||||
941
r_app/80_weekly_stats_utils.R
Normal file
941
r_app/80_weekly_stats_utils.R
Normal file
|
|
@ -0,0 +1,941 @@
|
|||
# 80_WEEKLY_STATS_UTILS.R
|
||||
# ============================================================================
|
||||
# UTILITY FUNCTIONS FOR WEEKLY STATISTICS CALCULATION
|
||||
#
|
||||
# This file contains reusable functions for:
|
||||
# - Tile grid management
|
||||
# - Tile loading and merging
|
||||
# - Field-level statistics calculation from CI rasters
|
||||
# - Weekly stats caching (RDS/CSV export/import)
|
||||
# - KPI trend calculations
|
||||
# - Historical data loading and auto-generation from mosaics
|
||||
#
|
||||
# Used by: 80_calculate_kpis.R, run_full_pipeline.R, other reporting scripts
|
||||
# ============================================================================
|
||||
|
||||
# ============================================================================
|
||||
# TILE-AWARE HELPER FUNCTIONS
|
||||
# ============================================================================
|
||||
|
||||
get_tile_ids_for_field <- function(field_geom, tile_grid, field_id = NULL) {
  # Return the numeric ids of every tile whose bounding box overlaps the
  # bounding box of `field_geom` (axis-aligned rectangle intersection).
  #
  # Args:
  #   field_geom: sf object or terra SpatVector
  #   tile_grid:  data.frame with columns id, xmin, xmax, ymin, ymax
  #   field_id:   unused; kept for call-site compatibility
  if (inherits(field_geom, "sf")) {
    bb <- sf::st_bbox(field_geom)
    xmin <- bb["xmin"]; xmax <- bb["xmax"]
    ymin <- bb["ymin"]; ymax <- bb["ymax"]
  } else if (inherits(field_geom, "SpatVector")) {
    bb <- terra::ext(field_geom)
    xmin <- bb$xmin; xmax <- bb$xmax
    ymin <- bb$ymin; ymax <- bb$ymax
  } else {
    stop("field_geom must be sf or terra::vect object")
  }

  # Two rectangles overlap unless one lies entirely to one side of the other.
  overlaps <- !(tile_grid$xmax < xmin |
                tile_grid$xmin > xmax |
                tile_grid$ymax < ymin |
                tile_grid$ymin > ymax)

  as.numeric(tile_grid$id[overlaps])
}
|
||||
|
||||
load_tiles_for_field <- function(field_geom, tile_ids, week_num, year, mosaic_dir) {
  # Load band 5 of each requested weekly tile and merge the loaded pieces
  # into a single raster.
  #
  # Args:
  #   field_geom: field geometry (not used in this function; kept for API symmetry)
  #   tile_ids:   numeric tile ids (typically from get_tile_ids_for_field)
  #   week_num, year: identify the weekly mosaic (files named week_WW_YYYY_TT.tif)
  #   mosaic_dir: directory containing the tile files
  #
  # Returns: a SpatRaster (single tile, or a max-mosaic when several tiles
  #          loaded), or NULL when nothing could be loaded.
  if (length(tile_ids) == 0) {
    return(NULL)
  }

  tiles_list <- list()
  for (tile_id in sort(tile_ids)) {
    tile_filename <- sprintf("week_%02d_%d_%02d.tif", week_num, year, tile_id)
    tile_path <- file.path(mosaic_dir, tile_filename)

    if (file.exists(tile_path)) {
      tryCatch({
        tile_rast <- terra::rast(tile_path)
        # Band 5 is taken as the CI band -- assumption from the fixed index;
        # TODO confirm the tile band order stays stable.
        ci_band <- terra::subset(tile_rast, 5)
        tiles_list[[length(tiles_list) + 1]] <- ci_band
      }, error = function(e) {
        message(paste(" Warning: Could not load tile", tile_id, ":", e$message))
      })
    }
  }

  if (length(tiles_list) == 0) {
    return(NULL)
  }

  if (length(tiles_list) == 1) {
    return(tiles_list[[1]])
  } else {
    tryCatch({
      rsrc <- terra::sprc(tiles_list)
      # Overlapping pixels resolve to the maximum value across tiles.
      merged <- terra::mosaic(rsrc, fun = "max")
      return(merged)
    }, error = function(e) {
      message(paste(" Warning: Could not merge tiles:", e$message))
      # Best-effort fallback: return the first tile rather than failing.
      return(tiles_list[[1]])
    })
  }
}
|
||||
|
||||
build_tile_grid <- function(mosaic_dir, week_num, year) {
  # Build a tile-extent lookup table for one weekly mosaic.
  #
  # Scans `mosaic_dir` (descending into a grid-size subdirectory such as
  # "5x5" when one exists) for files named week_WW_YYYY_TT.tif, reads each
  # tile's spatial extent, and returns:
  #   list(tile_grid  = data.frame(id, xmin, xmax, ymin, ymax),
  #        mosaic_dir = possibly adjusted directory,
  #        grid_size  = detected "NxN" string or NA)
  #
  # Raises an error when no tile files are found or no extent can be read.

  # Handle grid-size subdirectories (e.g., weekly_tile_max/5x5/)
  detected_grid_size <- NA
  if (dir.exists(mosaic_dir)) {
    subfolders <- list.dirs(mosaic_dir, full.names = FALSE, recursive = FALSE)
    grid_patterns <- grep("^\\d+x\\d+$", subfolders, value = TRUE)

    if (length(grid_patterns) > 0) {
      detected_grid_size <- grid_patterns[1]
      mosaic_dir <- file.path(mosaic_dir, detected_grid_size)
      message(paste(" Using grid-size subdirectory:", detected_grid_size))
    }
  }

  tile_pattern <- sprintf("week_%02d_%d_([0-9]{2})\\.tif", week_num, year)
  tile_files <- list.files(mosaic_dir, pattern = tile_pattern, full.names = TRUE)

  if (length(tile_files) == 0) {
    stop(paste("No tile files found for week", week_num, year, "in", mosaic_dir))
  }

  tile_grid <- data.frame(
    id = integer(),
    xmin = numeric(),
    xmax = numeric(),
    ymin = numeric(),
    ymax = numeric(),
    stringsAsFactors = FALSE
  )

  for (tile_file in tile_files) {
    tryCatch({
      matches <- regmatches(basename(tile_file), regexpr("_([0-9]{2})\\.tif$", basename(tile_file)))
      if (length(matches) > 0) {
        # BUG FIX: sub() replaces only the FIRST occurrence of the alternation,
        # turning "_NN.tif" into "NN.tif" and making as.integer() return NA for
        # every tile id. gsub() removes both the leading "_" and the ".tif".
        tile_id <- as.integer(gsub("_|\\.tif", "", matches[1]))
        tile_rast <- terra::rast(tile_file)
        tile_ext <- terra::ext(tile_rast)
        tile_grid <- rbind(tile_grid, data.frame(
          id = tile_id,
          xmin = tile_ext$xmin,
          xmax = tile_ext$xmax,
          ymin = tile_ext$ymin,
          ymax = tile_ext$ymax,
          stringsAsFactors = FALSE
        ))
      }
    }, error = function(e) {
      message(paste(" Warning: Could not process tile", basename(tile_file), ":", e$message))
    })
  }

  if (nrow(tile_grid) == 0) {
    stop("Could not extract extents from any tile files")
  }

  return(list(
    tile_grid = tile_grid,
    mosaic_dir = mosaic_dir,
    grid_size = detected_grid_size
  ))
}
|
||||
|
||||
# ============================================================================
|
||||
# STATISTICAL CATEGORIZATION FUNCTIONS
|
||||
# ============================================================================
|
||||
|
||||
categorize_four_week_trend <- function(ci_values_list) {
  # Classify the average week-over-week CI change across the supplied weekly
  # means into a verbal trend label. Thresholds are the project-level
  # FOUR_WEEK_TREND_* constants. NA when fewer than two usable values exist.
  if (is.null(ci_values_list) || length(ci_values_list) < 2) {
    return(NA_character_)
  }

  observed <- ci_values_list[!is.na(ci_values_list)]
  if (length(observed) < 2) {
    return(NA_character_)
  }

  # Mean of the successive weekly differences.
  mean_step <- mean(diff(observed), na.rm = TRUE)

  if (mean_step >= FOUR_WEEK_TREND_STRONG_GROWTH_MIN) {
    "strong growth"
  } else if (mean_step >= FOUR_WEEK_TREND_GROWTH_MIN &&
             mean_step < FOUR_WEEK_TREND_GROWTH_MAX) {
    "growth"
  } else if (abs(mean_step) <= FOUR_WEEK_TREND_NO_GROWTH_RANGE) {
    "no growth"
  } else if (mean_step <= FOUR_WEEK_TREND_DECLINE_MIN &&
             mean_step > FOUR_WEEK_TREND_STRONG_DECLINE_MAX) {
    "decline"
  } else if (mean_step < FOUR_WEEK_TREND_STRONG_DECLINE_MAX) {
    "strong decline"
  } else {
    # Falls in a gap between the configured bands.
    "no growth"
  }
}
|
||||
|
||||
round_cloud_to_intervals <- function(cloud_pct_clear) {
  # Map a clear-sky percentage to its reporting bucket label
  # ("0-10%" ... "90-95%", "95-100%"); NA input yields NA.
  if (is.na(cloud_pct_clear)) {
    return(NA_character_)
  }

  breaks <- c(10, 20, 30, 40, 50, 60, 70, 80, 90, 95)
  labels <- c("0-10%", "10-20%", "20-30%", "30-40%", "40-50%",
              "50-60%", "60-70%", "70-80%", "80-90%", "90-95%", "95-100%")

  # findInterval counts how many break points lie at or below the value,
  # which is exactly the index offset of the matching label.
  labels[findInterval(cloud_pct_clear, breaks) + 1]
}
|
||||
|
||||
get_ci_percentiles <- function(ci_values) {
  # Format the low/high CI percentiles (project constants CI_PERCENTILE_LOW
  # and CI_PERCENTILE_HIGH) of the non-NA values as "lo-hi"; NA when empty.
  vals <- ci_values[!is.na(ci_values)]
  if (length(vals) == 0) {
    return(NA_character_)
  }

  qs <- quantile(vals, c(CI_PERCENTILE_LOW, CI_PERCENTILE_HIGH), na.rm = TRUE)
  sprintf("%.1f-%.1f", qs[[1]], qs[[2]])
}
|
||||
|
||||
calculate_cv_trend <- function(cv_current, cv_previous) {
  # Week-over-week CV delta rounded to 4 decimals; NA when either side is NA.
  if (anyNA(c(cv_current, cv_previous))) {
    return(NA_real_)
  }
  round(cv_current - cv_previous, 4)
}
|
||||
|
||||
calculate_four_week_trend <- function(mean_ci_values) {
  #' Four-week CI trend: change from the earliest to the latest available
  #' (non-NA) weekly mean, rounded to 2 decimals. NA when fewer than two
  #' usable values exist.
  obs <- mean_ci_values[!is.na(mean_ci_values)]
  if (length(obs) < 2) {
    return(NA_real_)
  }
  round(obs[length(obs)] - obs[1], 2)
}
|
||||
|
||||
categorize_cv_slope <- function(slope) {
  #' Translate the 8-week CV regression slope into a field-uniformity label.
  #' Thresholds are the project-level CV_SLOPE_* constants.
  if (is.na(slope)) {
    return(NA_character_)
  }

  # NOTE(review): the original had two adjacent branches that both returned
  # "Homogenous growth"; the first may have been meant as a distinct
  # "improving" label -- confirm. Merged here with identical behavior.
  if (slope <= CV_SLOPE_IMPROVEMENT_MIN) {
    "Excellent uniformity"
  } else if (slope < CV_SLOPE_HOMOGENOUS_MIN || slope <= CV_SLOPE_HOMOGENOUS_MAX) {
    "Homogenous growth"
  } else if (slope <= CV_SLOPE_PATCHINESS_MAX) {
    "Minor patchiness"
  } else {
    "Severe fragmentation"
  }
}
|
||||
|
||||
calculate_cv_trend_long_term <- function(cv_values) {
  #' Long-term (up to 8-week) CV trend: slope of a least-squares line fitted
  #' through the non-NA CV values, rounded to 4 decimals. NA when fewer than
  #' two usable values exist or the fit fails.
  obs <- cv_values[!is.na(cv_values)]
  if (length(obs) < 2) {
    return(NA_real_)
  }

  idx <- seq_along(obs)
  tryCatch({
    fit <- lm(obs ~ idx)
    round(unname(coef(fit)[2]), 4)
  }, error = function(e) NA_real_)
}
|
||||
|
||||
# ============================================================================
|
||||
# HELPER FUNCTIONS
|
||||
# ============================================================================
|
||||
|
||||
get_phase_by_age <- function(age_weeks) {
  # Look up the growth phase whose inclusive [age_start, age_end] window
  # contains age_weeks (rows of the project-level PHASE_DEFINITIONS table).
  # Returns "Unknown" when no window matches, NA for NA input.
  if (is.na(age_weeks)) return(NA_character_)

  hit <- which(PHASE_DEFINITIONS$age_start <= age_weeks &
               age_weeks <= PHASE_DEFINITIONS$age_end)
  if (length(hit) > 0) {
    # First matching row wins, as in a sequential scan.
    return(PHASE_DEFINITIONS$phase[hit[1]])
  }
  "Unknown"
}
|
||||
|
||||
get_status_trigger <- function(ci_values, ci_change, age_weeks) {
  # Derive a status-trigger label from the field's CI pixel distribution,
  # its week-over-week CI change, and its crop age in weeks.
  # Returns NA when no rule fires or inputs are unusable.
  if (is.na(age_weeks) || length(ci_values) == 0) return(NA_character_)

  pixels <- ci_values[!is.na(ci_values)]
  if (length(pixels) == 0) return(NA_character_)

  n <- length(pixels)
  share_above_2 <- 100 * sum(pixels > 2) / n
  share_at_least_2 <- 100 * sum(pixels >= 2) / n
  avg_ci <- mean(pixels, na.rm = TRUE)
  # NOTE(review): sd() is NA for a single pixel, which would make the stress
  # check below evaluate if(NA) and error -- presumably fields always cover
  # more than one pixel; confirm.
  cv_ci <- if (avg_ci > 0) sd(pixels) / avg_ci else 0

  # Germination window (weeks 0-6).
  if (age_weeks >= 0 && age_weeks <= 6) {
    if (share_at_least_2 >= 70) return("germination_complete")
    if (share_above_2 > 10) return("germination_started")
  }

  if (age_weeks >= 45) return("harvest_ready")

  has_change <- !is.na(ci_change)

  # Sharp uniform drop across a low-variance field.
  if (age_weeks > 6 && has_change && ci_change < -1.5 && cv_ci < 0.25) {
    return("stress_detected_whole_field")
  }
  if (age_weeks > 6 && has_change && ci_change > 1.5) {
    return("strong_recovery")
  }
  if (age_weeks >= 4 && age_weeks < 39 && has_change && ci_change > 0.2) {
    return("growth_on_track")
  }
  if (age_weeks >= 39 && age_weeks < 45 && avg_ci > 3.5) {
    return("maturation_progressing")
  }

  NA_character_
}
|
||||
|
||||
extract_planting_dates <- function(harvesting_data, field_boundaries_sf = NULL) {
  # Extract planting dates from harvest.xlsx (season_start column)
  # Returns: data.frame with columns (field_id, planting_date)
  #
  # For each field, only the most recent season is kept (rows are sorted by
  # descending season_start, then deduplicated on field). Fields without a
  # season_start are dropped. When no harvesting data exists, returns an
  # all-NA frame for the supplied field boundaries (or NULL without them).

  if (is.null(harvesting_data) || nrow(harvesting_data) == 0) {
    message("Warning: No harvesting data available - planting dates will be NA.")
    if (!is.null(field_boundaries_sf)) {
      # Preserve the caller's field list so downstream joins still work.
      return(data.frame(
        field_id = field_boundaries_sf$field,
        planting_date = rep(as.Date(NA), nrow(field_boundaries_sf)),
        stringsAsFactors = FALSE
      ))
    }
    return(NULL)
  }

  tryCatch({
    planting_dates <- harvesting_data %>%
      arrange(field, desc(season_start)) %>%
      # keep only the latest season per field
      distinct(field, .keep_all = TRUE) %>%
      select(field, season_start) %>%
      rename(field_id = field, planting_date = season_start) %>%
      filter(!is.na(planting_date)) %>%
      as.data.frame()

    message(paste("Extracted planting dates for", nrow(planting_dates), "fields from harvest.xlsx"))
    return(planting_dates)
  }, error = function(e) {
    # Best-effort: a malformed harvest table yields NULL rather than aborting.
    message(paste("Error extracting planting dates:", e$message))
    return(NULL)
  })
}
|
||||
|
||||
# ============================================================================
|
||||
# MODULAR STATISTICS CALCULATION
|
||||
# ============================================================================
|
||||
|
||||
calculate_field_statistics <- function(field_boundaries_sf, week_num, year,
                                       mosaic_dir, report_date = Sys.Date()) {
  # Compute per-field CI statistics for one week directly from the weekly
  # tile rasters (files named week_WW_YYYY_TT.tif in `mosaic_dir`).
  #
  # Args:
  #   field_boundaries_sf: sf data.frame of field polygons with a `field` id column
  #   week_num, year:      identify the weekly mosaic
  #   mosaic_dir:          directory holding the tile files
  #   report_date:         unused here; kept for call-site compatibility
  #
  # Returns: data.frame with one row per field. The first tile containing a
  #          field wins; later tiles covering the same field are ignored.
  # Raises:  error when no tile files exist or no field could be processed.

  message(paste("Calculating statistics for all fields - Week", week_num, year))

  tile_pattern <- sprintf("week_%02d_%d_([0-9]{2})\\.tif", week_num, year)
  tile_files <- list.files(mosaic_dir, pattern = tile_pattern, full.names = TRUE)

  if (length(tile_files) == 0) {
    stop(paste("No tile files found for week", week_num, year, "in", mosaic_dir))
  }

  message(paste(" Found", length(tile_files), "tiles for week", week_num))

  results_list <- list()
  fields_processed <- 0

  for (tile_idx in seq_along(tile_files)) {
    tile_file <- tile_files[tile_idx]

    tryCatch({
      current_rast <- terra::rast(tile_file)
      ci_band <- current_rast[["CI"]]

      if (is.null(ci_band) || !inherits(ci_band, "SpatRaster")) {
        # BUG FIX: the original used `return(NULL)` here. In R, return()
        # propagates through tryCatch() and exits calculate_field_statistics
        # itself, so ONE tile without a CI band silently aborted all remaining
        # tiles and handed the caller NULL. Such a tile is now merely skipped.
        message(paste(" [SKIP] Tile", basename(tile_file), "- CI band not found"))
      } else {
        # na.rm = FALSE keeps NA pixels so cloud coverage can be derived from
        # the valid/total pixel ratio below.
        extracted <- terra::extract(ci_band, field_boundaries_sf, na.rm = FALSE)
        unique_field_ids <- unique(extracted$ID[!is.na(extracted$ID)])

        for (field_poly_idx in unique_field_ids) {
          field_id <- field_boundaries_sf$field[field_poly_idx]
          ci_vals <- extracted$CI[extracted$ID == field_poly_idx]
          ci_vals <- ci_vals[!is.na(ci_vals)]

          if (length(ci_vals) == 0) {
            next
          }

          mean_ci <- mean(ci_vals, na.rm = TRUE)
          ci_std <- sd(ci_vals, na.rm = TRUE)
          cv <- if (mean_ci > 0) ci_std / mean_ci else NA_real_
          range_min <- min(ci_vals, na.rm = TRUE)
          range_max <- max(ci_vals, na.rm = TRUE)
          range_str <- sprintf("%.1f-%.1f", range_min, range_max)
          ci_percentiles_str <- get_ci_percentiles(ci_vals)

          # Count pixels with CI >= 2 (germination threshold)
          GERMINATION_CI_THRESHOLD <- 2.0
          num_pixels_gte_2 <- sum(ci_vals >= GERMINATION_CI_THRESHOLD, na.rm = TRUE)
          num_pixels_total <- length(ci_vals)
          pct_pixels_gte_2 <- if (num_pixels_total > 0) round((num_pixels_gte_2 / num_pixels_total) * 100, 1) else 0

          # Cloud coverage from the share of non-NA pixels inside the field.
          field_rows <- extracted[extracted$ID == field_poly_idx, ]
          num_total <- nrow(field_rows)
          num_data <- sum(!is.na(field_rows$CI))
          pct_clear <- if (num_total > 0) round((num_data / num_total) * 100, 1) else 0
          cloud_cat <- if (num_data == 0) "No image available"
                       else if (pct_clear >= 95) "Clear view"
                       else "Partial coverage"

          # Age_week and Phase are now calculated in main script using actual planting dates
          # Germination_progress is calculated in main script after Age_week is known

          # First tile that contains the field wins; skip duplicates.
          existing_idx <- which(sapply(results_list, function(x) x$Field_id) == field_id)

          if (length(existing_idx) > 0) {
            next
          }

          results_list[[length(results_list) + 1]] <- data.frame(
            Field_id = field_id,
            Mean_CI = round(mean_ci, 2),
            CV = round(cv * 100, 2),
            CI_range = range_str,
            CI_Percentiles = ci_percentiles_str,
            Pct_pixels_CI_gte_2 = pct_pixels_gte_2,
            Cloud_pct_clear = pct_clear,
            Cloud_category = cloud_cat,
            stringsAsFactors = FALSE
          )

          fields_processed <- fields_processed + 1
        }

        message(paste(" Tile", tile_idx, "of", length(tile_files), "processed"))
      }
    }, error = function(e) {
      message(paste(" [ERROR] Tile", basename(tile_file), ":", e$message))
    })
  }

  if (length(results_list) == 0) {
    stop(paste("No fields processed successfully for week", week_num))
  }

  stats_df <- dplyr::bind_rows(results_list)
  message(paste(" ✓ Successfully calculated statistics for", nrow(stats_df), "fields"))

  return(stats_df)
}
|
||||
|
||||
# ============================================================================
|
||||
# CALCULATE KPI TRENDS
|
||||
# ============================================================================
|
||||
|
||||
calculate_kpi_trends <- function(current_stats, prev_stats = NULL,
                                 project_dir = NULL, reports_dir = NULL,
                                 current_week = NULL, year = NULL) {
  # Augment the current week's per-field stats with trend columns:
  #   Weekly_ci_change, CV_Trend_Short_Term (vs. previous week),
  #   Four_week_trend, CV_Trend_Long_Term (from cached weekly RDS files),
  #   nmr_of_weeks_analysed (carried forward from the last field_analysis CSV).
  # Returns `current_stats` with these columns filled where data allows;
  # all trend columns default to NA (count to 1) when history is missing.

  message("Calculating KPI trends from current and previous week data")

  current_stats$Weekly_ci_change <- NA_real_
  current_stats$CV_Trend_Short_Term <- NA_real_
  current_stats$Four_week_trend <- NA_real_
  current_stats$CV_Trend_Long_Term <- NA_real_
  current_stats$nmr_of_weeks_analysed <- 1L

  if (is.null(prev_stats) || nrow(prev_stats) == 0) {
    message(" No previous week data available - using defaults")
    return(current_stats)
  }

  message(paste(" prev_stats has", nrow(prev_stats), "rows and", ncol(prev_stats), "columns"))

  # Row index of each previous-week field, keyed by Field_id.
  # NOTE(review): this is a NAMED lookup; if Field_id is numeric rather than
  # character, `prev_lookup[field_id]` below indexes by POSITION, not by
  # name -- confirm Field_id is character.
  prev_lookup <- setNames(seq_len(nrow(prev_stats)), prev_stats$Field_id)

  prev_field_analysis <- NULL

  # Best-effort load of the most recent field_analysis CSV, used only to
  # carry forward the per-field nmr_of_weeks_analysed counter.
  tryCatch({
    analysis_dir <- file.path(reports_dir, "kpis", "field_analysis")
    if (dir.exists(analysis_dir)) {
      analysis_files <- list.files(analysis_dir, pattern = "_field_analysis_week.*\\.csv$", full.names = TRUE)
      if (length(analysis_files) > 0) {
        # Most recently modified file wins.
        recent_file <- analysis_files[which.max(file.info(analysis_files)$mtime)]
        prev_field_analysis <- readr::read_csv(recent_file, show_col_types = FALSE,
                                               col_select = c(Field_id, nmr_of_weeks_analysed, Phase))
      }
    }
  }, error = function(e) {
    message(paste(" Note: Could not load previous field_analysis for nmr_weeks tracking:", e$message))
  })

  if (!is.null(prev_field_analysis) && nrow(prev_field_analysis) > 0) {
    message(paste(" Using previous field_analysis to track nmr_of_weeks_analysed"))
  }

  historical_4weeks <- list()
  historical_8weeks <- list()

  # Load cached weekly stats RDS files for the 4-week CI trend and the
  # 8-week CV trend (week numbers wrap across the year boundary).
  if (!is.null(project_dir) && !is.null(reports_dir) && !is.null(current_week)) {
    message(" Loading historical field_stats for 4-week and 8-week trends...")

    for (lookback in 1:4) {
      target_week <- current_week - lookback
      target_year <- year
      if (target_week < 1) {
        target_week <- target_week + 52
        target_year <- target_year - 1
      }

      rds_filename <- sprintf("%s_field_stats_week%02d_%d.rds", project_dir, target_week, target_year)
      rds_path <- file.path(reports_dir, "kpis", "field_stats", rds_filename)

      if (file.exists(rds_path)) {
        tryCatch({
          stats_data <- readRDS(rds_path)
          historical_4weeks[[length(historical_4weeks) + 1]] <- list(
            week = target_week,
            stats = stats_data
          )
        }, error = function(e) {
          message(paste(" Warning: Could not load week", target_week, ":", e$message))
        })
      }
    }

    for (lookback in 1:8) {
      target_week <- current_week - lookback
      target_year <- year
      if (target_week < 1) {
        target_week <- target_week + 52
        target_year <- target_year - 1
      }

      rds_filename <- sprintf("%s_field_stats_week%02d_%d.rds", project_dir, target_week, target_year)
      rds_path <- file.path(reports_dir, "kpis", "field_stats", rds_filename)

      if (file.exists(rds_path)) {
        tryCatch({
          stats_data <- readRDS(rds_path)
          historical_8weeks[[length(historical_8weeks) + 1]] <- list(
            week = target_week,
            stats = stats_data
          )
        }, error = function(e) {
          # Silently skip
        })
      }
    }

    if (length(historical_4weeks) > 0) {
      message(paste(" Loaded", length(historical_4weeks), "weeks for 4-week trend"))
    }
    if (length(historical_8weeks) > 0) {
      message(paste(" Loaded", length(historical_8weeks), "weeks for 8-week CV trend"))
    }
  }

  cv_trends_calculated <- 0
  four_week_trends_calculated <- 0
  cv_long_term_calculated <- 0

  for (i in seq_len(nrow(current_stats))) {
    field_id <- current_stats$Field_id[i]
    prev_idx <- prev_lookup[field_id]

    # All trend columns stay NA for fields absent from the previous week.
    if (!is.na(prev_idx) && prev_idx > 0 && prev_idx <= nrow(prev_stats)) {
      prev_row <- prev_stats[prev_idx, , drop = FALSE]

      prev_ci <- prev_row$Mean_CI[1]
      if (!is.na(prev_ci) && !is.na(current_stats$Mean_CI[i])) {
        current_stats$Weekly_ci_change[i] <-
          round(current_stats$Mean_CI[i] - prev_ci, 2)
      }

      prev_cv <- prev_row$CV[1]
      if (!is.na(prev_cv) && !is.na(current_stats$CV[i])) {
        current_stats$CV_Trend_Short_Term[i] <-
          calculate_cv_trend(current_stats$CV[i], prev_cv)
        cv_trends_calculated <- cv_trends_calculated + 1
      }

      # 4-week CI trend: oldest-to-newest history plus the current value.
      if (length(historical_4weeks) > 0) {
        ci_values_4week <- numeric()

        # rev() walks from the oldest loaded week to the newest.
        for (hist_idx in rev(seq_along(historical_4weeks))) {
          hist_data <- historical_4weeks[[hist_idx]]$stats
          hist_field <- which(hist_data$Field_id == field_id)
          if (length(hist_field) > 0 && !is.na(hist_data$Mean_CI[hist_field[1]])) {
            ci_values_4week <- c(ci_values_4week, hist_data$Mean_CI[hist_field[1]])
          }
        }

        ci_values_4week <- c(ci_values_4week, current_stats$Mean_CI[i])

        if (length(ci_values_4week) >= 2) {
          current_stats$Four_week_trend[i] <- calculate_four_week_trend(ci_values_4week)
          four_week_trends_calculated <- four_week_trends_calculated + 1
        }
      }

      # 8-week CV trend: regression slope over the CV history plus current.
      if (length(historical_8weeks) > 0) {
        cv_values_8week <- numeric()

        for (hist_idx in rev(seq_along(historical_8weeks))) {
          hist_data <- historical_8weeks[[hist_idx]]$stats
          hist_field <- which(hist_data$Field_id == field_id)
          if (length(hist_field) > 0 && !is.na(hist_data$CV[hist_field[1]])) {
            cv_values_8week <- c(cv_values_8week, hist_data$CV[hist_field[1]])
          }
        }

        cv_values_8week <- c(cv_values_8week, current_stats$CV[i])

        if (length(cv_values_8week) >= 2) {
          slope <- calculate_cv_trend_long_term(cv_values_8week)
          current_stats$CV_Trend_Long_Term[i] <- slope
          cv_long_term_calculated <- cv_long_term_calculated + 1
        }
      }

      # Carry forward and increment the weeks-analysed counter.
      if (!is.null(prev_field_analysis) && nrow(prev_field_analysis) > 0) {
        prev_analysis_row <- prev_field_analysis %>%
          dplyr::filter(Field_id == field_id)

        if (nrow(prev_analysis_row) > 0) {
          prev_nmr_weeks_analysis <- prev_analysis_row$nmr_of_weeks_analysed[1]

          # Only increment nmr_of_weeks_analysed if we have previous data
          if (!is.na(prev_nmr_weeks_analysis)) {
            current_stats$nmr_of_weeks_analysed[i] <- prev_nmr_weeks_analysis + 1L
          } else {
            current_stats$nmr_of_weeks_analysed[i] <- 1L
          }
        }
      }
    }
  }

  message(paste(" ✓ Calculated CV_Trend_Short_Term:", cv_trends_calculated, "fields"))
  message(paste(" ✓ Calculated Four_week_trend:", four_week_trends_calculated, "fields"))
  message(paste(" ✓ Calculated CV_Trend_Long_Term:", cv_long_term_calculated, "fields"))
  return(current_stats)
}
|
||||
|
||||
# ============================================================================
|
||||
# LOAD OR CALCULATE WEEKLY STATISTICS
|
||||
# ============================================================================
|
||||
|
||||
load_or_calculate_weekly_stats <- function(week_num, year, project_dir, field_boundaries_sf,
                                           mosaic_dir, reports_dir, report_date = Sys.Date()) {
  # Return the per-field statistics for one week, using the cached RDS when
  # present; otherwise compute them from the tiles and persist both an RDS
  # and a CSV copy under <reports_dir>/kpis/field_stats/.
  stats_dir <- file.path(reports_dir, "kpis", "field_stats")
  base_name <- sprintf("%s_field_stats_week%02d_%d", project_dir, week_num, year)
  rds_path <- file.path(stats_dir, paste0(base_name, ".rds"))

  # Fast path: cached result from a previous run.
  if (file.exists(rds_path)) {
    message(paste("Loading cached statistics from:", basename(rds_path)))
    return(readRDS(rds_path))
  }

  message(paste("Cached RDS not found, calculating statistics from tiles for week", week_num))
  stats_df <- calculate_field_statistics(field_boundaries_sf, week_num, year,
                                         mosaic_dir, report_date)

  if (!dir.exists(stats_dir)) {
    dir.create(stats_dir, recursive = TRUE, showWarnings = FALSE)
  }

  saveRDS(stats_df, rds_path)
  message(paste("Saved weekly statistics RDS:", basename(rds_path)))

  # CSV twin of the cache, for manual inspection.
  csv_path <- file.path(stats_dir, paste0(base_name, ".csv"))
  readr::write_csv(stats_df, csv_path)
  message(paste("Saved weekly statistics CSV:", basename(csv_path)))

  stats_df
}
|
||||
|
||||
load_historical_field_data <- function(project_dir, current_week, reports_dir, num_weeks = 4, auto_generate = TRUE, field_boundaries_sf = NULL) {
  # Load up to `num_weeks` of per-field analysis CSVs (current week going
  # backwards, wrapping week numbers at the year boundary). When some weeks
  # are missing and auto_generate is TRUE, scan the weekly mosaic directory
  # (global `weekly_tile_max`, optionally its "5x5" subfolder) for the last
  # 8 calendar weeks and compute the missing stats from tiles.
  # Returns a list of list(week, [year,] data) entries, or NULL when nothing
  # could be loaded or generated.
  historical_data <- list()
  loaded_weeks <- c()
  missing_weeks <- c()

  for (lookback in 0:(num_weeks - 1)) {
    target_week <- current_week - lookback
    if (target_week < 1) target_week <- target_week + 52

    csv_filename <- paste0(project_dir, "_field_analysis_week", sprintf("%02d", target_week), ".csv")
    csv_path <- file.path(reports_dir, "kpis", "field_analysis", csv_filename)

    if (file.exists(csv_path)) {
      tryCatch({
        data <- read_csv(csv_path, show_col_types = FALSE)
        # NOTE(review): indexing by lookback+1 leaves NULL holes when an
        # earlier week is missing but a later one loads -- confirm downstream
        # consumers tolerate NULL entries.
        historical_data[[lookback + 1]] <- list(
          week = target_week,
          data = data
        )
        loaded_weeks <- c(loaded_weeks, target_week)
      }, error = function(e) {
        message(paste(" Warning: Could not load week", target_week, ":", e$message))
        # <<- needed: the handler runs in its own function environment.
        missing_weeks <<- c(missing_weeks, target_week)
      })
    } else {
      missing_weeks <- c(missing_weeks, target_week)
    }
  }

  if (length(missing_weeks) > 0 && auto_generate) {
    message(paste("⚠ Missing weeks:", paste(missing_weeks, collapse = ", ")))
    message("Scanning for ALL available weekly mosaics and calculating stats...\n")

    if (is.null(field_boundaries_sf)) {
      message(" Error: field_boundaries_sf not provided - cannot auto-generate")
      return(historical_data)
    }

    # `weekly_tile_max` is expected as a global path set by the pipeline.
    if (!exists("weekly_tile_max")) {
      message(" ✗ weekly_tile_max path not defined")
      return(historical_data)
    }

    # Prefer the 5x5 grid subfolder when it contains tiles.
    check_paths <- c(file.path(weekly_tile_max, "5x5"), weekly_tile_max)
    mosaic_scan_dir <- NA

    for (check_path in check_paths) {
      if (dir.exists(check_path)) {
        tif_files <- list.files(check_path, pattern = "week_.*\\.tif$", full.names = TRUE)
        if (length(tif_files) > 0) {
          mosaic_scan_dir <- check_path
          break
        }
      }
    }

    if (is.na(mosaic_scan_dir)) {
      message(" ✗ No mosaic files found in weekly_tile_max")
      return(historical_data)
    }

    # Build the set of ISO week/year pairs for the last 8 calendar weeks.
    weeks_to_load <- 8
    today <- Sys.Date()
    target_dates <- today - (0:(weeks_to_load - 1)) * 7

    expected_weeks <- data.frame(
      date = target_dates,
      week = as.numeric(format(target_dates, "%V")),
      year = as.numeric(format(target_dates, "%Y")),
      stringsAsFactors = FALSE
    )
    expected_weeks <- unique(expected_weeks)

    message(paste(" Expected weeks (last 8 from", format(today, "%Y-%m-%d"), "):"))
    for (i in seq_len(nrow(expected_weeks))) {
      message(paste(" Week", sprintf("%02d", expected_weeks$week[i]), expected_weeks$year[i]))
    }
    message("")

    tif_files <- list.files(mosaic_scan_dir, pattern = "week_([0-9]{2})_([0-9]{4})_[0-9]{2}\\.tif$",
                            full.names = FALSE)

    # Parse week/year out of each tile filename ("week_WW_YYYY...").
    available_weeks <- data.frame()
    for (filename in tif_files) {
      matches <- regmatches(filename, gregexpr("week_([0-9]{2})_([0-9]{4})", filename))[[1]]
      if (length(matches) > 0) {
        week_year <- strsplit(matches[1], "_")[[1]]
        if (length(week_year) == 3) {
          week_num <- as.numeric(week_year[2])
          year_num <- as.numeric(week_year[3])

          # NOTE(review): week and year membership are checked independently,
          # so a week/year COMBINATION outside the window could slip through;
          # the merge() against expected_weeks below filters those out.
          if (week_num %in% expected_weeks$week && year_num %in% expected_weeks$year) {
            available_weeks <- rbind(available_weeks,
                                     data.frame(week = week_num, year = year_num))
          }
        }
      }
    }

    available_weeks <- unique(available_weeks)
    available_weeks <- merge(available_weeks, expected_weeks[, c("week", "year", "date")], by = c("week", "year"))
    available_weeks <- available_weeks[order(available_weeks$date, decreasing = TRUE), ]

    if (nrow(available_weeks) == 0) {
      message(" ✗ No matching mosaic files found")
      message(paste(" Scanned directory:", mosaic_scan_dir))
      return(historical_data)
    }

    message(paste(" Found", nrow(available_weeks), "week(s) with available mosaics:"))

    for (i in seq_len(nrow(available_weeks))) {
      week_to_calc <- available_weeks$week[i]
      year_to_calc <- available_weeks$year[i]
      date_to_calc <- available_weeks$date[i]

      tile_pattern <- sprintf("week_%02d_%d_([0-9]{2})\\.tif", week_to_calc, year_to_calc)
      tile_files <- list.files(mosaic_scan_dir, pattern = tile_pattern, full.names = TRUE)

      if (length(tile_files) == 0) {
        message(paste(" ✗ Week", sprintf("%02d", week_to_calc), year_to_calc, "- no tiles found"))
        next
      }

      message(paste(" ✓ Week", sprintf("%02d", week_to_calc), year_to_calc, "-", length(tile_files), "mosaics"))

      tryCatch({
        week_stats <- load_or_calculate_weekly_stats(
          week_num = week_to_calc,
          year = year_to_calc,
          project_dir = project_dir,
          field_boundaries_sf = field_boundaries_sf,
          mosaic_dir = mosaic_scan_dir,
          reports_dir = reports_dir,
          report_date = date_to_calc
        )

        if (!is.null(week_stats) && nrow(week_stats) > 0) {
          message(paste(" ✓ Calculated stats for", nrow(week_stats), "fields"))

          # Auto-generated entries also carry a `year` element, unlike the
          # CSV-loaded entries above.
          historical_data[[length(historical_data) + 1]] <- list(
            week = week_to_calc,
            year = year_to_calc,
            data = week_stats
          )
          loaded_weeks <- c(loaded_weeks, paste0(week_to_calc, "_", year_to_calc))
        }
      }, error = function(e) {
        message(paste(" ✗ Error:", e$message))
      })
    }
  }

  if (length(historical_data) == 0) {
    message(paste("Error: No historical field data found and could not auto-generate weeks"))
    return(NULL)
  }

  message(paste("✓ Loaded", length(historical_data), "weeks of historical data:",
                paste(loaded_weeks, collapse = ", ")))

  return(historical_data)
}
|
||||
|
||||
# ============================================================================
|
||||
# HELPER: Extract field-level statistics from CI raster
|
||||
# ============================================================================
|
||||
|
||||
extract_field_statistics_from_ci <- function(ci_band, field_boundaries_sf) {
  #' Extract per-field CI summary statistics from a single CI raster band.
  #' Returns one row per field polygon; fields without any valid pixel get
  #' all-NA statistics and zero pixel counts.
  raw <- terra::extract(ci_band, field_boundaries_sf)

  per_field <- lapply(seq_len(nrow(field_boundaries_sf)), function(idx) {
    field_pixels <- raw[raw$ID == idx, 2]
    vals <- as.numeric(field_pixels[!is.na(field_pixels)])

    if (length(vals) == 0) {
      return(data.frame(
        field_idx = idx,
        mean_ci = NA_real_,
        cv = NA_real_,
        p10 = NA_real_,
        p90 = NA_real_,
        min_ci = NA_real_,
        max_ci = NA_real_,
        pixel_count_valid = 0,
        pixel_count_total = 0,
        stringsAsFactors = FALSE
      ))
    }

    avg <- mean(vals, na.rm = TRUE)
    data.frame(
      field_idx = idx,
      mean_ci = avg,
      # CV is undefined for a non-positive mean.
      cv = if (avg > 0) sd(vals, na.rm = TRUE) / avg else NA_real_,
      p10 = quantile(vals, probs = CI_PERCENTILE_LOW, na.rm = TRUE)[[1]],
      p90 = quantile(vals, probs = CI_PERCENTILE_HIGH, na.rm = TRUE)[[1]],
      min_ci = min(vals, na.rm = TRUE),
      max_ci = max(vals, na.rm = TRUE),
      pixel_count_valid = length(vals),
      pixel_count_total = nrow(raw[raw$ID == idx, ]),
      stringsAsFactors = FALSE
    )
  })

  dplyr::bind_rows(per_field)
}
|
||||
|
||||
# ============================================================================
|
||||
# COMMENTED OUT / UNUSED FUNCTIONS (kept for future use)
|
||||
# ============================================================================
|
||||
|
||||
# analyze_single_field <- function(field_idx, field_boundaries_sf, tile_grid, week_num, year,
|
||||
# mosaic_dir, historical_data = NULL, planting_dates = NULL,
|
||||
# report_date = Sys.Date(), harvest_imminence_data = NULL,
|
||||
# harvesting_data = NULL) {
|
||||
# # [Function kept as reference for parallel field analysis]
|
||||
# # Currently replaced by calculate_field_statistics() for efficiency
|
||||
# }
|
||||
|
|
@ -50,7 +50,7 @@ detect_mosaic_mode <- function(merged_final_tif_dir, daily_tiles_split_dir = NUL
|
|||
}
|
||||
|
||||
# PRIORITY 2: File-based detection (fallback if metadata not found)
|
||||
# Check if merged_final_tif/ contains tile-named files
|
||||
# Check if merged_final_tif/ contains tile-named files OR grid-size subdirectories
|
||||
|
||||
if (!dir.exists(merged_final_tif_dir)) {
|
||||
return(list(
|
||||
|
|
@ -61,6 +61,30 @@ detect_mosaic_mode <- function(merged_final_tif_dir, daily_tiles_split_dir = NUL
|
|||
))
|
||||
}
|
||||
|
||||
# First check if there are grid-size subdirectories (5x5, 10x10, etc.)
|
||||
# This indicates the tiles are organized: merged_final_tif/{grid_size}/{DATE}/{DATE}_XX.tif
|
||||
grid_subfolders <- list.dirs(merged_final_tif_dir, full.names = FALSE, recursive = FALSE)
|
||||
grid_patterns <- grep("^\\d+x\\d+$", grid_subfolders, value = TRUE)
|
||||
|
||||
if (length(grid_patterns) > 0) {
|
||||
# Found grid-size subdirectories - tiles exist!
|
||||
grid_size <- grid_patterns[1]
|
||||
grid_dir <- file.path(merged_final_tif_dir, grid_size)
|
||||
|
||||
# List sample tile files from the grid directory
|
||||
sample_tiles <- list.files(grid_dir, pattern = "\\.tif$", recursive = TRUE)[1:3]
|
||||
|
||||
return(list(
|
||||
has_tiles = TRUE,
|
||||
detected_tiles = sample_tiles,
|
||||
total_files = length(sample_tiles),
|
||||
source = "grid_subdirectory_detection",
|
||||
grid_size = grid_size,
|
||||
grid_path = grid_dir
|
||||
))
|
||||
}
|
||||
|
||||
# Fall back to checking for tile-named files directly in merged_final_tif
|
||||
# List all .tif files in merged_final_tif
|
||||
tif_files <- list.files(merged_final_tif_dir, pattern = "\\.tif$", full.names = FALSE)
|
||||
|
||||
|
|
|
|||
|
|
@ -1,31 +1,40 @@
|
|||
# ==============================================================================
|
||||
# FULL PIPELINE RUNNER
|
||||
# ==============================================================================
|
||||
# Runs scripts 02, 03, 04, 09 (KPIs), 09 (Weekly), and 10 (CI Report Simple)
|
||||
# Mixed Python/R pipeline:
|
||||
# 1. Python: Download Planet images
|
||||
# 2. R 10: Create master grid and split TIFFs
|
||||
# 3. R 20: CI Extraction
|
||||
# 4. R 21: Convert CI RDS to CSV
|
||||
# 5. R 30: Interpolate growth model
|
||||
# 6. Python 31: Harvest imminent weekly
|
||||
# 7. R 40: Mosaic creation
|
||||
# 8. R 80: Calculate KPIs
|
||||
#
|
||||
# ==============================================================================
|
||||
# HOW TO RUN THIS SCRIPT
|
||||
# ==============================================================================
|
||||
#
|
||||
# In PowerShell or Command Prompt:
|
||||
# Run from the smartcane/ directory:
|
||||
#
|
||||
# Option 1 (Recommended - shows real-time output):
|
||||
# Rscript run_full_pipeline.R
|
||||
# Rscript r_app/run_full_pipeline.R
|
||||
#
|
||||
# Option 2 (Full path to Rscript - use & in PowerShell for paths with spaces):
|
||||
# & "C:\Program Files\R\R-4.4.3\bin\x64\Rscript.exe" run_full_pipeline.R
|
||||
# & "C:\Program Files\R\R-4.4.3\bin\x64\Rscript.exe" r_app/run_full_pipeline.R
|
||||
#
|
||||
# Option 3 (Batch mode - output saved to .Rout file):
|
||||
# R CMD BATCH --vanilla run_full_pipeline.R
|
||||
# R CMD BATCH --vanilla r_app/run_full_pipeline.R
|
||||
#
|
||||
# ==============================================================================
|
||||
# ==============================================================================
|
||||
|
||||
# *** EDIT THESE VARIABLES ***
|
||||
end_date <- "2025-12-24" # or specify: "2025-12-02", Sys.Date()
|
||||
end_date <- as.Date("2026-01-27") # or specify: as.Date("2026-01-27") , Sys.Date()
|
||||
offset <- 7 # days to look back
|
||||
project_dir <- "angata" # project name: "esa", "aura", "angata", "chemba"
|
||||
data_source <- if (project_dir == "angata") "merged_tif_8b" else "merged_tif"
|
||||
force_rerun <- FALSE # Set to TRUE to force all scripts to run even if outputs exist
|
||||
# ***************************
|
||||
|
||||
# Format dates
|
||||
|
|
@ -35,117 +44,350 @@ end_date_str <- format(as.Date(end_date), "%Y-%m-%d")
|
|||
pipeline_success <- TRUE
|
||||
|
||||
# ==============================================================================
|
||||
# SCRIPT 02: CI EXTRACTION
|
||||
# INTELLIGENT CHECKING: What has already been completed?
|
||||
# ==============================================================================
|
||||
cat("\n========== RUNNING SCRIPT 02: CI EXTRACTION ==========\n")
|
||||
tryCatch({
|
||||
source("r_app/02_ci_extraction.R")
|
||||
main() # Call the main function
|
||||
cat("✓ Script 02 completed\n")
|
||||
}, error = function(e) {
|
||||
cat("✗ Error in Script 02:", e$message, "\n")
|
||||
pipeline_success <<- FALSE
|
||||
})
|
||||
cat("\n========== CHECKING EXISTING OUTPUTS ==========\n")
|
||||
|
||||
# ==============================================================================
|
||||
# SCRIPT 03: INTERPOLATE GROWTH MODEL
|
||||
# ==============================================================================
|
||||
cat("\n========== RUNNING SCRIPT 03: INTERPOLATE GROWTH MODEL ==========\n")
|
||||
tryCatch({
|
||||
source("r_app/03_interpolate_growth_model.R")
|
||||
main() # Call the main function
|
||||
cat("✓ Script 03 completed\n")
|
||||
}, error = function(e) {
|
||||
cat("✗ Error in Script 03:", e$message, "\n")
|
||||
pipeline_success <<- FALSE
|
||||
})
|
||||
|
||||
# ==============================================================================
|
||||
# SCRIPT 04: MOSAIC CREATION
|
||||
# ==============================================================================
|
||||
cat("\n========== RUNNING SCRIPT 04: MOSAIC CREATION ==========\n")
|
||||
tryCatch({
|
||||
source("r_app/04_mosaic_creation.R")
|
||||
main() # Call the main function
|
||||
cat("✓ Script 04 completed\n")
|
||||
}, error = function(e) {
|
||||
cat("✗ Error in Script 04:", e$message, "\n")
|
||||
pipeline_success <<- FALSE
|
||||
})
|
||||
|
||||
# ==============================================================================
|
||||
# SCRIPT 09: CALCULATE KPIs
|
||||
# ==============================================================================
|
||||
cat("\n========== RUNNING SCRIPT 09: CALCULATE KPIs ==========\n")
|
||||
tryCatch({
|
||||
source("r_app/09_calculate_kpis.R")
|
||||
main() # Call the main function
|
||||
cat("✓ Script 09 (KPIs) completed\n")
|
||||
}, error = function(e) {
|
||||
cat("✗ Error in Script 09 (KPIs):", e$message, "\n")
|
||||
pipeline_success <<- FALSE
|
||||
})
|
||||
|
||||
# ==============================================================================
|
||||
# SCRIPT 09: FIELD ANALYSIS WEEKLY
|
||||
# ==============================================================================
|
||||
# Only run field analysis weekly for angata project
|
||||
if (project_dir == "angata") {
|
||||
cat("\n========== RUNNING SCRIPT 09: FIELD ANALYSIS WEEKLY ==========\n")
|
||||
tryCatch({
|
||||
source("r_app/09_field_analysis_weekly.R")
|
||||
main() # Call the main function
|
||||
cat("✓ Script 09 (Weekly) completed\n")
|
||||
}, error = function(e) {
|
||||
cat("✗ Error in Script 09 (Weekly):", e$message, "\n")
|
||||
pipeline_success <<- FALSE
|
||||
})
|
||||
# Check Script 10 outputs (tiled splits)
|
||||
tiles_dir <- file.path("laravel_app", "storage", "app", project_dir, "daily_tiles_split", "5x5")
|
||||
tiles_dates <- if (dir.exists(tiles_dir)) {
|
||||
list.dirs(tiles_dir, full.names = FALSE, recursive = FALSE)
|
||||
} else {
|
||||
cat("\n========== SKIPPING SCRIPT 09: FIELD ANALYSIS WEEKLY (only runs for angata) ==========\n")
|
||||
c()
|
||||
}
|
||||
cat(sprintf("Script 10: %d dates already tiled\n", length(tiles_dates)))
|
||||
|
||||
# Check Script 20 outputs (CI extraction) - daily RDS files
|
||||
ci_daily_dir <- file.path("laravel_app", "storage", "app", project_dir, "Data", "extracted_ci", "daily_vals")
|
||||
ci_files <- if (dir.exists(ci_daily_dir)) {
|
||||
list.files(ci_daily_dir, pattern = "\\.rds$")
|
||||
} else {
|
||||
c()
|
||||
}
|
||||
cat(sprintf("Script 20: %d CI daily RDS files exist\n", length(ci_files)))
|
||||
|
||||
# Check Script 21 outputs (CSV conversion) - note: this gets overwritten each time, so we don't skip based on this
|
||||
# Instead, check if CI RDS files exist - if they do, 21 should also run
|
||||
# For now, just note that CSV is time-dependent, not a good skip indicator
|
||||
cat("Script 21: CSV file exists but gets overwritten - will run if Script 20 runs\n")
|
||||
|
||||
# Check Script 40 outputs (mosaics in weekly_tile_max/5x5)
|
||||
mosaic_dir <- file.path("laravel_app", "storage", "app", project_dir, "weekly_tile_max", "5x5")
|
||||
mosaic_files <- if (dir.exists(mosaic_dir)) {
|
||||
list.files(mosaic_dir, pattern = "\\.tif$")
|
||||
} else {
|
||||
c()
|
||||
}
|
||||
cat(sprintf("Script 40: %d mosaic files exist\n", length(mosaic_files)))
|
||||
|
||||
# Check Script 80 outputs (KPIs in reports/kpis/field_stats)
|
||||
kpi_dir <- file.path("laravel_app", "storage", "app", project_dir, "reports", "kpis", "field_stats")
|
||||
kpi_files <- if (dir.exists(kpi_dir)) {
|
||||
list.files(kpi_dir, pattern = "\\.csv$|\\.json$")
|
||||
} else {
|
||||
c()
|
||||
}
|
||||
cat(sprintf("Script 80: %d KPI files exist\n", length(kpi_files)))
|
||||
|
||||
# Determine if scripts should run based on outputs
|
||||
skip_10 <- length(tiles_dates) > 0 && !force_rerun
|
||||
skip_20 <- length(ci_files) > 0 && !force_rerun
|
||||
skip_21 <- length(ci_files) > 0 && !force_rerun # Skip 21 if 20 is skipped
|
||||
skip_40 <- length(mosaic_files) > 0 && !force_rerun
|
||||
skip_80 <- FALSE # Always run Script 80 - it calculates KPIs for the current week (end_date), not historical weeks
|
||||
|
||||
cat("\nSkipping decisions:\n")
|
||||
cat(sprintf(" Script 10: %s\n", if(skip_10) "SKIP (tiles exist)" else "RUN"))
|
||||
cat(sprintf(" Script 20: %s\n", if(skip_20) "SKIP (CI exists)" else "RUN"))
|
||||
cat(sprintf(" Script 21: %s\n", if(skip_21) "SKIP (CI exists)" else "RUN"))
|
||||
cat(sprintf(" Script 40: %s\n", if(skip_40) "SKIP (mosaics exist)" else "RUN"))
|
||||
cat(sprintf(" Script 80: %s\n", if(skip_80) "SKIP (KPIs exist)" else "RUN"))
|
||||
|
||||
# ==============================================================================
|
||||
# PYTHON: DOWNLOAD PLANET IMAGES (MISSING DATES ONLY)
|
||||
# ==============================================================================
|
||||
cat("\n========== DOWNLOADING PLANET IMAGES (MISSING DATES ONLY) ==========\n")
|
||||
tryCatch({
|
||||
# Setup paths
|
||||
base_path <- file.path("laravel_app", "storage", "app", project_dir)
|
||||
merged_tifs_dir <- file.path(base_path, data_source)
|
||||
|
||||
# Get existing dates from raw TIFFs
|
||||
existing_tiff_files <- list.files(merged_tifs_dir, pattern = "^\\d{4}-\\d{2}-\\d{2}\\.tif$")
|
||||
existing_tiff_dates <- sub("\\.tif$", "", existing_tiff_files)
|
||||
|
||||
# Get existing dates from tiles (better indicator of completion)
|
||||
existing_tile_dates <- tiles_dates
|
||||
|
||||
# Find missing dates in the window
|
||||
start_date <- end_date - offset
|
||||
date_seq <- seq(start_date, end_date, by = "day")
|
||||
target_dates <- format(date_seq, "%Y-%m-%d")
|
||||
|
||||
# Only download if tiles don't exist yet (more reliable than checking raw TIFFs)
|
||||
missing_dates <- target_dates[!(target_dates %in% existing_tile_dates)]
|
||||
|
||||
cat(sprintf(" Existing tiled dates: %d\n", length(existing_tile_dates)))
|
||||
cat(sprintf(" Missing dates in window: %d\n", length(missing_dates)))
|
||||
|
||||
# Download each missing date
|
||||
download_count <- 0
|
||||
download_failed <- 0
|
||||
|
||||
if (length(missing_dates) > 0) {
|
||||
# Save current directory
|
||||
original_dir <- getwd()
|
||||
|
||||
# Change to python_app directory so relative paths work correctly
|
||||
setwd("python_app")
|
||||
|
||||
for (date_str in missing_dates) {
|
||||
cmd <- sprintf('python 00_download_8band_pu_optimized.py "%s" --date "%s" --resolution 3 --cleanup', project_dir, date_str)
|
||||
result <- system(cmd, ignore.stdout = FALSE, ignore.stderr = FALSE)
|
||||
if (result == 0) {
|
||||
download_count <- download_count + 1
|
||||
} else {
|
||||
download_failed <- download_failed + 1
|
||||
}
|
||||
}
|
||||
|
||||
# Change back to original directory
|
||||
setwd(original_dir)
|
||||
}
|
||||
|
||||
cat(sprintf("✓ Downloaded %d dates, %d failed\n", download_count, download_failed))
|
||||
if (download_failed > 0) {
|
||||
cat("⚠ Some downloads failed, but continuing pipeline\n")
|
||||
}
|
||||
|
||||
# Force Script 10 to run ONLY if downloads actually succeeded (not just attempted)
|
||||
if (download_count > 0) {
|
||||
skip_10 <- FALSE
|
||||
}
|
||||
|
||||
}, error = function(e) {
|
||||
cat("✗ Error in planet download:", e$message, "\n")
|
||||
pipeline_success <<- FALSE
|
||||
})
|
||||
|
||||
# ==============================================================================
|
||||
# SCRIPT 10: CREATE MASTER GRID AND SPLIT TIFFs
|
||||
# ==============================================================================
|
||||
if (pipeline_success && !skip_10) {
|
||||
cat("\n========== RUNNING SCRIPT 10: CREATE MASTER GRID AND SPLIT TIFFs ==========\n")
|
||||
tryCatch({
|
||||
# Set environment variables for the script (Script 10 uses these for filtering)
|
||||
assign("PROJECT", project_dir, envir = .GlobalEnv)
|
||||
|
||||
# Suppress verbose per-date output, show only summary
|
||||
sink(nullfile())
|
||||
source("r_app/10_create_master_grid_and_split_tiffs.R")
|
||||
sink()
|
||||
|
||||
# Verify output
|
||||
tiles_dir <- file.path("laravel_app", "storage", "app", project_dir, "daily_tiles_split", "5x5")
|
||||
if (dir.exists(tiles_dir)) {
|
||||
subdirs <- list.dirs(tiles_dir, full.names = FALSE, recursive = FALSE)
|
||||
cat(sprintf("✓ Script 10 completed - created tiles for %d dates\n", length(subdirs)))
|
||||
} else {
|
||||
cat("✓ Script 10 completed\n")
|
||||
}
|
||||
}, error = function(e) {
|
||||
sink()
|
||||
cat("✗ Error in Script 10:", e$message, "\n")
|
||||
pipeline_success <<- FALSE
|
||||
})
|
||||
} else if (skip_10) {
|
||||
cat("\n========== SKIPPING SCRIPT 10 (tiles already exist) ==========\n")
|
||||
}
|
||||
|
||||
# ==============================================================================
|
||||
# SCRIPT 91: CI REPORT ANGATA (only for angata)
|
||||
# SCRIPT 20: CI EXTRACTION
|
||||
# ==============================================================================
|
||||
if (project_dir == "angata") {
|
||||
cat("\n========== RUNNING SCRIPT 91: CI REPORT ANGATA ==========\n")
|
||||
if (pipeline_success) {
|
||||
if (pipeline_success && !skip_20) {
|
||||
cat("\n========== RUNNING SCRIPT 20: CI EXTRACTION ==========\n")
|
||||
tryCatch({
|
||||
rmarkdown::render("r_app/91_CI_report_with_kpis_Angata.Rmd",
|
||||
output_format = "word_document",
|
||||
params = list(data_dir = project_dir, report_date = end_date_str))
|
||||
cat("✓ Script 91 (Report) completed\n")
|
||||
}, error = function(e) {
|
||||
cat("✗ Error in Script 91 (Report):", e$message, "\n")
|
||||
})
|
||||
# Set environment variables for the script
|
||||
assign("end_date", end_date, envir = .GlobalEnv)
|
||||
assign("offset", offset, envir = .GlobalEnv)
|
||||
assign("project_dir", project_dir, envir = .GlobalEnv)
|
||||
assign("data_source", data_source, envir = .GlobalEnv)
|
||||
|
||||
source("r_app/20_ci_extraction.R")
|
||||
main() # Call main() to execute the script with the environment variables
|
||||
|
||||
# Verify CI output was created
|
||||
ci_daily_dir <- file.path("laravel_app", "storage", "app", project_dir, "Data", "extracted_ci", "daily_vals")
|
||||
if (dir.exists(ci_daily_dir)) {
|
||||
files <- list.files(ci_daily_dir, pattern = "\\.rds$")
|
||||
cat(sprintf("✓ Script 20 completed - generated %d CI files\n", length(files)))
|
||||
} else {
|
||||
cat("✗ Skipping Script 91: Previous pipeline scripts failed\n")
|
||||
cat("✓ Script 20 completed\n")
|
||||
}
|
||||
}, error = function(e) {
|
||||
cat("✗ Error in Script 20:", e$message, "\n")
|
||||
pipeline_success <<- FALSE
|
||||
})
|
||||
} else if (skip_20) {
|
||||
cat("\n========== SKIPPING SCRIPT 20 (CI already extracted) ==========\n")
|
||||
}
|
||||
|
||||
# ==============================================================================
|
||||
# SCRIPT 10: CI REPORT (SIMPLE)
|
||||
# SCRIPT 21: CONVERT CI RDS TO CSV
|
||||
# ==============================================================================
|
||||
# Only run CI report for non-angata projects
|
||||
|
||||
|
||||
if (project_dir != "angata") {
|
||||
cat("\n========== RUNNING SCRIPT 10: CI REPORT SIMPLE ==========\n")
|
||||
if (pipeline_success) {
|
||||
if (pipeline_success && !skip_21) {
|
||||
cat("\n========== RUNNING SCRIPT 21: CONVERT CI RDS TO CSV ==========\n")
|
||||
tryCatch({
|
||||
rmarkdown::render("r_app/10_CI_report_with_kpis_simple.Rmd",
|
||||
output_format = "word_document",
|
||||
params = list(data_dir = project_dir, report_date = end_date_str))
|
||||
cat("✓ Script 10 (Report) completed\n")
|
||||
# Set environment variables for the script
|
||||
assign("end_date", end_date, envir = .GlobalEnv)
|
||||
assign("offset", offset, envir = .GlobalEnv)
|
||||
assign("project_dir", project_dir, envir = .GlobalEnv)
|
||||
|
||||
source("r_app/21_convert_ci_rds_to_csv.R")
|
||||
main() # Call main() to execute the script with the environment variables
|
||||
|
||||
# Verify CSV output was created
|
||||
ci_csv_path <- file.path("laravel_app", "storage", "app", project_dir, "ci_extracted")
|
||||
if (dir.exists(ci_csv_path)) {
|
||||
csv_files <- list.files(ci_csv_path, pattern = "\\.csv$")
|
||||
cat(sprintf("✓ Script 21 completed - converted to %d CSV files\n", length(csv_files)))
|
||||
} else {
|
||||
cat("✓ Script 21 completed\n")
|
||||
}
|
||||
}, error = function(e) {
|
||||
cat("✗ Error in Script 10 (Report):", e$message, "\n")
|
||||
cat("✗ Error in Script 21:", e$message, "\n")
|
||||
pipeline_success <<- FALSE
|
||||
})
|
||||
} else if (skip_21) {
|
||||
cat("\n========== SKIPPING SCRIPT 21 (CSV already created) ==========\n")
|
||||
}
|
||||
|
||||
# ==============================================================================
|
||||
# SCRIPT 30: INTERPOLATE GROWTH MODEL
|
||||
# ==============================================================================
|
||||
if (pipeline_success) {
|
||||
cat("\n========== RUNNING SCRIPT 30: INTERPOLATE GROWTH MODEL ==========\n")
|
||||
tryCatch({
|
||||
# Set environment variables for the script
|
||||
assign("end_date", end_date, envir = .GlobalEnv)
|
||||
assign("offset", offset, envir = .GlobalEnv)
|
||||
assign("project_dir", project_dir, envir = .GlobalEnv)
|
||||
assign("data_source", data_source, envir = .GlobalEnv)
|
||||
|
||||
source("r_app/30_interpolate_growth_model.R")
|
||||
main() # Call main() to execute the script with the environment variables
|
||||
|
||||
# Verify interpolated output
|
||||
growth_dir <- file.path("laravel_app", "storage", "app", project_dir, "growth_model_interpolated")
|
||||
if (dir.exists(growth_dir)) {
|
||||
files <- list.files(growth_dir, pattern = "\\.rds$|\\.csv$")
|
||||
cat(sprintf("✓ Script 30 completed - generated %d growth model files\n", length(files)))
|
||||
} else {
|
||||
cat("✗ Skipping Script 10: Previous pipeline scripts failed\n")
|
||||
cat("✓ Script 30 completed\n")
|
||||
}
|
||||
}, error = function(e) {
|
||||
cat("✗ Error in Script 30:", e$message, "\n")
|
||||
pipeline_success <<- FALSE
|
||||
})
|
||||
}
|
||||
|
||||
# ==============================================================================
|
||||
# PYTHON 31: HARVEST IMMINENT WEEKLY
|
||||
# ==============================================================================
|
||||
if (pipeline_success) {
|
||||
cat("\n========== RUNNING PYTHON 31: HARVEST IMMINENT WEEKLY ==========\n")
|
||||
tryCatch({
|
||||
# Run Python script in pytorch_gpu conda environment
|
||||
# Script expects positional project name (not --project flag)
|
||||
# Run from smartcane root so conda can find the environment
|
||||
cmd <- sprintf('conda run -n pytorch_gpu python python_app/31_harvest_imminent_weekly.py %s', project_dir)
|
||||
cat("DEBUG: Running command:", cmd, "\n")
|
||||
result <- system(cmd)
|
||||
|
||||
if (result == 0) {
|
||||
# Verify harvest output - check for THIS WEEK's specific file
|
||||
current_week <- as.numeric(format(end_date, "%V"))
|
||||
current_year <- as.numeric(format(end_date, "%Y"))
|
||||
expected_file <- file.path("laravel_app", "storage", "app", project_dir, "reports", "kpis", "field_stats",
|
||||
sprintf("%s_harvest_imminent_week_%02d_%d.csv", project_dir, current_week, current_year))
|
||||
|
||||
if (file.exists(expected_file)) {
|
||||
cat(sprintf("✓ Script 31 completed - generated harvest imminent file for week %02d\n", current_week))
|
||||
} else {
|
||||
cat("✓ Script 31 completed (check if harvest.xlsx is available)\n")
|
||||
}
|
||||
} else {
|
||||
cat("\n========== SKIPPING SCRIPT 10: CI REPORT SIMPLE (not applicable for angata) ==========\n")
|
||||
cat("⚠ Script 31 completed with errors (check harvest.xlsx availability)\n")
|
||||
}
|
||||
}, error = function(e) {
|
||||
setwd(original_dir)
|
||||
cat("⚠ Script 31 error:", e$message, "\n")
|
||||
})
|
||||
}
|
||||
|
||||
# ==============================================================================
|
||||
# SCRIPT 40: MOSAIC CREATION
|
||||
# ==============================================================================
|
||||
if (pipeline_success && !skip_40) {
|
||||
cat("\n========== RUNNING SCRIPT 40: MOSAIC CREATION ==========\n")
|
||||
tryCatch({
|
||||
# Set environment variables for the script
|
||||
assign("end_date", end_date, envir = .GlobalEnv)
|
||||
assign("offset", offset, envir = .GlobalEnv)
|
||||
assign("project_dir", project_dir, envir = .GlobalEnv)
|
||||
assign("data_source", data_source, envir = .GlobalEnv)
|
||||
|
||||
source("r_app/40_mosaic_creation.R")
|
||||
main() # Call main() to execute the script with the environment variables
|
||||
|
||||
# Verify mosaic output
|
||||
mosaic_dir <- file.path("laravel_app", "storage", "app", project_dir, "weekly_tile_max", "5x5")
|
||||
if (dir.exists(mosaic_dir)) {
|
||||
files <- list.files(mosaic_dir, pattern = "\\.tif$")
|
||||
cat(sprintf("✓ Script 40 completed - generated %d mosaic files\n", length(files)))
|
||||
} else {
|
||||
cat("✓ Script 40 completed\n")
|
||||
}
|
||||
}, error = function(e) {
|
||||
cat("✗ Error in Script 40:", e$message, "\n")
|
||||
pipeline_success <<- FALSE
|
||||
})
|
||||
} else if (skip_40) {
|
||||
cat("\n========== SKIPPING SCRIPT 40 (mosaics already created) ==========\n")
|
||||
}
|
||||
|
||||
# ==============================================================================
|
||||
# SCRIPT 80: CALCULATE KPIs
|
||||
# ==============================================================================
|
||||
if (pipeline_success) { # Always run Script 80 - it calculates KPIs for the current week
|
||||
cat("\n========== RUNNING SCRIPT 80: CALCULATE KPIs ==========\n")
|
||||
tryCatch({
|
||||
# Set environment variables for the script (Script 80's main() uses these as fallbacks)
|
||||
# NOTE: end_date is already a Date, just assign directly without as.Date()
|
||||
assign("end_date", end_date, envir = .GlobalEnv)
|
||||
assign("end_date_str", end_date_str, envir = .GlobalEnv)
|
||||
assign("offset", offset, envir = .GlobalEnv)
|
||||
assign("project_dir", project_dir, envir = .GlobalEnv)
|
||||
assign("data_source", data_source, envir = .GlobalEnv)
|
||||
|
||||
source("r_app/80_calculate_kpis.R")
|
||||
main() # Call main() to execute the script with the environment variables
|
||||
|
||||
# Verify KPI output
|
||||
kpi_dir <- file.path("laravel_app", "storage", "app", project_dir, "reports", "kpis", "field_stats")
|
||||
if (dir.exists(kpi_dir)) {
|
||||
files <- list.files(kpi_dir, pattern = "\\.csv$|\\.json$")
|
||||
cat(sprintf("✓ Script 80 completed - generated %d KPI files\n", length(files)))
|
||||
} else {
|
||||
cat("✓ Script 80 completed\n")
|
||||
}
|
||||
}, error = function(e) {
|
||||
cat("✗ Error in Script 80:", e$message, "\n")
|
||||
cat("Full error:\n")
|
||||
print(e)
|
||||
pipeline_success <<- FALSE
|
||||
})
|
||||
}
|
||||
|
||||
# ==============================================================================
|
||||
# SUMMARY
|
||||
|
|
@ -154,4 +396,9 @@ cat("\n========== PIPELINE COMPLETE ==========\n")
|
|||
cat(sprintf("Project: %s\n", project_dir))
|
||||
cat(sprintf("End Date: %s\n", end_date_str))
|
||||
cat(sprintf("Offset: %d days\n", offset))
|
||||
cat("Scripts executed: 02, 03, 04, 09 (KPIs), 09 (Weekly), 10 (CI Report)\n")
|
||||
if (pipeline_success) {
|
||||
cat("Status: ✓ All scripts completed successfully\n")
|
||||
} else {
|
||||
cat("Status: ✗ Pipeline failed - check errors above\n")
|
||||
}
|
||||
cat("Pipeline sequence: Python Download → R 10 → R 20 → R 21 → R 30 → Python 31 → R 40 → R 80\n")
|
||||
|
|
|
|||
447
r_app/system_architecture/PIPELINE_OVERVIEW.md
Normal file
447
r_app/system_architecture/PIPELINE_OVERVIEW.md
Normal file
|
|
@ -0,0 +1,447 @@
|
|||
# SmartCane Processing Pipeline - Complete Script Overview
|
||||
|
||||
## Pipeline Execution Order
|
||||
|
||||
## Complete Pipeline Mermaid Diagram
|
||||
|
||||
```mermaid
|
||||
graph TD
|
||||
%% ===== INPUTS =====
|
||||
API["🔑 Planet API<br/>Credentials"]
|
||||
GeoJSON["🗺️ pivot.geojson<br/>(Field Boundaries)"]
|
||||
HarvestIn["📊 harvest.xlsx<br/>(from Stage 23)"]
|
||||
|
||||
%% ===== STAGE 00: DOWNLOAD =====
|
||||
Stage00["<b>Stage 00: Python</b><br/>00_download_8band_pu_optimized.py"]
|
||||
Out00["📦 merged_tif_8b/<br/>YYYY-MM-DD.tif<br/>(4-band uint16)"]
|
||||
|
||||
%% ===== STAGE 10: OPTIONAL TILING =====
|
||||
Stage10["<b>Stage 10: R</b><br/>10_create_master_grid...<br/>(Optional)"]
|
||||
Out10["📦 daily_tiles_split/5x5/<br/>YYYY-MM-DD/*.tif<br/>(25 tiles)"]
|
||||
|
||||
%% ===== STAGE 20: CI EXTRACTION =====
|
||||
Stage20["<b>Stage 20: R</b><br/>20_ci_extraction.R"]
|
||||
Out20a["📦 combined_CI_data.rds<br/>(wide: fields × dates)"]
|
||||
Out20b["📦 daily RDS files<br/>(per-date stats)"]
|
||||
|
||||
%% ===== STAGE 21: RDS → CSV =====
|
||||
Stage21["<b>Stage 21: R</b><br/>21_convert_ci_rds_to_csv.R"]
|
||||
Out21["📦 ci_data_for_python.csv<br/>(long format + DOY)"]
|
||||
|
||||
%% ===== STAGE 22: BASELINE HARVEST =====
|
||||
Stage22["<b>Stage 22: Python</b><br/>22_harvest_baseline_prediction.py<br/>(RUN ONCE)"]
|
||||
Out22["📦 harvest_production_export.xlsx<br/>(baseline predictions)"]
|
||||
|
||||
%% ===== STAGE 23: HARVEST FORMAT =====
|
||||
Stage23["<b>Stage 23: Python</b><br/>23_convert_harvest_format.py"]
|
||||
Out23["📦 harvest.xlsx<br/>(standard format)<br/>→ Feeds back to Stage 80"]
|
||||
|
||||
%% ===== STAGE 30: GROWTH MODEL =====
|
||||
Stage30["<b>Stage 30: R</b><br/>30_interpolate_growth_model.R"]
|
||||
Out30["📦 All_pivots_Cumulative_CI...<br/>_quadrant_year_v2.rds<br/>(interpolated daily)"]
|
||||
|
||||
%% ===== STAGE 31: WEEKLY HARVEST =====
|
||||
Stage31["<b>Stage 31: Python</b><br/>31_harvest_imminent_weekly.py<br/>(Weekly)"]
|
||||
Out31["📦 harvest_imminent_weekly.csv<br/>(probabilities)"]
|
||||
|
||||
%% ===== STAGE 40: MOSAIC =====
|
||||
Stage40["<b>Stage 40: R</b><br/>40_mosaic_creation.R"]
|
||||
Out40["📦 weekly_mosaic/<br/>week_WW_YYYY.tif<br/>(5-band composite)"]
|
||||
|
||||
%% ===== STAGE 80: KPI =====
|
||||
Stage80["<b>Stage 80: R</b><br/>80_calculate_kpis.R"]
|
||||
Out80a["📦 field_analysis_week{WW}.xlsx"]
|
||||
Out80b["📦 kpi_summary_tables_week{WW}.rds"]
|
||||
|
||||
%% ===== STAGE 90: REPORT =====
|
||||
Stage90["<b>Stage 90: R/RMarkdown</b><br/>90_CI_report_with_kpis_simple.Rmd"]
|
||||
Out90["📦 SmartCane_Report_week{WW}_{YYYY}.docx<br/>(FINAL OUTPUT)"]
|
||||
|
||||
%% ===== CONNECTIONS: INPUTS TO STAGE 00 =====
|
||||
API --> Stage00
|
||||
GeoJSON --> Stage00
|
||||
|
||||
%% ===== STAGE 00 → 10 OR 20 =====
|
||||
Stage00 --> Out00
|
||||
Out00 --> Stage10
|
||||
Out00 --> Stage20
|
||||
|
||||
%% ===== STAGE 10 → 20 =====
|
||||
Stage10 --> Out10
|
||||
Out10 --> Stage20
|
||||
|
||||
%% ===== STAGE 20 → 21, 30, 40 =====
|
||||
GeoJSON --> Stage20
|
||||
Stage20 --> Out20a
|
||||
Stage20 --> Out20b
|
||||
Out20a --> Stage21
|
||||
Out20a --> Stage30
|
||||
Out00 --> Stage40
|
||||
|
||||
%% ===== STAGE 21 → 22, 31 =====
|
||||
Stage21 --> Out21
|
||||
Out21 --> Stage22
|
||||
Out21 --> Stage31
|
||||
|
||||
%% ===== STAGE 22 → 23 =====
|
||||
Stage22 --> Out22
|
||||
Out22 --> Stage23
|
||||
|
||||
%% ===== STAGE 23 → 80 & FEEDBACK =====
|
||||
Stage23 --> Out23
|
||||
Out23 -.->|"Feeds back<br/>(Season context)"| Stage80
|
||||
|
||||
%% ===== STAGE 30 → 80 =====
|
||||
Stage30 --> Out30
|
||||
Out30 --> Stage80
|
||||
|
||||
%% ===== STAGE 31 (PARALLEL) =====
|
||||
Stage31 --> Out31
|
||||
|
||||
%% ===== STAGE 40 → 80, 90 =====
|
||||
Stage40 --> Out40
|
||||
Out40 --> Stage80
|
||||
Out40 --> Stage90
|
||||
|
||||
%% ===== STAGE 80 → 90 =====
|
||||
Stage80 --> Out80a
|
||||
Stage80 --> Out80b
|
||||
Out80a --> Stage90
|
||||
Out80b --> Stage90
|
||||
|
||||
%% ===== STAGE 90 FINAL =====
|
||||
Stage90 --> Out90
|
||||
|
||||
%% ===== ADDITIONAL INPUTS =====
|
||||
HarvestIn --> Stage30
|
||||
HarvestIn --> Stage80
|
||||
GeoJSON --> Stage30
|
||||
GeoJSON --> Stage40
|
||||
GeoJSON --> Stage80
|
||||
|
||||
%% ===== STYLING =====
|
||||
classDef input fill:#e3f2fd,stroke:#1976d2,stroke-width:2px
|
||||
classDef pyStage fill:#fff3e0,stroke:#f57c00,stroke-width:2px
|
||||
classDef rStage fill:#f3e5f5,stroke:#7b1fa2,stroke-width:2px
|
||||
classDef output fill:#e8f5e9,stroke:#388e3c,stroke-width:2px
|
||||
classDef finalOutput fill:#ffebee,stroke:#c62828,stroke-width:3px
|
||||
|
||||
class API,GeoJSON,HarvestIn input
|
||||
class Stage00,Stage22,Stage23,Stage31 pyStage
|
||||
class Stage10,Stage20,Stage21,Stage30,Stage40,Stage80,Stage90 rStage
|
||||
class Out00,Out10,Out20a,Out20b,Out21,Out22,Out30,Out31,Out40,Out80a,Out80b output
|
||||
class Out23,Out90 finalOutput
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Detailed Stage Descriptions
|
||||
|
||||
```
|
||||
Stage 00: PYTHON - Download Satellite Data
|
||||
└─ 00_download_8band_pu_optimized.py
|
||||
INPUT: Planet API credentials, field boundaries (pivot.geojson), date range
|
||||
OUTPUT: laravel_app/storage/app/{project}/merged_tif_8b/{YYYY-MM-DD}.tif (4-band uint16)
|
||||
RUN FREQUENCY: Daily or as-needed
|
||||
NOTES: 8-band includes UDM cloud masking, optimized for PU cost
|
||||
|
||||
Stage 10: R - (Optional) Create Master Grid & Split TIFFs into Tiles
|
||||
└─ 10_create_master_grid_and_split_tiffs.R
|
||||
INPUT: Daily GeoTIFFs from merged_tif_8b/
|
||||
OUTPUT: laravel_app/storage/app/{project}/daily_tiles_split/5x5/{YYYY-MM-DD}/*.tif
|
||||
          RUN FREQUENCY: Optional — run only if tile-based processing is desired
|
||||
NOTES: Creates 25 tiles per day for memory-efficient processing; 5x5 grid hardcoded
|
||||
|
||||
Stage 20: R - Extract Canopy Index (CI) from Daily Imagery
|
||||
└─ 20_ci_extraction.R
|
||||
INPUT: Daily GeoTIFFs (merged_tif_8b/ or daily_tiles_split/)
|
||||
Field boundaries (pivot.geojson)
|
||||
Data source parameter (merged_tif_8b, merged_tif, merged_final_tif)
|
||||
OUTPUT: RDS files:
|
||||
- laravel_app/storage/app/{project}/Data/extracted_ci/daily_vals/extracted_{YYYY-MM-DD}_{suffix}.rds
|
||||
- laravel_app/storage/app/{project}/Data/extracted_ci/cumulative_vals/combined_CI_data.rds (wide format)
|
||||
RUN FREQUENCY: Daily or on-demand
|
||||
COMMAND: Rscript 20_ci_extraction.R [end_date] [offset] [project_dir] [data_source]
|
||||
EXAMPLE: Rscript 20_ci_extraction.R 2026-01-02 7 angata merged_tif_8b
|
||||
NOTES: Auto-detects tiles if daily_tiles_split/ exists; outputs cumulative CI (fields × dates)
|
||||
|
||||
Stage 21: R - Convert CI RDS to CSV for Python Harvest Detection
|
||||
└─ 21_convert_ci_rds_to_csv.R
|
||||
INPUT: combined_CI_data.rds (from Stage 20)
|
||||
OUTPUT: laravel_app/storage/app/{project}/Data/extracted_ci/ci_data_for_python/ci_data_for_python.csv
|
||||
RUN FREQUENCY: After Stage 20
|
||||
COMMAND: Rscript 21_convert_ci_rds_to_csv.R [project_dir]
|
||||
EXAMPLE: Rscript 21_convert_ci_rds_to_csv.R angata
|
||||
NOTES: Converts wide RDS (fields × dates) to long CSV; interpolates missing dates; adds DOY column
|
||||
|
||||
Stage 22: PYTHON - Baseline Harvest Prediction (LSTM Model 307)
|
||||
└─ 22_harvest_baseline_prediction.py
|
||||
INPUT: ci_data_for_python.csv (complete historical CI data)
|
||||
OUTPUT: laravel_app/storage/app/{project}/Data/HarvestData/harvest_production_export.xlsx
|
||||
RUN FREQUENCY: ONCE - establishes ground truth baseline for all fields
|
||||
COMMAND: python 22_harvest_baseline_prediction.py [project_name]
|
||||
EXAMPLE: python 22_harvest_baseline_prediction.py angata
|
||||
NOTES: Two-step detection (Phase 1: growing window, Phase 2: ±40 day argmax refinement)
|
||||
Tuned parameters: threshold=0.3, consecutive_days=2
|
||||
Uses LSTM Model 307 dual output heads (imminent + detected)
|
||||
|
||||
Stage 23: PYTHON - Convert Harvest Format to Standard Structure
|
||||
└─ 23_convert_harvest_format.py
|
||||
INPUT: harvest_production_export.xlsx (from Stage 22)
|
||||
CI data date range (determines season_start for first season)
|
||||
OUTPUT: laravel_app/storage/app/{project}/Data/harvest.xlsx (standard format)
|
||||
RUN FREQUENCY: After Stage 22
|
||||
COMMAND: python 23_convert_harvest_format.py [project_name]
|
||||
EXAMPLE: python 23_convert_harvest_format.py angata
|
||||
NOTES: Converts to standard harvest.xlsx format with columns:
|
||||
field, sub_field, year, season, season_start, season_end, age, sub_area, tonnage_ha
|
||||
Season format: "Data{year} : {field}"
|
||||
Only includes completed seasons (with season_end filled)
|
||||
|
||||
Stage 30: R - Growth Model Interpolation (Smooth CI Time Series)
|
||||
└─ 30_interpolate_growth_model.R
|
||||
INPUT: combined_CI_data.rds (from Stage 20)
|
||||
harvest.xlsx (optional, for seasonal context)
|
||||
OUTPUT: laravel_app/storage/app/{project}/Data/extracted_ci/cumulative_vals/
|
||||
All_pivots_Cumulative_CI_quadrant_year_v2.rds
|
||||
RUN FREQUENCY: Weekly or after CI extraction updates
|
||||
COMMAND: Rscript 30_interpolate_growth_model.R [project_dir]
|
||||
EXAMPLE: Rscript 30_interpolate_growth_model.R angata
|
||||
NOTES: Linear interpolation across gaps; calculates daily change and cumulative CI
|
||||
Outputs long-format data (Date, DOY, field, value, season, etc.)
|
||||
|
||||
Stage 31: PYTHON - Weekly Harvest Monitoring (Real-Time Alerts)
|
||||
└─ 31_harvest_imminent_weekly.py
|
||||
INPUT: ci_data_for_python.csv (recent CI data, last ~300 days)
|
||||
harvest_production_export.xlsx (optional baseline reference)
|
||||
OUTPUT: laravel_app/storage/app/{project}/Data/HarvestData/harvest_imminent_weekly.csv
|
||||
RUN FREQUENCY: Weekly or daily for operational alerts
|
||||
COMMAND: python 31_harvest_imminent_weekly.py [project_name]
|
||||
EXAMPLE: python 31_harvest_imminent_weekly.py angata
|
||||
NOTES: Single-run inference on recent data; outputs probabilities (imminent_prob, detected_prob)
|
||||
Used for real-time decision support; compared against baseline from Stage 22
|
||||
|
||||
Stage 40: R - Create Weekly 5-Band Mosaics
|
||||
└─ 40_mosaic_creation.R
|
||||
INPUT: Daily GeoTIFFs (merged_tif_8b/ or daily_tiles_split/)
|
||||
Field boundaries (pivot.geojson)
|
||||
OUTPUT: laravel_app/storage/app/{project}/weekly_mosaic/week_{WW}_{YYYY}.tif
|
||||
RUN FREQUENCY: Weekly
|
||||
COMMAND: Rscript 40_mosaic_creation.R [end_date] [offset] [project_dir]
|
||||
EXAMPLE: Rscript 40_mosaic_creation.R 2026-01-14 7 angata
|
||||
NOTES: Composites daily images using MAX function; 5 bands (R, G, B, NIR, CI)
|
||||
Automatically selects images with acceptable cloud coverage
|
||||
Output uses ISO week numbering (week_WW_YYYY)
|
||||
|
||||
Stage 80: R - Calculate KPIs & Per-Field Analysis
|
||||
└─ 80_calculate_kpis.R
|
||||
INPUT: Weekly mosaic (from Stage 40)
|
||||
Growth model data (from Stage 30)
|
||||
Field boundaries (pivot.geojson)
|
||||
Harvest data (harvest.xlsx)
|
||||
OUTPUT: laravel_app/storage/app/{project}/reports/
|
||||
- {project}_field_analysis_week{WW}.xlsx
|
||||
- {project}_kpi_summary_tables_week{WW}.rds
|
||||
RUN FREQUENCY: Weekly
|
||||
COMMAND: Rscript 80_calculate_kpis.R [end_date] [project_dir] [offset_days]
|
||||
EXAMPLE: Rscript 80_calculate_kpis.R 2026-01-14 angata 7
|
||||
NOTES: Parallel processing for 1000+ fields; calculates:
|
||||
- Per-field uniformity (CV), phase assignment, growth trends
|
||||
- Status triggers (germination, rapid growth, disease, harvest imminence)
|
||||
- Farm-level KPI metrics (6 high-level indicators)
|
||||
TEST_MODE=TRUE uses only recent weeks for development
|
||||
|
||||
Stage 90: R (RMarkdown) - Generate Executive Report (Word Document)
|
||||
└─ 90_CI_report_with_kpis_simple.Rmd
|
||||
INPUT: Weekly mosaic (from Stage 40)
|
||||
KPI summary data (from Stage 80)
|
||||
Field analysis (from Stage 80)
|
||||
Field boundaries & harvest data (for context)
|
||||
OUTPUT: laravel_app/storage/app/{project}/reports/
|
||||
SmartCane_Report_week{WW}_{YYYY}.docx (PRIMARY OUTPUT)
|
||||
SmartCane_Report_week{WW}_{YYYY}.html (optional)
|
||||
RUN FREQUENCY: Weekly
|
||||
RENDERING: R/RMarkdown with officer + flextable packages
|
||||
NOTES: Executive summary with KPI overview, phase distribution, status triggers
|
||||
Field-by-field detail pages with CI metrics and interpretation guides
|
||||
Automatic unit conversion (hectares ↔ acres)
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Data Storage & Persistence
|
||||
|
||||
All data persists to the file system. No database writes occur during analysis—the database is read only for metadata.
|
||||
|
||||
```
|
||||
laravel_app/storage/app/{project}/
|
||||
├── Data/
|
||||
│ ├── pivot.geojson # Field boundaries (read-only input)
|
||||
│ ├── harvest.xlsx # Season dates & yield (standard format from Stage 23)
|
||||
│ ├── vrt/ # Virtual raster files (daily VRTs from Stage 20)
|
||||
│ │ └── YYYY-MM-DD.vrt
|
||||
│ ├── extracted_ci/
|
||||
│ │ ├── ci_data_for_python/
|
||||
│ │ │ └── ci_data_for_python.csv # CSV for Python (from Stage 21)
|
||||
│ │ ├── daily_vals/
|
||||
│ │ │ └── extracted_YYYY-MM-DD_{suffix}.rds # Daily field CI stats (from Stage 20)
|
||||
│ │ └── cumulative_vals/
|
||||
│ │ ├── combined_CI_data.rds # Cumulative CI, wide format (from Stage 20)
|
||||
│ │ └── All_pivots_Cumulative_CI_quadrant_year_v2.rds # Interpolated daily (from Stage 30)
|
||||
│ └── HarvestData/
|
||||
│ ├── harvest_production_export.xlsx # Baseline harvest predictions (from Stage 22)
|
||||
│ └── harvest_imminent_weekly.csv # Weekly monitoring output (from Stage 31)
|
||||
│
|
||||
├── merged_tif_8b/ # Raw 4-band satellite imagery (Stage 00 output)
|
||||
│ └── YYYY-MM-DD.tif # 4 bands: R, G, B, NIR (uint16 with UDM cloud masking)
|
||||
│
|
||||
├── daily_tiles_split/ # (Optional) Tile-based processing (Stage 10 output)
|
||||
│ ├── 5x5/
|
||||
│ │ ├── tiling_config.json # Metadata about tiling parameters
|
||||
│ │ └── YYYY-MM-DD/ # Date-specific folder
|
||||
│ │ └── YYYY-MM-DD_{00-24}.tif # 25 tiles per day
|
||||
│
|
||||
├── weekly_mosaic/ # Weekly composite mosaics (Stage 40 output)
|
||||
│ └── week_WW_YYYY.tif # 5 bands: R, G, B, NIR, CI (composite)
|
||||
│
|
||||
└── reports/ # Analysis outputs & reports (Stage 80, 90 outputs)
|
||||
├── SmartCane_Report_week{WW}_{YYYY}.docx # FINAL REPORT (Stage 90)
|
||||
├── SmartCane_Report_week{WW}_{YYYY}.html # Alternative format
|
||||
├── {project}_field_analysis_week{WW}.xlsx # Field-by-field data (Stage 80)
|
||||
├── {project}_kpi_summary_tables_week{WW}.rds # Summary RDS (Stage 80)
|
||||
└── kpis/
|
||||
└── week_WW_YYYY/ # Week-specific KPI folder
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Key File Formats
|
||||
|
||||
| Format | Stage | Purpose | Example |
|
||||
|--------|-------|---------|---------|
|
||||
| `.tif` (GeoTIFF) | 00, 10, 40 | Geospatial raster imagery | `2026-01-14.tif` (4-band), `week_02_2026.tif` (5-band) |
|
||||
| `.vrt` (Virtual Raster) | 20 | Virtual pointer to TIFFs | `2026-01-14.vrt` |
|
||||
| `.rds` (R Binary) | 20, 21, 30, 80 | R serialized data objects | `combined_CI_data.rds`, `All_pivots_Cumulative_CI_quadrant_year_v2.rds` |
|
||||
| `.csv` (Comma-Separated) | 21, 31 | Tabular data for Python | `ci_data_for_python.csv`, `harvest_imminent_weekly.csv` |
|
||||
| `.xlsx` (Excel) | 22, 23, 80 | Tabular reports & harvest data | `harvest.xlsx`, `harvest_production_export.xlsx`, field analysis |
|
||||
| `.docx` (Word) | 90 | Executive report (final output) | `SmartCane_Report_week02_2026.docx` |
|
||||
| `.json` | 10 | Tiling metadata | `tiling_config.json` |
|
||||
| `.geojson` | Input | Field boundaries (read-only) | `pivot.geojson` |
|
||||
|
||||
---
|
||||
|
||||
## Script Dependencies & Utility Files
|
||||
|
||||
```
|
||||
parameters_project.R
|
||||
├─ Loaded by: 20_ci_extraction.R, 30_interpolate_growth_model.R,
|
||||
│ 40_mosaic_creation.R, 80_calculate_kpis.R, 90_CI_report_with_kpis_simple.Rmd
|
||||
└─ Purpose: Initializes project config (paths, field boundaries, harvest data)
|
||||
|
||||
harvest_date_pred_utils.py
|
||||
├─ Used by: 22_harvest_baseline_prediction.py, 23_convert_harvest_format.py, 31_harvest_imminent_weekly.py
|
||||
└─ Purpose: LSTM model loading, feature extraction, two-step harvest detection
|
||||
|
||||
20_ci_extraction_utils.R
|
||||
├─ Used by: 20_ci_extraction.R
|
||||
└─ Purpose: CI calculation, field masking, RDS I/O, tile detection
|
||||
|
||||
30_growth_model_utils.R
|
||||
├─ Used by: 30_interpolate_growth_model.R
|
||||
└─ Purpose: Linear interpolation, daily metrics, seasonal grouping
|
||||
|
||||
40_mosaic_creation_utils.R, 40_mosaic_creation_tile_utils.R
|
||||
├─ Used by: 40_mosaic_creation.R
|
||||
└─ Purpose: Weekly composite creation, cloud assessment, raster masking
|
||||
|
||||
kpi_utils.R
|
||||
├─ Used by: 80_calculate_kpis.R
|
||||
└─ Purpose: Per-field statistics, phase assignment, trigger detection
|
||||
|
||||
report_utils.R
|
||||
├─ Used by: 90_CI_report_with_kpis_simple.Rmd
|
||||
└─ Purpose: Report building, table formatting, Word document generation
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Command-Line Execution Examples
|
||||
|
||||
### Daily/Weekly Workflow
|
||||
|
||||
```bash
|
||||
# Stage 00: Download today's satellite data
|
||||
cd python_app
|
||||
python 00_download_8band_pu_optimized.py angata --cleanup
|
||||
|
||||
# Stage 20: Extract CI from daily imagery (last 7 days)
|
||||
cd ../r_app
|
||||
Rscript 20_ci_extraction.R 2026-01-14 7 angata merged_tif_8b
|
||||
|
||||
# Stage 21: Convert CI to CSV for harvest detection
|
||||
Rscript 21_convert_ci_rds_to_csv.R angata
|
||||
|
||||
# Stage 31: Weekly harvest monitoring (real-time alerts)
|
||||
cd ../python_app
|
||||
python 31_harvest_imminent_weekly.py angata
|
||||
|
||||
# Back to R for mosaic and KPIs
|
||||
cd ../r_app
|
||||
Rscript 40_mosaic_creation.R 2026-01-14 7 angata
|
||||
Rscript 80_calculate_kpis.R 2026-01-14 angata 7
|
||||
|
||||
# Stage 90: Generate report
|
||||
Rscript -e "rmarkdown::render('90_CI_report_with_kpis_simple.Rmd')"
|
||||
```
|
||||
|
||||
### One-Time Setup (Baseline Harvest Detection)
|
||||
|
||||
```bash
|
||||
# Only run ONCE to establish baseline
|
||||
cd python_app
|
||||
python 22_harvest_baseline_prediction.py angata
|
||||
|
||||
# Convert to standard format
|
||||
python 23_convert_harvest_format.py angata
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Processing Notes
|
||||
|
||||
### CI Extraction (Stage 20)
|
||||
- Calculates CI = (NIR - Green) / (NIR + Green)
|
||||
- Supports both 4-band and 8-band imagery with auto-detection
|
||||
- Handles cloud masking via UDM band (8-band) or manual thresholding (4-band)
|
||||
- Outputs cumulative RDS in wide format (fields × dates) for fast lookups
|
||||
|
||||
### Growth Model (Stage 30)
|
||||
- Linear interpolation across missing dates
|
||||
- Maintains seasonal context for agricultural lifecycle tracking
|
||||
- Outputs long-format data for trend analysis
|
||||
|
||||
### Harvest Detection (Stages 22 & 31)
|
||||
- **Model 307**: Unidirectional LSTM with dual output heads
|
||||
- Imminent Head: Probability field will be harvestable in next 28 days
|
||||
- Detected Head: Probability of immediate harvest event
|
||||
- **Stage 22 (Baseline)**: Two-step detection on complete historical data
|
||||
- Phase 1: Growing window expansion (real-time simulation)
|
||||
- Phase 2: ±40 day refinement (argmax harvest signal)
|
||||
- **Stage 31 (Weekly)**: Single-run inference on recent data (~300 days)
|
||||
- Compares against baseline for anomaly detection
|
||||
|
||||
### KPI Calculation (Stage 80)
|
||||
- **Per-field metrics**: Uniformity (CV), phase, growth trends, 4-week trends
|
||||
- **Status triggers**: Germination, rapid growth, slow growth, non-uniform, weed pressure, harvest imminence
|
||||
- **Farm-level KPIs**: 6 high-level indicators for executive summary
|
||||
- **Parallel processing**: ~1000+ fields processed in <5 minutes
|
||||
|
||||
---
|
||||
|
||||
## Future Enhancements
|
||||
|
||||
- **Real-Time Monitoring**: Daily harvest probability updates integrated into web dashboard
|
||||
- **SAR Integration**: Radar satellite data (Sentinel-1) for all-weather monitoring
|
||||
- **IoT Sensors**: Ground-based soil moisture and weather integration
|
||||
- **Advanced Yield Models**: Enhanced harvest forecasting with satellite + ground truth
|
||||
- **Automated Alerts**: WhatsApp/email dispatch of critical agricultural advice
|
||||
|
|
@ -1,9 +1,9 @@
|
|||
<!-- filepath: c:\Users\timon\Resilience BV\4020 SCane ESA DEMO - Documenten\General\4020 SCDEMO Team\4020 TechnicalData\WP3\smartcane\r_app\system_architecture.md -->
|
||||
# SmartCane System Architecture - R Pipeline & File-Based Processing
|
||||
# SmartCane System Architecture - Python + R Pipeline & File-Based Processing
|
||||
|
||||
## Overview
|
||||
|
||||
The SmartCane system is a file-based agricultural intelligence platform that processes satellite imagery through a multi-stage R-script pipeline. Raw satellite imagery flows through sequential processing steps (CI extraction, growth model interpolation, mosaic creation, KPI analysis) with outputs persisted as GeoTIFFs, RDS files, and Excel/Word reports.
|
||||
The SmartCane system is a file-based agricultural intelligence platform that processes satellite imagery through sequential Python and R scripts. Raw satellite imagery is downloaded via the Planet API (Python), then flows through Python and R processing stages (CI extraction, growth model interpolation, mosaic creation, KPI analysis, harvest detection) with outputs persisted as GeoTIFFs, RDS files, Excel sheets, and Word reports. Harvest monitoring is performed via ML-based harvest detection (Python) using LSTM models trained on historical CI sequences.
|
||||
|
||||
## Processing Pipeline Overview
|
||||
|
||||
|
|
|
|||
Loading…
Reference in a new issue