Enhance harvest detection logic and testing framework

- Updated `detect_mosaic_mode` function to check for grid-size subdirectories in addition to tile-named files.
- Added comprehensive tests for DOY reset logic in `test_doy_logic.py`.
- Implemented feature extraction tests in `test_feature_extraction.py`.
- Created tests for growing window method in `test_growing_window_only.py`.
- Developed a complete model inference test in `test_model_inference.py`.
- Added a debug script for testing two-step refinement logic in `test_script22_debug.py`.
This commit is contained in:
Timon 2026-01-15 14:30:54 +01:00
parent d6448b8703
commit fabbf3214d
11 changed files with 1416 additions and 187 deletions

View file

@ -26,7 +26,9 @@ This is your GROUND TRUTH - compare all future predictions against this baseline
Usage:
python 01_harvest_baseline_prediction.py [project_name]
conda activate pytorch_gpu; cd "C:\Users\timon\Resilience BV\4020 SCane ESA DEMO - Documenten\General\4020 SCDEMO Team\4020 TechnicalData\WP3\smartcane_v2\smartcane\python_app"; python 22_harvest_baseline_prediction.py angata
conda activate pytorch_gpu
cd python_app
python 22_harvest_baseline_prediction.py angata
Examples:
python 01_harvest_baseline_prediction.py angata
@ -108,7 +110,7 @@ def main():
# [3/4] Run model predictions with two-step detection
print("\n[3/4] Running two-step harvest detection...")
print(" (Using threshold=0.3, consecutive_days=2 - tuned for Model 307 output)")
print(" (Using threshold=0.3, consecutive_days=2 - tuned baseline with DOY reset)")
refined_results = run_two_step_refinement(ci_data, model, config, scalers, device=device,
phase1_threshold=0.3, phase1_consecutive=2)

View file

@ -184,18 +184,47 @@ def compute_ci_features(ci_series: pd.Series, doy_series: pd.Series = None) -> p
return features.fillna(0)
def extract_features(data_df: pd.DataFrame, feature_names: List[str], ci_column: str = 'FitData') -> np.ndarray:
"""Extract and return specified features as numpy array."""
def extract_features(data_df: pd.DataFrame, feature_names: List[str], ci_column: str = 'FitData',
season_anchor_day: int = None, lookback_start: int = 0) -> np.ndarray:
"""
Extract and return specified features as numpy array.
Args:
data_df: DataFrame with Date and CI data (may be a window after a harvest)
feature_names: List of feature names to extract
ci_column: Name of CI column
season_anchor_day: Day in FULL sequence where this season started (for DOY reset)
DOY will be recalculated as: 1, 2, 3, ... from this point
lookback_start: Starting index in original full data (for season reset calculation)
Returns:
NumPy array of shape (len(data_df), len(feature_names))
"""
# Compute all CI features
ci_series = data_df[ci_column].astype(float)
doy_series = pd.Series(range(len(data_df)), index=data_df.index) % 365 if 'DOY_normalized' in feature_names else None
# Compute DOY (age/days since season start) - NOT day-of-year!
# DOY is a continuous counter: 1, 2, 3, ..., 475 (doesn't cycle at 365)
# It only resets to 1 after a harvest is detected (new season)
doy_series = None
if 'DOY_normalized' in feature_names:
if season_anchor_day is not None and lookback_start >= season_anchor_day:
# Season was reset after harvest. Recalculate DOY as simple counter from 1
# This is a window starting at or after harvest, so DOY should be: 1, 2, 3, ...
doy_series = pd.Series(np.arange(1, len(data_df) + 1), index=data_df.index)
elif 'DOY' in data_df.columns:
# Use DOY directly from CSV - already calculated as continuous age counter
doy_series = pd.Series(data_df['DOY'].astype(float).values, index=data_df.index)
else:
# Fallback: create continuous age counter (1, 2, 3, ...)
doy_series = pd.Series(np.arange(1, len(data_df) + 1), index=data_df.index)
all_features = compute_ci_features(ci_series, doy_series)
# Select requested features
requested = [f for f in feature_names if f in all_features.columns]
if not requested:
raise ValueError(f"No valid features found. Requested: {feature_names}")
raise ValueError(f"No valid features found. Requested: {feature_names}, Available: {all_features.columns.tolist()}")
return all_features[requested].values
@ -274,42 +303,63 @@ def load_harvest_data(data_file: Path) -> pd.DataFrame:
def run_phase1_growing_window(field_data, model, config, scalers, ci_column, device,
threshold=0.45, consecutive_days=2):
"""
Phase 1: Growing window detection with threshold crossing.
Expand window day-by-day, check last timestep's detected_prob.
When N consecutive days have prob > threshold, harvest detected.
Phase 1: Growing window detection with DOY season reset.
For each detected harvest, reset DOY counter for the next season.
This allows the model to detect multiple consecutive harvests in multi-year data.
Algorithm:
1. Start with season_anchor_day = 0 (DOY 1 at day 0)
2. Expand window: [0:1], [0:2], [0:3], ... until threshold crossed
3. When harvest detected: record date, set new season_anchor = day after harvest
4. Continue from next season start
Args:
threshold (float): Probability threshold (default 0.45, tuned for Model 307)
consecutive_days (int): Required consecutive days above threshold (default 2, reduced from 3 for robustness)
threshold (float): Probability threshold (default 0.45)
consecutive_days (int): Required consecutive days above threshold (default 2)
Returns list of (harvest_date, harvest_idx) tuples.
"""
harvest_dates = []
season_anchor_day = 0 # DOY 1 starts at day 0
current_pos = 0
while current_pos < len(field_data):
consecutive_above_threshold = 0
min_window_size = 120 # Need at least 120 days (~4 months) for patterns to establish
for window_end in range(current_pos + 1, len(field_data) + 1):
window_data = field_data.iloc[current_pos:window_end].copy().reset_index(drop=True)
# Skip if window is too small (model needs long sequences for pattern learning)
if len(window_data) < min_window_size:
continue
try:
features = extract_features(window_data, config['features'], ci_column=ci_column)
# CRITICAL: Pass season_anchor_day so DOY resets after harvest
features = extract_features(
window_data,
config['features'],
ci_column=ci_column,
season_anchor_day=season_anchor_day,
lookback_start=current_pos
)
# Apply scalers
features_scaled = features.copy().astype(float)
for fi, scaler in enumerate(scalers):
try:
features[:, fi] = scaler.transform(features[:, fi].reshape(-1, 1)).flatten()
features_scaled[:, fi] = scaler.transform(features[:, fi].reshape(-1, 1)).flatten()
except Exception:
pass
# Run model
# Run model on expanding window
with torch.no_grad():
x_tensor = torch.tensor(features, dtype=torch.float32).unsqueeze(0).to(device)
x_tensor = torch.tensor(features_scaled, dtype=torch.float32).unsqueeze(0).to(device)
imminent_probs, detected_probs = model(x_tensor)
# Check LAST timestep
last_prob = detected_probs[-1]
# Check LAST timestep only
last_prob = detected_probs[0, -1].item()
if last_prob > threshold:
consecutive_above_threshold += 1
@ -318,16 +368,21 @@ def run_phase1_growing_window(field_data, model, config, scalers, ci_column, dev
# Harvest detected: N consecutive days above threshold
if consecutive_above_threshold >= consecutive_days:
harvest_date = field_data.iloc[current_pos + window_end - consecutive_days]['Date']
harvest_dates.append((harvest_date, current_pos + window_end - consecutive_days))
harvest_idx = current_pos + window_end - consecutive_days
harvest_date = field_data.iloc[harvest_idx]['Date']
harvest_dates.append((harvest_date, harvest_idx))
# Reset to next day after harvest
current_pos = current_pos + window_end - consecutive_days + 1
# CRITICAL: Reset season anchor for next season
# DOY 1 starts at day after harvest
season_anchor_day = harvest_idx + 1
current_pos = harvest_idx + 1
break
except Exception:
except Exception as e:
# Skip window on error
continue
else:
# No more harvests found
break
return harvest_dates
@ -413,7 +468,9 @@ def run_two_step_refinement(df: pd.DataFrame, model, config, scalers, device=Non
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
results = []
ci_column = config['data']['ci_column']
# CI column is 'FitData' (interpolated CI) - NOT 'value' (raw with NAs)
# 'FitData' is already gap-filled by R stage 03, ready for ML
ci_column = 'FitData'
# Group by field and count total fields for progress
field_groups = list(df.groupby('field'))

View file

@ -0,0 +1,65 @@
"""
Test DOY reset logic for harvest detection.
Verify that DOY resets to 1, 2, 3, ... after harvest is detected.
"""
import sys
from pathlib import Path
sys.path.insert(0, str(Path.cwd()))
import pandas as pd
import numpy as np
from harvest_date_pred_utils import extract_features, load_model_and_config
import torch
# Load sample data
ci_data = pd.read_csv('../laravel_app/storage/app/angata/Data/extracted_ci/ci_data_for_python/ci_data_for_python.csv')
ci_data['Date'] = pd.to_datetime(ci_data['Date'])
# Get field 779 data
field_779 = ci_data[ci_data['field'] == '779'].reset_index(drop=True)
print(f"Field 779: {len(field_779)} days of data")
print(f"Date range: {field_779['Date'].min().date()} to {field_779['Date'].max().date()}\n")
# Load model config
model, config, scalers = load_model_and_config(Path.cwd())
# Test 1: First season (season_anchor_day = 0)
print("=" * 80)
print("TEST 1: First season (season_anchor_day=0, lookback_start=0)")
print("=" * 80)
window = field_779.iloc[0:20].copy().reset_index(drop=True)
features = extract_features(window, config['features'], ci_column='FitData',
season_anchor_day=0, lookback_start=0)
# Extract DOY_normalized column (index 13 or find it)
feature_names = config['features']
doy_idx = feature_names.index('DOY_normalized') if 'DOY_normalized' in feature_names else -1
if doy_idx >= 0:
doy_values = (features[:, doy_idx] * 450).astype(int) # Denormalize
print(f"Window size: {len(window)} days")
print(f"DOY values: {doy_values[:10]}")
print(f"Expected: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]")
assert list(doy_values[:10]) == [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], "DOY not incrementing correctly!"
print("✓ DOY incrementing correctly for first season\n")
# Test 2: After harvest detected at day 100, next season starts
print("=" * 80)
print("TEST 2: After harvest at day 100, new season starts (season_anchor_day=101, lookback_start=101)")
print("=" * 80)
harvest_day = 100
window = field_779.iloc[harvest_day:harvest_day+20].copy().reset_index(drop=True)
features = extract_features(window, config['features'], ci_column='FitData',
season_anchor_day=harvest_day+1, lookback_start=harvest_day+1)
if doy_idx >= 0:
doy_values = (features[:, doy_idx] * 450).astype(int) # Denormalize
print(f"Window size: {len(window)} days (starting at day {harvest_day})")
print(f"DOY values: {doy_values[:10]}")
print(f"Expected: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10] (fresh season)")
assert list(doy_values[:10]) == [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], "DOY not reset after harvest!"
print("✓ DOY reset correctly for new season\n")
print("=" * 80)
print("ALL TESTS PASSED! DOY logic is correct.")
print("=" * 80)

View file

@ -0,0 +1,73 @@
#!/usr/bin/env python3
"""
Quick test: Verify feature extraction works

Debug script: loads the model config and project CI data, then runs
extract_features() on a fixed-size window and on several growing windows,
printing shapes and value ranges so failures are easy to spot.
"""
import sys
import pandas as pd
import numpy as np
from pathlib import Path
# Make harvest_date_pred_utils importable from the script's own directory.
sys.path.insert(0, str(Path(__file__).parent))
from harvest_date_pred_utils import extract_features, load_model_and_config
project_name = "angata"
# Data lives under the Laravel storage tree, relative to the CWD.
base_storage = Path("../laravel_app/storage/app") / project_name / "Data"
CI_DATA_FILE = base_storage / "extracted_ci" / "ci_data_for_python" / "ci_data_for_python.csv"
print("="*80)
print("DEBUG: Feature Extraction Test")
print("="*80)
# Load model config (only config['features'] is used below)
print("\n[1] Loading model config...")
model, config, scalers = load_model_and_config(Path("."))
print(f" Config features: {config['features']}")
print(f" Number of features: {len(config['features'])}")
# Load CI data; force the field column to str so ids compare consistently
print("\n[2] Loading CI data...")
ci_data = pd.read_csv(CI_DATA_FILE, dtype={'field': str})
ci_data['Date'] = pd.to_datetime(ci_data['Date'])
print(f" Columns: {ci_data.columns.tolist()}")
print(f" Total rows: {len(ci_data)}")
# Test on a single field
test_field = "1"
field_data = ci_data[ci_data['field'] == test_field].sort_values('Date').reset_index(drop=True)
print(f"\n[3] Testing on field {test_field}...")
print(f" Data points: {len(field_data)}")
print(f" Date range: {field_data['Date'].min().date()} to {field_data['Date'].max().date()}")
print(f" Columns in field data: {field_data.columns.tolist()}")
print(f" Sample values:")
print(field_data[['Date', 'value']].head())
# Test feature extraction on first 50 days; print traceback instead of crashing
print(f"\n[4] Extracting features for first 50 days...")
try:
    subset = field_data.iloc[:50].copy()
    features = extract_features(subset, config['features'], ci_column='value')
    print(f" ✓ Success!")
    print(f" Feature shape: {features.shape}")
    print(f" Expected shape: (50, {len(config['features'])})")
    print(f" Feature values sample (first 5 days):")
    for i in range(min(5, features.shape[0])):
        print(f" Day {i}: {features[i]}")
except Exception as e:
    print(f" ✗ Error: {e}")
    import traceback
    traceback.print_exc()
# Repeat on several window sizes to mimic the growing-window detector.
print("\n[5] Testing on growing windows...")
try:
    for window_size in [10, 20, 30, 50]:
        window_data = field_data.iloc[:window_size].copy()
        features = extract_features(window_data, config['features'], ci_column='value')
        print(f" Window size {window_size}: shape={features.shape}, min={features.min():.4f}, max={features.max():.4f}")
except Exception as e:
    print(f" ✗ Error: {e}")
    import traceback
    traceback.print_exc()
print("\n✓ Feature extraction test complete")

View file

@ -0,0 +1,141 @@
#!/usr/bin/env python3
"""
Test ONLY the growing window method (what production actually uses)
Never run model on full sequence - only on expanding windows
This matches real deployment where data arrives daily
"""
import sys
import pandas as pd
import numpy as np
import torch
from pathlib import Path
sys.path.insert(0, str(Path(__file__).parent.parent.parent))
from harvest_date_pred_utils import (
load_model_and_config,
extract_features,
)
project_name = "angata"
# Find root by walking up until we find laravel_app
script_dir = Path(__file__).parent
root = script_dir
while root != root.parent: # Stop at filesystem root
if (root / "laravel_app").exists():
break
root = root.parent
base_storage = root / "laravel_app" / "storage" / "app" / project_name / "Data"
CI_DATA_FILE = base_storage / "extracted_ci" / "ci_data_for_python" / "ci_data_for_python.csv"
MODEL_DIR = root / "python_app"
print("="*80)
print("GROWING WINDOW METHOD ONLY (Real Production Simulation)")
print("="*80)
# Load model
print("\n[1] Loading model...")
model, config, scalers = load_model_and_config(MODEL_DIR)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Load CI data
print("\n[2] Loading CI data...")
ci_data = pd.read_csv(CI_DATA_FILE, dtype={'field': str})
ci_data['Date'] = pd.to_datetime(ci_data['Date'])
# Test on field 779
test_field = "779"
field_data = ci_data[ci_data['field'] == test_field].sort_values('Date').reset_index(drop=True)
print(f"\n[3] Field {test_field}: {len(field_data)} data points")
print(f" Date range: {field_data['Date'].min().date()} to {field_data['Date'].max().date()}")
# Simulate growing window (real production)
print(f"\n[4] Simulating growing window (expanding daily)...")
harvest_dates = []
current_pos = 0
consecutive_above = 0
threshold = 0.3
consecutive_days = 2
model_runs = 0
while current_pos < len(field_data):
consecutive_above = 0
found_harvest = False
for window_end in range(current_pos + 1, len(field_data) + 1):
# Expand window: current_pos to window_end
window_data = field_data.iloc[current_pos:window_end].copy().reset_index(drop=True)
try:
# Extract features for THIS window only
features = extract_features(window_data, config['features'], ci_column='value')
# Normalize
features_scaled = features.copy().astype(float)
for fi, scaler in enumerate(scalers):
features_scaled[:, fi] = scaler.transform(features[:, fi].reshape(-1, 1)).flatten()
# Run model on expanding window
with torch.no_grad():
x_tensor = torch.tensor(features_scaled, dtype=torch.float32).unsqueeze(0).to(device)
imminent_probs, detected_probs = model(x_tensor)
model_runs += 1
# Check LAST timestep only
last_prob = detected_probs[0, -1].item()
last_date = window_data.iloc[-1]['Date'].date()
# Print every 50th window to track progress
if window_end % 50 == 0 or window_end < 10:
print(f" Window [{current_pos:3d}:{window_end:3d}] ({last_date}): prob={last_prob:.4f}", end="")
if last_prob > threshold:
print(" ✓ ABOVE", end="")
print()
# Check threshold
if last_prob > threshold:
consecutive_above += 1
else:
consecutive_above = 0
# Harvest detected
if consecutive_above >= consecutive_days:
harvest_idx = current_pos + window_end - consecutive_days
harvest_date = field_data.iloc[harvest_idx]['Date']
harvest_dates.append((harvest_date, harvest_idx, last_prob))
print(f"\n ✓ HARVEST DETECTED at {harvest_date.date()} (index {harvest_idx})")
print(f" {consecutive_days} consecutive days above {threshold}")
# Jump past this harvest
current_pos = harvest_idx + 1
found_harvest = True
break
except Exception as e:
print(f" ERROR at window [{current_pos}:{window_end}]: {e}")
continue
if not found_harvest:
break
print(f"\n[5] Results:")
print(f" Total model runs: {model_runs}")
print(f" Harvests found: {len(harvest_dates)}")
if harvest_dates:
print(f"\n Harvest dates:")
for date, idx, prob in harvest_dates:
print(f" {date.date()}: index {idx}, last_prob={prob:.4f}")
else:
print(f"\n No harvests detected")
print(f"\n[6] Analysis:")
print(f" Model runs per day: {model_runs / len(field_data):.2f}x")
print(f" Expected: ~{len(field_data):.0f} runs (one per day)")

View file

@ -0,0 +1,123 @@
#!/usr/bin/env python3
"""
Complete test: Feature extraction + Model inference + Phase 1 detection
"""
import sys
import pandas as pd
import numpy as np
import torch
from pathlib import Path
sys.path.insert(0, str(Path(__file__).parent))
from harvest_date_pred_utils import (
load_model_and_config,
extract_features,
run_phase1_growing_window
)
project_name = "angata"
base_storage = Path("../laravel_app/storage/app") / project_name / "Data"
CI_DATA_FILE = base_storage / "extracted_ci" / "ci_data_for_python" / "ci_data_for_python.csv"
print("="*80)
print("DEBUG: Model Inference + Phase 1 Detection")
print("="*80)
# Load model
print("\n[1] Loading model...")
model, config, scalers = load_model_and_config(Path("."))
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f" Device: {device}")
print(f" Scalers type: {type(scalers)}")
print(f" Number of scalers: {len(scalers) if isinstance(scalers, list) else 'N/A (dict/object)'}")
# Load CI data
print("\n[2] Loading CI data...")
ci_data = pd.read_csv(CI_DATA_FILE, dtype={'field': str})
ci_data['Date'] = pd.to_datetime(ci_data['Date'])
# Test on a known field (field 1)
test_field = "1"
field_data = ci_data[ci_data['field'] == test_field].sort_values('Date').reset_index(drop=True)
print(f"\n[3] Testing on field {test_field}...")
print(f" Data points: {len(field_data)}")
# Test with first 100 days
subset_100 = field_data.iloc[:100].copy().reset_index(drop=True)
print(f"\n[4] Testing model inference on first 100 days...")
try:
features = extract_features(subset_100, config['features'], ci_column='value')
print(f" Features shape: {features.shape}")
print(f" Features dtype: {features.dtype}")
# Apply scalers
features_scaled = features.copy().astype(float)
print(f" Applying {len(scalers)} scalers...")
for fi, scaler in enumerate(scalers):
try:
col_data = features[:, fi].reshape(-1, 1)
scaled_col = scaler.transform(col_data)
features_scaled[:, fi] = scaled_col.flatten()
if fi < 3: # Show first 3 scalers
print(f" Scaler {fi}: transformed {features[0, fi]:.4f}{features_scaled[0, fi]:.4f}")
except Exception as e:
print(f" ERROR in scaler {fi}: {e}")
raise
# Run model
print(f"\n Running model inference...")
x_tensor = torch.tensor(features_scaled, dtype=torch.float32).unsqueeze(0).to(device)
print(f" Tensor shape: {x_tensor.shape}")
with torch.no_grad():
imminent_probs, detected_probs = model(x_tensor)
print(f" Imminent probs shape: {imminent_probs.shape}")
print(f" Detected probs shape: {detected_probs.shape}")
print(f" Detected probs dtype: {detected_probs.dtype}")
# Analyze detected probs
detected_np = detected_probs[0].cpu().numpy() # Get first (only) batch
print(f"\n Detected head analysis:")
print(f" Min: {detected_np.min():.4f}")
print(f" Max: {detected_np.max():.4f}")
print(f" Mean: {detected_np.mean():.4f}")
print(f" Median: {np.median(detected_np):.4f}")
print(f" > 0.1: {(detected_np > 0.1).sum()} days")
print(f" > 0.3: {(detected_np > 0.3).sum()} days")
print(f" > 0.5: {(detected_np > 0.5).sum()} days")
# Show top 5 peaks
top_indices = np.argsort(detected_np)[-5:][::-1]
print(f"\n Top 5 detected peaks:")
for idx in top_indices:
date = subset_100.iloc[idx]['Date'].date()
prob = detected_np[idx]
print(f" Day {idx} ({date}): {prob:.4f}")
except Exception as e:
print(f" ERROR: {e}")
import traceback
traceback.print_exc()
sys.exit(1)
# Test Phase 1 growing window
print(f"\n[5] Testing Phase 1 growing window (threshold=0.3, consecutive=2)...")
try:
phase1_results = run_phase1_growing_window(
subset_100, model, config, scalers, 'value', device,
threshold=0.3, consecutive_days=2
)
print(f" ✓ Phase 1 found {len(phase1_results)} harvest(s):")
for harvest_date, harvest_idx in phase1_results:
print(f" {harvest_date.date()}: index {harvest_idx}")
except Exception as e:
print(f" ERROR: {e}")
import traceback
traceback.print_exc()
print("\n✓ Model inference test complete")

View file

@ -0,0 +1,178 @@
#!/usr/bin/env python3
"""
Debug script: Test if script 22 logic is working
Tests the two-step refinement on a single known field
"""
import sys
import time
import pandas as pd
import numpy as np
import torch
from pathlib import Path
sys.path.insert(0, str(Path(__file__).parent.parent.parent))
from harvest_date_pred_utils import (
load_model_and_config,
extract_features,
run_phase1_growing_window,
)
project_name = "angata"
# Find the workspace root by looking for laravel_app folder
script_dir = Path(__file__).parent
root = script_dir
while root != root.parent:
if (root / "laravel_app").exists():
break
root = root.parent
base_storage = root / "laravel_app" / "storage" / "app" / project_name / "Data"
CI_DATA_FILE = base_storage / "extracted_ci" / "ci_data_for_python" / "ci_data_for_python.csv"
MODEL_DIR = root / "python_app"
print("="*80)
print("DEBUG: Script 22 Two-Step Refinement Logic")
print("="*80)
# Load model
print("\n[1] Loading model...")
model, config, scalers = load_model_and_config(MODEL_DIR)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f" Device: {device}")
print(f" Model features: {config['features']}")
# Load CI data
print("\n[2] Loading CI data...")
ci_data = pd.read_csv(CI_DATA_FILE, dtype={'field': str})
ci_data['Date'] = pd.to_datetime(ci_data['Date'])
print(f" Total rows: {len(ci_data)}")
print(f" Fields: {ci_data['field'].nunique()}")
print(f" Date range: {ci_data['Date'].min().date()} to {ci_data['Date'].max().date()}")
# Test on a known field (field 779 from our previous tests)
test_field = "779"
field_data = ci_data[ci_data['field'] == test_field].sort_values('Date').reset_index(drop=True)
print(f"\n[3] Testing on field {test_field}...")
print(f" Data points: {len(field_data)}")
print(f" Date range: {field_data['Date'].min().date()} to {field_data['Date'].max().date()}")
if len(field_data) == 0:
print(f" ERROR: No data for field {test_field}")
sys.exit(1)
# Extract features
print(f"\n[4] Extracting features for field {test_field}...")
try:
features = extract_features(field_data.reset_index(drop=True), config['features'], ci_column='value')
print(f" Features shape: {features.shape}")
print(f" Features dtype: {features.dtype}")
except Exception as e:
print(f" ERROR: Could not extract features: {e}")
sys.exit(1)
# Normalize and run model
print(f"\n[5] Running Phase 1 GROWING WINDOW method (threshold=0.5, consecutive=3)...")
print(f" This simulates real production: expanding windows, checking each day")
print(f" Expected: ~477 model runs for 477 days (SLOW)")
import time
start_time = time.time()
# Add instrumentation to see how many model runs are happening
original_run = run_phase1_growing_window
def instrumented_run(*args, **kwargs):
    """
    Instrumented re-implementation of run_phase1_growing_window that counts
    and reports how many model forward passes the growing-window loop performs.

    Positional args (same order as run_phase1_growing_window):
        field_data, model, config, scalers, ci_column, device
    Keyword args:
        threshold (float): probability threshold for the "detected" head (default 0.3)
        consecutive_days (int): consecutive days above threshold required (default 2)

    Returns:
        list of (harvest_date, harvest_idx) tuples, one per detected harvest.
    """
    # Function-local imports keep this debug copy self-contained; hoisted out of
    # the loop (the original re-ran `import torch` on every window).
    import torch
    from harvest_date_pred_utils import extract_features

    field_data, model, config, scalers, ci_column, device = args[:6]
    threshold = kwargs.get('threshold', 0.3)
    consecutive_days = kwargs.get('consecutive_days', 2)
    harvest_dates = []
    current_pos = 0
    model_runs = 0
    print(f" Starting growing window loop...")
    while current_pos < len(field_data):
        consecutive_above_threshold = 0
        for window_end in range(current_pos + 1, len(field_data) + 1):
            # Expanding window [current_pos:window_end] simulates daily data arrival.
            window_data = field_data.iloc[current_pos:window_end].copy().reset_index(drop=True)
            try:
                features = extract_features(window_data, config['features'], ci_column=ci_column)
                # Scale each feature column with its fitted scaler; a failing
                # scaler raises, which the outer handler treats as a bad window.
                features_scaled = features.copy().astype(float)
                for fi, scaler in enumerate(scalers):
                    try:
                        features_scaled[:, fi] = scaler.transform(features[:, fi].reshape(-1, 1)).flatten()
                    except Exception as e:
                        raise ValueError(f"Scaler {fi} failed: {e}")
                with torch.no_grad():
                    x_tensor = torch.tensor(features_scaled, dtype=torch.float32).unsqueeze(0).to(device)
                    imminent_probs, detected_probs = model(x_tensor)
                model_runs += 1
                # Only the last timestep's "detected" probability matters.
                last_prob = detected_probs[0, -1].item()
                if last_prob > threshold:
                    consecutive_above_threshold += 1
                else:
                    consecutive_above_threshold = 0
                if consecutive_above_threshold >= consecutive_days:
                    # Date the harvest at the FIRST day of the consecutive run,
                    # then restart the window just after it.
                    harvest_idx = current_pos + window_end - consecutive_days
                    harvest_dates.append((field_data.iloc[harvest_idx]['Date'], harvest_idx))
                    current_pos = harvest_idx + 1
                    break
            except Exception:
                # Best-effort: skip a failing window (matches production behavior).
                continue
        else:
            # Inner loop exhausted without a detection: no more harvests to find.
            break
    print(f" Model runs performed: {model_runs}")
    return harvest_dates
# Run the instrumented detector with stricter settings than production defaults.
phase1_results = instrumented_run(
    field_data.reset_index(drop=True),
    model, config, scalers, 'value', device,
    threshold=0.5,
    consecutive_days=3
)
elapsed = time.time() - start_time
print(f"\n Time elapsed: {elapsed:.2f}s")
if phase1_results:
    print(f" ✓ Phase 1 detected {len(phase1_results)} harvest(s):")
    # Get probabilities for display by running model once on full field
    # (reuses `features` extracted in step [4] above)
    with torch.no_grad():
        X = features.reshape(1, -1, len(config['features']))
        X_normalized = np.zeros_like(X)
        for fi, scaler in enumerate(scalers):
            X_normalized[0, :, fi] = scaler.transform(X[0, :, fi].reshape(-1, 1)).flatten()
        X_tensor = torch.from_numpy(X_normalized).float().to(device)
        _, detected_probs = model(X_tensor)
        detected_np = detected_probs[0].cpu().numpy()
    for harvest_date, harvest_idx in phase1_results:
        # Guard the index: the display probability defaults to 0.0 out of range
        prob = detected_np[harvest_idx] if harvest_idx < len(detected_np) else 0.0
        print(f" {harvest_date.date()}: index {harvest_idx}, probability={prob:.4f}")
else:
    print(f" ✗ Phase 1: No harvest detected")

View file

@ -14,8 +14,8 @@
# - tile_size: Tile size in km (default: 5, only used if use_tiles=TRUE)
#
# Examples:
# Rscript 04_mosaic_creation.R 2025-12-21 7 angata
# Rscript 04_mosaic_creation.R 2025-12-21 7 angata week_51.tif TRUE 5 [tile-based]
# & "C:\Program Files\R\R-4.4.3\bin\x64\Rscript.exe" r_app/40_mosaic_creation.R 2026-01-12 7 angata
#
# 1. Load required packages
@ -142,15 +142,35 @@ main <- function() {
if (use_tile_mosaic) {
# TILE-BASED APPROACH: Create per-tile weekly MAX mosaics
# This is used for projects like Angata with large ROIs requiring spatial partitioning
# Input data comes from merged_final_tif/{grid_size}/{DATE}/{DATE}_XX.tif (5-band tiles from script 20)
tryCatch({
safe_log("Starting per-tile mosaic creation (tile-based approach)...")
# Set output directory for per-tile mosaics
tile_output_base <- file.path(laravel_storage, "weekly_tile_max")
# Detect grid size from merged_final_tif folder structure
# Expected: merged_final_tif/5x5/ or merged_final_tif/10x10/ etc.
merged_final_base <- file.path(laravel_storage, "merged_final_tif")
grid_subfolders <- list.dirs(merged_final_base, full.names = FALSE, recursive = FALSE)
# Look for grid size patterns like "5x5", "10x10", "20x20"
grid_patterns <- grep("^\\d+x\\d+$", grid_subfolders, value = TRUE)
if (length(grid_patterns) == 0) {
stop("No grid size subfolder found in merged_final_tif/ (expected: 5x5, 10x10, etc.)")
}
grid_size <- grid_patterns[1] # Use first grid size found
safe_log(paste("Detected grid size:", grid_size))
# Point to the grid-specific merged_final_tif directory
merged_final_with_grid <- file.path(merged_final_base, grid_size)
# Set output directory for per-tile mosaics, organized by grid size
# Output: weekly_tile_max/{grid_size}/week_WW_YYYY_TT.tif
tile_output_base <- file.path(laravel_storage, "weekly_tile_max", grid_size)
dir.create(tile_output_base, recursive = TRUE, showWarnings = FALSE)
created_tile_files <- create_weekly_mosaic_from_tiles(
dates = dates,
merged_final_dir = merged_final,
merged_final_dir = merged_final_with_grid,
tile_output_dir = tile_output_base,
field_boundaries = field_boundaries
)

View file

@ -354,18 +354,17 @@ create_mosaic <- function(tif_files, cloud_coverage_stats, field_boundaries = NU
}
# Get filenames of best-coverage images
# Match by extracting tile ID from both cloud stats and TIF filenames
# Match by full filename from cloud stats to TIF files
rasters_to_use <- character()
for (idx in best_coverage) {
# Extract tile ID from cloud_coverage_stats filename (e.g., "tile_18" → 18)
# Get the full filename from cloud coverage stats
cc_filename <- cloud_coverage_stats$filename[idx]
cc_tile_id <- gsub(".*_([0-9]+).*", "\\1", cc_filename)
# Find matching TIF file by matching tile ID
# Find matching TIF file by full filename
matching_tif <- NULL
for (tif_file in tif_files) {
tif_tile_id <- gsub(".*_([0-9]+)\\.tif", "\\1", basename(tif_file))
if (tif_tile_id == cc_tile_id) {
tif_basename <- basename(tif_file)
if (tif_basename == cc_filename) {
matching_tif <- tif_file
break
}
@ -373,6 +372,8 @@ create_mosaic <- function(tif_files, cloud_coverage_stats, field_boundaries = NU
if (!is.null(matching_tif)) {
rasters_to_use <- c(rasters_to_use, matching_tif)
} else {
safe_log(paste("Warning: Could not find TIF file matching cloud stats entry:", cc_filename), "WARNING")
}
}
@ -420,42 +421,60 @@ create_mosaic <- function(tif_files, cloud_coverage_stats, field_boundaries = NU
mosaic <- tryCatch({
safe_log(paste("Creating max composite from", length(all_rasters), "images to fill clouds"))
# Get extent from field boundaries if available, otherwise use raster intersection
if (!is.null(field_boundaries)) {
crop_extent <- terra::ext(field_boundaries)
safe_log("Using field boundaries extent for consistent area across all dates")
# Check if all rasters have identical grids (extent and resolution)
# This is likely for per-tile mosaics from the same tiling scheme
reference_raster <- all_rasters[[1]]
ref_ext <- terra::ext(reference_raster)
ref_res <- terra::res(reference_raster)
grids_match <- all(sapply(all_rasters[-1], function(r) {
isTRUE(all.equal(terra::ext(r), ref_ext, tolerance = 1e-6)) &&
isTRUE(all.equal(terra::res(r), ref_res, tolerance = 1e-6))
}))
if (grids_match) {
# All rasters have matching grids - no cropping/resampling needed!
safe_log("All rasters have identical grids - stacking directly for max composite")
raster_collection <- terra::sprc(all_rasters)
max_mosaic <- terra::mosaic(raster_collection, fun = "max")
} else {
# Fallback: use intersection of all raster extents
crop_extent <- terra::ext(all_rasters[[1]])
for (i in 2:length(all_rasters)) {
crop_extent <- terra::intersect(crop_extent, terra::ext(all_rasters[[i]]))
# Grids don't match - need to crop and resample
safe_log("Rasters have different grids - cropping and resampling to common extent")
# Get extent from field boundaries if available, otherwise use raster union
if (!is.null(field_boundaries)) {
crop_extent <- terra::ext(field_boundaries)
safe_log("Using field boundaries extent for consistent area across all dates")
} else {
# Use union of all extents (covers all data)
crop_extent <- terra::ext(all_rasters[[1]])
for (i in 2:length(all_rasters)) {
crop_extent <- terra::union(crop_extent, terra::ext(all_rasters[[i]]))
}
safe_log("Using raster union extent")
}
safe_log("Using raster intersection extent")
# Crop all rasters to common extent
cropped_rasters <- lapply(all_rasters, function(r) {
terra::crop(r, crop_extent)
})
# Resample all cropped rasters to match the first one's grid
reference_grid <- cropped_rasters[[1]]
aligned_rasters <- lapply(cropped_rasters, function(r) {
if (isTRUE(all.equal(terra::ext(r), terra::ext(reference_grid), tolerance = 1e-6)) &&
isTRUE(all.equal(terra::res(r), terra::res(reference_grid), tolerance = 1e-6))) {
return(r) # Already aligned
}
terra::resample(r, reference_grid, method = "near")
})
# Create max composite using mosaic on aligned rasters
raster_collection <- terra::sprc(aligned_rasters)
max_mosaic <- terra::mosaic(raster_collection, fun = "max")
}
# Crop all rasters to common extent
cropped_rasters <- lapply(all_rasters, function(r) {
terra::crop(r, crop_extent)
})
# Resample all cropped rasters to match the first one's grid
# This handles pixel grid misalignment from Python's dynamic extent adjustment
reference_grid <- cropped_rasters[[1]]
safe_log("Resampling rasters to common grid for stacking")
aligned_rasters <- lapply(cropped_rasters, function(r) {
if (identical(terra::ext(r), terra::ext(reference_grid)) &&
identical(terra::res(r), terra::res(reference_grid))) {
return(r) # Already aligned
}
terra::resample(r, reference_grid, method = "near")
})
# Create max composite using mosaic on aligned rasters
# Resample ensures all rasters have matching grids (no resolution mismatch)
raster_collection <- terra::sprc(aligned_rasters)
max_mosaic <- terra::mosaic(raster_collection, fun = "max")
max_mosaic
}, error = function(e) {
safe_log(paste("Max composite creation failed:", e$message), "WARNING")
@ -686,6 +705,16 @@ create_weekly_mosaic_from_tiles <- function(dates, merged_final_dir, tile_output
next
}
# DEBUG: Check mosaic content before saving
safe_log(paste(" DEBUG: Mosaic tile", tile_id, "dimensions:", nrow(tile_mosaic), "x", ncol(tile_mosaic)))
safe_log(paste(" DEBUG: Mosaic tile", tile_id, "bands:", terra::nlyr(tile_mosaic)))
# Check first band values
band1 <- tile_mosaic[[1]]
band1_min <- terra::global(band1, fun = "min", na.rm = TRUE)$min
band1_max <- terra::global(band1, fun = "max", na.rm = TRUE)$max
safe_log(paste(" DEBUG: Band 1 MIN=", round(band1_min, 2), "MAX=", round(band1_max, 2)))
# Step 2c: Save this tile's weekly MAX mosaic
# Filename format: week_WW_YYYY_TT.tif (e.g., week_02_2026_01.tif for week 2, 2026, tile 1)
tile_filename <- paste0("week_", sprintf("%02d", dates$week), "_", dates$year, "_",
@ -763,7 +792,7 @@ count_cloud_coverage_for_tile <- function(tile_files, field_boundaries = NULL) {
missing_pct <- round(100 - ((total_notna / total_pixels) * 100))
aggregated_results[[idx]] <- data.frame(
filename = paste0("tile_", sprintf("%02d", as.integer(gsub(".*_([0-9]+)\\.tif", "\\1", basename(tile_file))))),
filename = basename(tile_file), # Keep full filename: 2026-01-07_03.tif
notNA = total_notna,
total_pixels = total_pixels,
missing_pixels_percentage = missing_pct,

View file

@ -20,6 +20,8 @@
# Option 2: Rscript 80_calculate_kpis.R 2026-01-14 angata 7
# Arguments: [end_date] [project_dir] [offset_days]
#
# & "C:\Program Files\R\R-4.4.3\bin\x64\Rscript.exe" r_app/80_calculate_kpis.R 2026-01-12 angata 7
#
# Usage in run_full_pipeline.R:
# source("r_app/80_calculate_kpis.R")
# main()
@ -122,7 +124,7 @@ STATUS_TRIGGERS <- data.frame(
# TILE-AWARE HELPER FUNCTIONS
# ============================================================================
get_tile_ids_for_field <- function(field_geom, tile_grid) {
get_tile_ids_for_field <- function(field_geom, tile_grid, field_id = NULL) {
if (inherits(field_geom, "sf")) {
field_bbox <- sf::st_bbox(field_geom)
field_xmin <- field_bbox["xmin"]
@ -139,6 +141,17 @@ get_tile_ids_for_field <- function(field_geom, tile_grid) {
stop("field_geom must be sf or terra::vect object")
}
# DEBUG: Print bbox info for first field
if (!is.null(field_id) && field_id == "1391") {
message(paste("[DEBUG get_tile_ids] Field bbox - xmin:", field_xmin, "xmax:", field_xmax,
"ymin:", field_ymin, "ymax:", field_ymax))
message(paste("[DEBUG get_tile_ids] tile_grid sample: id=", tile_grid$id[1],
"xmin=", tile_grid$xmin[1], "xmax=", tile_grid$xmax[1],
"ymin=", tile_grid$ymin[1], "ymax=", tile_grid$ymax[1]))
message(paste("[DEBUG get_tile_ids] tile_grid CRS:", sf::st_crs(tile_grid)))
message(paste("[DEBUG get_tile_ids] field CRS:", sf::st_crs(field_geom)))
}
intersecting_tiles <- tile_grid$id[
!(tile_grid$xmax < field_xmin |
tile_grid$xmin > field_xmax |
@ -189,6 +202,21 @@ load_tiles_for_field <- function(field_geom, tile_ids, week_num, year, mosaic_di
}
build_tile_grid <- function(mosaic_dir, week_num, year) {
# Handle grid-size subdirectories (e.g., weekly_tile_max/5x5/)
# First check if mosaic_dir contains grid-size subdirectories
detected_grid_size <- NA
if (dir.exists(mosaic_dir)) {
subfolders <- list.dirs(mosaic_dir, full.names = FALSE, recursive = FALSE)
grid_patterns <- grep("^\\d+x\\d+$", subfolders, value = TRUE)
if (length(grid_patterns) > 0) {
# Use the first grid-size subdirectory found
detected_grid_size <- grid_patterns[1]
mosaic_dir <- file.path(mosaic_dir, detected_grid_size)
message(paste(" Using grid-size subdirectory:", detected_grid_size))
}
}
tile_pattern <- sprintf("week_%02d_%d_([0-9]{2})\\.tif", week_num, year)
tile_files <- list.files(mosaic_dir, pattern = tile_pattern, full.names = TRUE)
@ -230,7 +258,12 @@ build_tile_grid <- function(mosaic_dir, week_num, year) {
stop("Could not extract extents from any tile files")
}
return(tile_grid)
# RETURN BOTH the grid AND the corrected mosaic directory path
return(list(
tile_grid = tile_grid,
mosaic_dir = mosaic_dir,
grid_size = detected_grid_size
))
}
# ============================================================================
@ -399,14 +432,20 @@ load_historical_field_data <- function(project_dir, current_week, reports_dir, n
USE_UNIFORM_AGE <- TRUE
UNIFORM_PLANTING_DATE <- as.Date("2025-01-01")
extract_planting_dates <- function(harvesting_data) {
extract_planting_dates <- function(harvesting_data, field_boundaries_sf = NULL) {
if (USE_UNIFORM_AGE) {
message(paste("Using uniform planting date for all fields:", UNIFORM_PLANTING_DATE))
return(data.frame(
field_id = character(),
planting_date = as.Date(character()),
stringsAsFactors = FALSE
))
# Return a data frame with all field IDs mapped to uniform planting date
if (!is.null(field_boundaries_sf)) {
return(data.frame(
field_id = field_boundaries_sf$field,
date = rep(UNIFORM_PLANTING_DATE, nrow(field_boundaries_sf)),
stringsAsFactors = FALSE
))
} else {
# Fallback if field_boundaries_sf not provided
return(NULL)
}
}
if (is.null(harvesting_data) || nrow(harvesting_data) == 0) {
@ -449,6 +488,11 @@ analyze_single_field <- function(field_idx, field_boundaries_sf, tile_grid, week
}
field_name <- field_id
# DEBUG: Print for first few fields
if (field_idx <= 3) {
message(paste("[DEBUG] Field", field_idx, ":", field_id))
}
field_sf <- field_boundaries_sf[field_idx, ]
if (sf::st_is_empty(field_sf) || any(is.na(sf::st_geometry(field_sf)))) {
return(data.frame(
@ -460,7 +504,14 @@ analyze_single_field <- function(field_idx, field_boundaries_sf, tile_grid, week
field_area_ha <- as.numeric(sf::st_area(field_sf)) / 10000
field_area_acres <- field_area_ha / 0.404686
tile_ids <- get_tile_ids_for_field(field_sf, tile_grid)
tile_ids <- get_tile_ids_for_field(field_sf, tile_grid, field_id = field_id)
# DEBUG: Print tile IDs for first field
if (field_idx == 1) {
message(paste("[DEBUG] First field tile_ids:", paste(tile_ids, collapse=",")))
message(paste("[DEBUG] tile_grid nrows:", nrow(tile_grid), "ncols:", ncol(tile_grid)))
message(paste("[DEBUG] mosaic_dir:", mosaic_dir))
}
current_ci <- load_tiles_for_field(field_sf, tile_ids, week_num, year, mosaic_dir)
@ -471,10 +522,26 @@ analyze_single_field <- function(field_idx, field_boundaries_sf, tile_grid, week
))
}
field_vect <- terra::vect(sf::as_Spatial(field_sf))
terra::crs(field_vect) <- terra::crs(current_ci)
# Extract CI values: EXACTLY LIKE SCRIPT 20
# Crop to field bounding box first, then extract with sf directly (not terra::vect conversion)
field_bbox <- sf::st_bbox(field_sf)
ci_cropped <- terra::crop(current_ci, terra::ext(field_bbox), snap = "out")
extracted_vals <- terra::extract(ci_cropped, field_sf, fun = "mean", na.rm = TRUE)
all_extracted <- terra::extract(current_ci, field_vect)[, 2]
# extracted_vals is a data.frame with ID column (field index) + mean value
mean_ci_current <- as.numeric(extracted_vals[1, 2])
if (is.na(mean_ci_current)) {
return(data.frame(
Field_id = field_id,
error = "No CI values extracted from tiles"
))
}
# For per-tile extraction, we only have mean from the aggregation function
# To get variance/CV, we need to extract all pixels without the fun parameter
# But for farm-level purposes, the mean CI is sufficient
all_extracted <- terra::extract(ci_cropped, field_sf)[, 2]
current_ci_vals <- all_extracted[!is.na(all_extracted)]
num_total <- length(all_extracted)
@ -509,7 +576,9 @@ analyze_single_field <- function(field_idx, field_boundaries_sf, tile_grid, week
tryCatch({
previous_ci <- load_tiles_for_field(field_sf, tile_ids, week_num - 1, year, mosaic_dir)
if (!is.null(previous_ci)) {
prev_extracted <- terra::extract(previous_ci, field_vect)[, 2]
prev_bbox <- sf::st_bbox(field_sf)
prev_ci_cropped <- terra::crop(previous_ci, terra::ext(prev_bbox), snap = "out")
prev_extracted <- terra::extract(prev_ci_cropped, field_sf)[, 2]
previous_ci_vals <- prev_extracted[!is.na(prev_extracted)]
if (length(previous_ci_vals) > 0) {
mean_ci_previous <- mean(previous_ci_vals, na.rm = TRUE)
@ -763,6 +832,13 @@ generate_field_analysis_summary <- function(field_df) {
export_field_analysis_excel <- function(field_df, summary_df, project_dir, current_week, reports_dir) {
message("Exporting per-field analysis to Excel, CSV, and RDS...")
# Round all numeric columns to 2 decimals
field_df_rounded <- field_df %>%
mutate(across(where(is.numeric), ~ round(., 2)))
summary_df_rounded <- summary_df %>%
mutate(across(where(is.numeric), ~ round(., 2)))
output_subdir <- file.path(reports_dir, "kpis", "field_analysis")
if (!dir.exists(output_subdir)) {
dir.create(output_subdir, recursive = TRUE)
@ -773,16 +849,16 @@ export_field_analysis_excel <- function(field_df, summary_df, project_dir, curre
excel_path <- normalizePath(excel_path, winslash = "\\", mustWork = FALSE)
sheets <- list(
"Field Data" = field_df,
"Summary" = summary_df
"Field Data" = field_df_rounded,
"Summary" = summary_df_rounded
)
write_xlsx(sheets, excel_path)
message(paste("✓ Field analysis Excel exported to:", excel_path))
kpi_data <- list(
field_analysis = field_df,
field_analysis_summary = summary_df,
field_analysis = field_df_rounded,
field_analysis_summary = summary_df_rounded,
metadata = list(
current_week = current_week,
project = project_dir,
@ -798,84 +874,254 @@ export_field_analysis_excel <- function(field_df, summary_df, project_dir, curre
csv_filename <- paste0(project_dir, "_field_analysis_week", sprintf("%02d", current_week), ".csv")
csv_path <- file.path(output_subdir, csv_filename)
write_csv(field_df, csv_path)
write_csv(field_df_rounded, csv_path)
message(paste("✓ Field analysis CSV exported to:", csv_path))
return(list(excel = excel_path, rds = rds_path, csv = csv_path))
}
# ============================================================================
# FARM-LEVEL KPI CALCULATION (FROM OLD 09_CALCULATE_KPIS.R)
# TILE-BASED KPI EXTRACTION FUNCTION
# ============================================================================
calculate_field_kpis_from_tiles <- function(tile_dir, week_num, year, field_boundaries_sf, tile_grid) {
  # Extract per-field KPI statistics from all weekly tile mosaics.
  #
  # Loops through every tile file for the given week/year and extracts
  # field-level CI statistics from each tile in parallel (same furrr pattern
  # as extract_ci_from_tiles in the CI extraction pipeline), then combines
  # the per-tile results into a single data frame.
  #
  # Args:
  #   tile_dir:            Directory containing weekly tile mosaics
  #                        (filename format: week_WW_YYYY_TT.tif)
  #   week_num:            Week number of the mosaics to process
  #   year:                Year of the mosaics to process
  #   field_boundaries_sf: sf object with one row per field geometry
  #   tile_grid:           Tile grid metadata, passed through to the
  #                        per-tile worker
  #
  # Returns:
  #   Data frame of per-field, per-tile statistics (one row per field per
  #   tile it intersects), or NULL if no tiles were found or nothing could
  #   be extracted.
  message("Calculating field-level KPI statistics from tiles...")
  # Get all tile files for this week
  tile_pattern <- sprintf("week_%02d_%d_([0-9]{2})\\.tif", week_num, year)
  tile_files <- list.files(tile_dir, pattern = tile_pattern, full.names = TRUE)
  if (length(tile_files) == 0) {
    # BUG FIX: message() concatenates its arguments with no separator, so the
    # old call printed e.g. "No tiles found for week22026". Use paste() so the
    # log line is readable.
    message(paste("No tiles found for week", week_num, year))
    return(NULL)
  }
  # Process tiles in parallel using furrr (same as CI extraction)
  message(paste("Processing", length(tile_files), "tiles in parallel..."))
  field_kpi_list <- furrr::future_map(
    tile_files,
    ~ process_single_kpi_tile(
      tile_file = .,
      field_boundaries_sf = field_boundaries_sf,
      tile_grid = tile_grid
    ),
    .progress = TRUE,
    .options = furrr::furrr_options(seed = TRUE)
  )
  # Combine results from all tiles; a field can appear once per tile it spans
  field_kpi_stats <- dplyr::bind_rows(field_kpi_list)
  if (nrow(field_kpi_stats) == 0) {
    message("  No KPI data extracted from tiles")
    return(NULL)
  }
  message(paste("  Extracted KPI stats for", length(unique(field_kpi_stats$field)), "unique fields"))
  return(field_kpi_stats)
}
# Helper function to process a single tile (like process_single_tile in CI extraction)
# Helper function to process a single tile (like process_single_tile in CI extraction)
process_single_kpi_tile <- function(tile_file, field_boundaries_sf, tile_grid) {
  # Extract per-field CI statistics from one weekly tile mosaic.
  #
  # Args:
  #   tile_file:           Path to one week_WW_YYYY_TT.tif tile
  #   field_boundaries_sf: sf object with one row per field geometry
  #   tile_grid:           Tile grid metadata (currently unused here; kept for
  #                        interface parity with the CI-extraction worker)
  #
  # Returns:
  #   Data frame with one row per field that has data in this tile, with
  #   columns: field, tile_id, tile_file, mean_ci, cv_ci, min_ci, max_ci,
  #   n_pixels. Returns an empty data frame on error.
  tryCatch({
    tile_basename <- basename(tile_file)
    # Load tile raster
    tile_raster <- terra::rast(tile_file)
    # Get first band (CI band for weekly mosaics)
    ci_band <- tile_raster[[1]]
    # Crop to the fields' bounding box first to cut extraction cost
    field_bbox <- sf::st_bbox(field_boundaries_sf)
    ci_cropped <- terra::crop(ci_band, terra::ext(field_bbox), snap = "out")
    # BUG FIX: extract ALL pixels per field (no fun=) instead of fun = "mean".
    # The downstream aggregation in calculate_and_export_farm_kpis() expects
    # cv_ci, min_ci, max_ci and n_pixels columns, which a mean-only extraction
    # can never provide. The result is in LONG format: one row per pixel,
    # column 1 = ID (field index), column 2 = band value.
    extracted_vals <- terra::extract(ci_cropped, field_boundaries_sf)
    # Get tile ID from filename (e.g., week_02_2026_03.tif -> 3)
    tile_id_match <- as.numeric(sub(".*_(\\d{2})\\.tif$", "\\1", tile_basename))
    # Initialize results for this tile
    tile_results <- data.frame()
    for (field_idx in seq_len(nrow(field_boundaries_sf))) {
      field_id <- field_boundaries_sf$field[field_idx]
      # Select this field's pixels by the ID column, then drop NAs
      pixels <- as.numeric(extracted_vals[extracted_vals$ID == field_idx, 2])
      pixels <- pixels[!is.na(pixels)]
      # Skip if no data for this field in this tile (doesn't intersect it)
      if (length(pixels) == 0) {
        next
      }
      mean_ci <- mean(pixels)
      # Guard against division by zero; sd() of a single pixel is NA, which
      # propagates naturally into cv_ci.
      cv_ci <- if (mean_ci > 0) sd(pixels) / mean_ci else NA_real_
      tile_results <- rbind(tile_results, data.frame(
        field = field_id,
        tile_id = tile_id_match,
        tile_file = tile_basename,
        mean_ci = round(mean_ci, 4),
        cv_ci = round(cv_ci, 4),
        min_ci = min(pixels),
        max_ci = max(pixels),
        n_pixels = length(pixels),
        stringsAsFactors = FALSE
      ))
    }
    return(tile_results)
  }, error = function(e) {
    message(paste(" Warning: Error processing tile", basename(tile_file), ":", e$message))
    return(data.frame())
  })
}
calculate_and_export_farm_kpis <- function(report_date, project_dir, field_boundaries_sf,
harvesting_data, cumulative_CI_vals_dir,
weekly_CI_mosaic, reports_dir) {
weekly_CI_mosaic, reports_dir, current_week, year,
tile_grid, use_tile_mosaic = FALSE, tile_grid_size = "5x5") {
message("\n=== CALCULATING FARM-LEVEL KPIs ===")
message("(6 high-level KPI metrics)")
tryCatch({
source(here("r_app", "kpi_utils.R"))
}, error = function(e) {
message(paste("Warning: Could not load kpi_utils.R:", e$message))
message("Farm-level KPIs will be skipped")
return(NULL)
})
if (!exists("calculate_all_kpis")) {
message("Warning: calculate_all_kpis() function not found in kpi_utils.R")
return(NULL)
}
message("(6 high-level KPI metrics with tile-based extraction)")
output_dir <- file.path(reports_dir, "kpis")
if (!dir.exists(output_dir)) {
dir.create(output_dir, recursive = TRUE)
}
tryCatch({
kpi_results <- calculate_all_kpis(
report_date = report_date,
output_dir = output_dir,
field_boundaries_sf = field_boundaries_sf,
harvesting_data = harvesting_data,
cumulative_CI_vals_dir = cumulative_CI_vals_dir,
weekly_CI_mosaic = weekly_CI_mosaic,
reports_dir = reports_dir,
project_dir = project_dir
)
# Print KPI summary
cat("\n=== FARM-LEVEL KPI SUMMARY ===\n")
cat("Report Date:", as.character(kpi_results$metadata$report_date), "\n")
cat("Current Week:", kpi_results$metadata$current_week, "\n")
cat("Previous Week:", kpi_results$metadata$previous_week, "\n")
cat("Total Fields Analyzed:", kpi_results$metadata$total_fields, "\n")
cat("Calculation Time:", as.character(kpi_results$metadata$calculation_time), "\n")
cat("\n--- KPI Metrics ---\n")
cat("Field Uniformity Summary:\n")
print(kpi_results$field_uniformity_summary)
cat("\nArea Change Summary:\n")
print(kpi_results$area_change)
cat("\nTCH Forecasted:\n")
print(kpi_results$tch_forecasted)
cat("\nGrowth Decline Index:\n")
print(kpi_results$growth_decline)
cat("\nWeed Presence Score:\n")
print(kpi_results$weed_presence)
cat("\nGap Filling Score:\n")
print(kpi_results$gap_filling)
return(kpi_results)
}, error = function(e) {
message(paste("Error calculating farm-level KPIs:", e$message))
# Get mosaic directory with grid size if using tiles
mosaic_dir <- if (use_tile_mosaic && !is.null(tile_grid_size)) {
file.path(weekly_CI_mosaic, tile_grid_size)
} else {
weekly_CI_mosaic
}
# Extract field-level KPI statistics from tiles
field_kpi_stats <- calculate_field_kpis_from_tiles(
tile_dir = mosaic_dir,
week_num = current_week,
year = year,
field_boundaries_sf = field_boundaries_sf,
tile_grid = tile_grid
)
if (is.null(field_kpi_stats) || nrow(field_kpi_stats) == 0) {
message("Warning: No field KPI statistics extracted from tiles")
return(NULL)
})
}
# Aggregate tile-based statistics by field (average across tiles for each field)
field_summary_stats <- field_kpi_stats %>%
dplyr::group_by(field) %>%
dplyr::summarise(
mean_ci = mean(mean_ci, na.rm = TRUE),
cv_ci = mean(cv_ci, na.rm = TRUE),
min_ci = min(min_ci, na.rm = TRUE),
max_ci = max(max_ci, na.rm = TRUE),
total_pixels = sum(n_pixels, na.rm = TRUE),
num_tiles = n_distinct(tile_id),
.groups = 'drop'
)
# Create results list
kpi_results <- list(
field_kpi_stats = field_kpi_stats,
field_summary_stats = field_summary_stats,
metadata = list(
report_date = report_date,
current_week = current_week,
year = year,
calculation_method = "tile_based_extraction",
num_fields_processed = length(unique(field_kpi_stats$field)),
num_tiles_processed = length(unique(field_kpi_stats$tile_id))
)
)
# Save results
rds_filename <- paste0(project_dir, "_farm_kpi_stats_week", sprintf("%02d", current_week), ".rds")
rds_path <- file.path(output_dir, rds_filename)
saveRDS(kpi_results, rds_path)
message(paste("✓ Farm-level KPI stats exported to:", rds_path))
# Print summary
cat("\n=== FARM-LEVEL KPI SUMMARY ===\n")
cat("Report Date:", as.character(report_date), "\n")
cat("Week:", current_week, "Year:", year, "\n")
cat("Fields Processed:", length(unique(field_kpi_stats$field)), "\n")
cat("Tiles Processed:", length(unique(field_kpi_stats$tile_id)), "\n")
cat("\n--- Field Summary Statistics (Mean across tiles) ---\n")
print(head(field_summary_stats, 20))
return(kpi_results)
}
# ============================================================================
# HELPER: Extract field-level statistics from CI raster (all pixels, single call)
# ============================================================================
extract_field_statistics_from_ci <- function(ci_band, field_boundaries_sf) {
  #' Extract CI statistics for all fields from a single CI raster band
  #'
  #' This function extracts all pixel values for every field in one
  #' terra::extract call, then calculates mean, CV, and percentiles from
  #' those pixels. Percentile probabilities come from the module-level
  #' constants CI_PERCENTILE_LOW / CI_PERCENTILE_HIGH.
  #'
  #' @param ci_band Single CI band from terra raster
  #' @param field_boundaries_sf SF object with field geometries
  #' @return Data frame with columns: field_idx, mean_ci, cv, p10, p90, pixel_count
  # Extract all pixels for all fields at once (more efficient than individual
  # calls). NOTE: without a `fun` argument terra::extract() returns LONG
  # format — one row PER PIXEL, with column 1 = ID (feature index) and
  # column 2 = the band value.
  all_pixels <- terra::extract(ci_band, field_boundaries_sf)
  # Calculate statistics for each field
  stats_list <- list()
  for (field_idx in seq_len(nrow(field_boundaries_sf))) {
    # BUG FIX: the previous code indexed all_pixels[field_idx, -1] as if the
    # extraction returned one row per field; that actually read the
    # field_idx-th PIXEL row, not this field's pixels. Filter on the ID
    # column to collect every pixel belonging to this field.
    pixels <- as.numeric(all_pixels[all_pixels$ID == field_idx, 2])
    pixels <- pixels[!is.na(pixels)]
    # Only calculate stats if we have pixels
    if (length(pixels) > 0) {
      mean_val <- mean(pixels, na.rm = TRUE)
      # Only calculate CV if mean > 0 (avoid division by zero)
      if (mean_val > 0) {
        cv_val <- sd(pixels, na.rm = TRUE) / mean_val
      } else {
        cv_val <- NA
      }
      p10_val <- quantile(pixels, probs = CI_PERCENTILE_LOW, na.rm = TRUE)[[1]]
      p90_val <- quantile(pixels, probs = CI_PERCENTILE_HIGH, na.rm = TRUE)[[1]]
      stats_list[[field_idx]] <- data.frame(
        field_idx = field_idx,
        mean_ci = mean_val,
        cv = cv_val,
        p10 = p10_val,
        p90 = p90_val,
        pixel_count = length(pixels),
        stringsAsFactors = FALSE
      )
    } else {
      # No pixels for this field (doesn't intersect tile)
      stats_list[[field_idx]] <- data.frame(
        field_idx = field_idx,
        mean_ci = NA_real_,
        cv = NA_real_,
        p10 = NA_real_,
        p90 = NA_real_,
        pixel_count = 0,
        stringsAsFactors = FALSE
      )
    }
  }
  return(dplyr::bind_rows(stats_list))
}
# ============================================================================
@ -923,7 +1169,7 @@ main <- function() {
message("")
# Load configuration and utilities
source(here("r_app", "crop_messaging_utils.R"))
# source(here("r_app", "crop_messaging_utils.R"))
tryCatch({
source(here("r_app", "parameters_project.R"))
@ -950,10 +1196,29 @@ main <- function() {
message(paste("Week:", current_week, "/ Year:", year))
message("Building tile grid from available weekly tiles...")
tile_grid <- build_tile_grid(weekly_tile_max, current_week, year)
message(paste(" Found", nrow(tile_grid), "tiles"))
# Find tile files - approach from Script 20
message("Finding tile files...")
tile_pattern <- sprintf("week_%02d_%d_([0-9]{2})\\.tif", current_week, year)
# Detect grid size subdirectory
detected_grid_size <- NA
if (dir.exists(weekly_tile_max)) {
subfolders <- list.dirs(weekly_tile_max, full.names = FALSE, recursive = FALSE)
grid_patterns <- grep("^\\d+x\\d+$", subfolders, value = TRUE)
if (length(grid_patterns) > 0) {
detected_grid_size <- grid_patterns[1]
mosaic_dir <- file.path(weekly_tile_max, detected_grid_size)
message(paste(" Using grid-size subdirectory:", detected_grid_size))
}
}
tile_files <- list.files(mosaic_dir, pattern = tile_pattern, full.names = TRUE)
if (length(tile_files) == 0) {
stop(paste("No tile files found for week", current_week, year, "in", mosaic_dir))
}
message(paste(" Found", length(tile_files), "tiles"))
# Load field boundaries
tryCatch({
boundaries_result <- load_field_boundaries(data_dir)
@ -979,38 +1244,217 @@ main <- function() {
}
historical_data <- load_historical_field_data(project_dir, current_week, reports_dir, num_weeks = num_weeks_to_load)
planting_dates <- extract_planting_dates(harvesting_data)
planting_dates <- extract_planting_dates(harvesting_data, field_boundaries_sf)
message("Setting up parallel processing...")
current_plan <- class(future::plan())[1]
if (current_plan == "sequential") {
num_workers <- parallel::detectCores() - 1
message(paste(" Using", num_workers, "workers"))
future::plan(future::multisession, workers = num_workers)
# Validate planting_dates
if (is.null(planting_dates) || nrow(planting_dates) == 0) {
message("WARNING: No planting dates available. Using NA for all fields.")
planting_dates <- data.frame(
field_id = field_boundaries_sf$field,
date = rep(as.Date(NA), nrow(field_boundaries_sf)),
stringsAsFactors = FALSE
)
}
message("Analyzing", nrow(field_boundaries_sf), "fields in parallel...")
# SCRIPT 20 APPROACH: Loop through tiles, extract all fields from each tile
message("\nProcessing tiles and extracting field statistics...")
all_tile_results <- list()
field_analysis_list <- furrr::future_map(
seq_len(nrow(field_boundaries_sf)),
~ analyze_single_field(
field_idx = .,
field_boundaries_sf = field_boundaries_sf,
tile_grid = tile_grid,
week_num = current_week,
year = year,
mosaic_dir = weekly_tile_max,
historical_data = historical_data,
planting_dates = planting_dates,
report_date = end_date,
harvest_imminence_data = NULL,
harvesting_data = harvesting_data
),
.progress = TRUE,
.options = furrr::furrr_options(seed = TRUE)
)
for (i in seq_along(tile_files)) {
tile_file <- tile_files[i]
message(paste(" Processing tile", i, "of", length(tile_files), ":", basename(tile_file)))
tryCatch({
# Load current tile and previous week tile
current_rast <- terra::rast(tile_file)
# DEBUG: Check tile structure on first tile
if (i == 1) {
message(paste(" [DEBUG] Tile CRS:", terra::crs(current_rast)))
message(paste(" [DEBUG] Tile extent:", paste(terra::ext(current_rast))))
message(paste(" [DEBUG] Field boundaries CRS:", sf::st_crs(field_boundaries_sf)))
field_bbox <- sf::st_bbox(field_boundaries_sf)
message(paste(" [DEBUG] Field bbox:", paste(round(field_bbox, 2))))
message(paste(" [DEBUG] Band names:", paste(names(current_rast), collapse=", ")))
}
# Extract CI band by name
ci_band <- current_rast[["CI"]]
# Check if CI band exists - use proper logical checks
if (is.null(ci_band) || !inherits(ci_band, "SpatRaster")) {
message(paste(" ERROR: CI band not found. Available bands:", paste(names(current_rast), collapse=", ")))
next
}
# Check if CI band has any valid data
if (tryCatch(all(is.na(values(ci_band))), error = function(e) TRUE)) {
message(paste(" ERROR: CI band has no valid data"))
next
}
# Load previous week tile if available
previous_tile_file <- sub(sprintf("week_%02d", current_week),
sprintf("week_%02d", previous_week),
tile_file)
previous_ci <- NULL
if (file.exists(previous_tile_file)) {
previous_rast <- terra::rast(previous_tile_file)
previous_ci <- previous_rast[["CI"]]
}
# OPTION 1 + 2: Extract all CI statistics from one pixel extraction (single call)
current_stats <- extract_field_statistics_from_ci(ci_band, field_boundaries_sf)
# DEBUG: Check extraction result on first tile
if (i == 1) {
num_with_data <- sum(!is.na(current_stats$mean_ci))
message(paste(" [DEBUG] Extracted", nrow(current_stats), "fields, ", num_with_data, "with non-NA data"))
if (num_with_data > 0) {
message(paste(" [DEBUG] Sample mean CIs:", paste(head(current_stats$mean_ci[!is.na(current_stats$mean_ci)], 3), collapse=", ")))
}
}
# Extract previous week CI statistics if available
previous_stats <- NULL
if (!is.null(previous_ci)) {
previous_stats <- extract_field_statistics_from_ci(previous_ci, field_boundaries_sf)
}
# Process each field that was extracted
field_results_this_tile <- list()
fields_added <- 0
for (field_idx in seq_len(nrow(field_boundaries_sf))) {
tryCatch({
field_id <- field_boundaries_sf$field[field_idx]
field_sf <- field_boundaries_sf[field_idx, ]
# Get statistics from helper function results
# current_stats should have same number of rows as field_boundaries_sf
if (field_idx > nrow(current_stats)) {
message(paste(" [ERROR] field_idx", field_idx, "> nrow(current_stats)", nrow(current_stats)))
next
}
mean_ci_current <- current_stats$mean_ci[field_idx]
pixel_count <- current_stats$pixel_count[field_idx]
# SKIP fields with no data in this tile (they don't intersect this tile)
if (is.na(pixel_count) || pixel_count == 0) {
next
}
ci_cv_current <- current_stats$cv[field_idx]
ci_percentile_low <- current_stats$p10[field_idx]
ci_percentile_high <- current_stats$p90[field_idx]
# If field doesn't intersect this tile, mean_ci_current will be NA
if (is.na(mean_ci_current)) {
next # Skip this field - doesn't intersect this tile
}
field_area_ha <- as.numeric(sf::st_area(field_sf)) / 10000
field_area_acres <- field_area_ha / 0.404686
# Extract previous week CI if available
mean_ci_previous <- NA
ci_change <- NA
if (!is.null(previous_stats)) {
mean_ci_previous <- previous_stats$mean_ci[field_idx]
if (!is.na(mean_ci_previous)) {
ci_change <- mean_ci_current - mean_ci_previous
}
}
# Reconstruct pixel values for status trigger (we need the actual pixel array)
# Use the percentiles and mean to create a synthetic distribution for status_trigger
# For now, use mean CI repeated by pixel count for testing
# TODO: Consider extracting pixels directly if needed for more complex triggers
pixel_count <- current_stats$pixel_count[field_idx]
ci_vals_current <- if (pixel_count > 0) {
rep(mean_ci_current, pixel_count) # Simplified: use mean value repeated
} else {
numeric(0)
}
# Calculate age
age_weeks <- if (!is.null(planting_dates) && nrow(planting_dates) > 0 && field_idx <= nrow(planting_dates)) {
planting_date <- planting_dates$date[field_idx]
if (!is.na(planting_date)) {
as.numeric(difftime(end_date, planting_date, units = "weeks"))
} else {
0
}
} else {
0
}
# Get phase and status
phase <- get_phase_by_age(age_weeks)
status_trigger <- get_status_trigger(ci_vals_current, ci_change, age_weeks)
# Cloud coverage categorization based on CI value
# No data = No image available
# CI 0.01 to 95 = Partial coverage
# CI >= 95 = Clear view
if (is.na(mean_ci_current) || mean_ci_current == 0) {
cloud_category <- "No image available"
# Set all CI metrics to NA since no valid data
ci_change <- NA
ci_cv_current <- NA
ci_percentile_low <- NA
ci_percentile_high <- NA
} else if (mean_ci_current >= 95) {
cloud_category <- "Clear view"
} else {
cloud_category <- "Partial coverage"
}
# Build result row
result_row <- data.frame(
Field_id = field_id,
Acreage = field_area_acres,
Mean_CI = mean_ci_current,
Mean_CI_prev = mean_ci_previous,
CI_change = ci_change,
CI_CV = ci_cv_current,
CI_percentile_low = ci_percentile_low,
CI_percentile_high = ci_percentile_high,
Age_weeks = age_weeks,
Phase = phase,
Status_trigger = status_trigger,
Cloud_category = cloud_category,
stringsAsFactors = FALSE
)
field_results_this_tile[[as.character(field_id)]] <- result_row
fields_added <- fields_added + 1
}, error = function(e) {
# Show error for debugging
message(paste(" [FIELD ERROR] Field", field_idx, ":", e$message))
})
}
if (length(field_results_this_tile) > 0) {
all_tile_results[[basename(tile_file)]] <- dplyr::bind_rows(field_results_this_tile)
message(paste(" Extracted", length(field_results_this_tile), "fields from tile (processed", fields_added, "fields total)"))
} else {
message(paste(" WARNING: No fields extracted from this tile (processed", fields_added, "fields, all either NA or errored)"))
}
}, error = function(e) {
message(paste(" Error processing tile", basename(tile_file), ":", e$message))
})
}
field_analysis_df <- dplyr::bind_rows(field_analysis_list)
# Combine all tile results, keeping unique fields (may appear in multiple tiles)
if (length(all_tile_results) == 0) {
stop("No fields extracted from any tiles!")
}
field_analysis_df <- dplyr::bind_rows(all_tile_results) %>%
distinct(Field_id, .keep_all = TRUE)
if (nrow(field_analysis_df) == 0) {
stop("No fields analyzed successfully!")
@@ -1038,17 +1482,90 @@ main <- function() {
cat("\n--- Summary Statistics ---\n")
print(summary_statistics_df)
# ========== FARM-LEVEL KPI CALCULATION ==========
# ========== FARM-LEVEL KPI AGGREGATION ==========
# NOTE(review): the calculate_and_export_farm_kpis() call below and the
# inline aggregation that follows BOTH assign farm_kpi_results; the inline
# block overwrites the helper's result. This reads like old and new diff
# sides pasted together — confirm which path is intended in the full file.
# Aggregate the per-field analysis into farm-level summary statistics
farm_kpi_results <- calculate_and_export_farm_kpis(
report_date = end_date,
project_dir = project_dir,
field_boundaries_sf = field_boundaries_sf,
harvesting_data = harvesting_data,
cumulative_CI_vals_dir = cumulative_CI_vals_dir,
weekly_CI_mosaic = weekly_CI_mosaic,
reports_dir = reports_dir
)
cat("\n=== CALCULATING FARM-LEVEL KPI SUMMARY ===\n")
# Filter to only fields that have actual data (non-NA CI and valid acreage)
field_data <- field_analysis_df %>%
filter(!is.na(Mean_CI) & !is.na(Acreage)) %>%
filter(Acreage > 0)
if (nrow(field_data) > 0) {
# NOTE(review): duplicated guard — the identical condition is tested twice
# in a row, with a matching pair of else branches at the bottom. Behaviour
# is unchanged but one level is redundant; likely a diff/merge artifact.
if (nrow(field_data) > 0) {
# Create summary statistics
farm_summary <- list()
# 1. PHASE DISTRIBUTION
# Field count and total acreage per growth phase.
phase_dist <- field_data %>%
group_by(Phase) %>%
summarise(
num_fields = n(),
acreage = sum(Acreage, na.rm = TRUE),
.groups = 'drop'
) %>%
rename(Category = Phase)
farm_summary$phase_distribution <- phase_dist
# 2. STATUS TRIGGER DISTRIBUTION
# Same shape as phase_dist, grouped by the per-field status trigger.
status_dist <- field_data %>%
group_by(Status_trigger) %>%
summarise(
num_fields = n(),
acreage = sum(Acreage, na.rm = TRUE),
.groups = 'drop'
) %>%
rename(Category = Status_trigger)
farm_summary$status_distribution <- status_dist
# 3. CLOUD COVERAGE DISTRIBUTION
# Same shape, grouped by the cloud category assigned per field above.
cloud_dist <- field_data %>%
group_by(Cloud_category) %>%
summarise(
num_fields = n(),
acreage = sum(Acreage, na.rm = TRUE),
.groups = 'drop'
) %>%
rename(Category = Cloud_category)
farm_summary$cloud_distribution <- cloud_dist
# 4. OVERALL STATISTICS
# current_week, year and end_date come from the enclosing main() scope
# (not visible in this excerpt).
farm_summary$overall_stats <- data.frame(
total_fields = nrow(field_data),
total_acreage = sum(field_data$Acreage, na.rm = TRUE),
mean_ci = round(mean(field_data$Mean_CI, na.rm = TRUE), 2),
median_ci = round(median(field_data$Mean_CI, na.rm = TRUE), 2),
mean_cv = round(mean(field_data$CI_CV, na.rm = TRUE), 4),
week = current_week,
year = year,
date = as.character(end_date)
)
# Print summaries
cat("\n--- PHASE DISTRIBUTION ---\n")
print(phase_dist)
cat("\n--- STATUS TRIGGER DISTRIBUTION ---\n")
print(status_dist)
cat("\n--- CLOUD COVERAGE DISTRIBUTION ---\n")
print(cloud_dist)
cat("\n--- OVERALL FARM STATISTICS ---\n")
print(farm_summary$overall_stats)
farm_kpi_results <- farm_summary
} else {
# Inner else of the duplicated guard (see NOTE above).
farm_kpi_results <- NULL
}
} else {
# No field passed the data-validity filter: skip farm-level KPIs.
farm_kpi_results <- NULL
}
# ========== FINAL SUMMARY ==========
@@ -1063,7 +1580,7 @@ main <- function() {
# farm_kpi_results is NULL when no valid field data survived the filter.
if (!is.null(farm_kpi_results)) {
cat("\nFarm-level KPIs: CALCULATED\n")
} else {
# NOTE(review): two consecutive SKIPPED messages — old and new diff sides
# both present; in the merged file only the second should remain.
cat("\nFarm-level KPIs: SKIPPED (kpi_utils.R not available)\n")
cat("\nFarm-level KPIs: SKIPPED (no valid tile data extracted)\n")
}
cat("\n✓ Consolidated KPI calculation complete!\n")

View file

@@ -50,7 +50,7 @@ detect_mosaic_mode <- function(merged_final_tif_dir, daily_tiles_split_dir = NUL
}
# PRIORITY 2: File-based detection (fallback if metadata not found)
# Check if merged_final_tif/ contains tile-named files
# Check if merged_final_tif/ contains tile-named files OR grid-size subdirectories
# NOTE(review): the two comment lines above are old/new diff sides of the
# same comment; keep only the second in the merged file.
if (!dir.exists(merged_final_tif_dir)) {
# NOTE(review): the body of this return(list(...)) is truncated by a diff
# hunk marker in this excerpt; restore the no-directory result fields from
# the original detect_mosaic_mode() when merging.
return(list(
@@ -61,6 +61,30 @@ detect_mosaic_mode <- function(merged_final_tif_dir, daily_tiles_split_dir = NUL
))
}
# First check if there are grid-size subdirectories (5x5, 10x10, etc.)
# This indicates the tiles are organized: merged_final_tif/{grid_size}/{DATE}/{DATE}_XX.tif
grid_subfolders <- list.dirs(merged_final_tif_dir, full.names = FALSE, recursive = FALSE)
grid_patterns <- grep("^\\d+x\\d+$", grid_subfolders, value = TRUE)
if (length(grid_patterns) > 0) {
# Found grid-size subdirectories - tiles exist!
# If several grid sizes are present, the first match wins.
grid_size <- grid_patterns[1]
grid_dir <- file.path(merged_final_tif_dir, grid_size)
# List ALL tile files once so total_files reports the true count, then
# keep at most 3 as a sample. head() avoids the NA padding that
# x[1:3] produces when fewer than 3 (or zero) .tif files are present,
# which previously yielded detected_tiles = c(NA, NA, NA) and a
# total_files of 3 regardless of the actual number of tiles.
all_tiles <- list.files(grid_dir, pattern = "\\.tif$", recursive = TRUE)
sample_tiles <- head(all_tiles, 3)
return(list(
has_tiles = TRUE,
detected_tiles = sample_tiles,
total_files = length(all_tiles),
source = "grid_subdirectory_detection",
grid_size = grid_size,
grid_path = grid_dir
))
}
# Fall back to checking for tile-named files directly in merged_final_tif
# List all .tif files in merged_final_tif
# (fragment ends here; the tile-name pattern matching that consumes
# tif_files follows in the full file, outside this excerpt)
tif_files <- list.files(merged_final_tif_dir, pattern = "\\.tif$", full.names = FALSE)