Enhance harvest detection logic and testing framework
- Updated `detect_mosaic_mode` function to check for grid-size subdirectories in addition to tile-named files. - Added comprehensive tests for DOY reset logic in `test_doy_logic.py`. - Implemented feature extraction tests in `test_feature_extraction.py`. - Created tests for growing window method in `test_growing_window_only.py`. - Developed a complete model inference test in `test_model_inference.py`. - Added a debug script for testing two-step refinement logic in `test_script22_debug.py`.
This commit is contained in:
parent
d6448b8703
commit
fabbf3214d
|
|
@ -26,7 +26,9 @@ This is your GROUND TRUTH - compare all future predictions against this baseline
|
|||
|
||||
Usage:
|
||||
python 01_harvest_baseline_prediction.py [project_name]
|
||||
conda activate pytorch_gpu; cd "C:\Users\timon\Resilience BV\4020 SCane ESA DEMO - Documenten\General\4020 SCDEMO Team\4020 TechnicalData\WP3\smartcane_v2\smartcane\python_app"; python 22_harvest_baseline_prediction.py angata
|
||||
conda activate pytorch_gpu
|
||||
cd python_app
|
||||
python 22_harvest_baseline_prediction.py angata
|
||||
|
||||
Examples:
|
||||
python 01_harvest_baseline_prediction.py angata
|
||||
|
|
@ -108,7 +110,7 @@ def main():
|
|||
|
||||
# [3/4] Run model predictions with two-step detection
|
||||
print("\n[3/4] Running two-step harvest detection...")
|
||||
print(" (Using threshold=0.3, consecutive_days=2 - tuned for Model 307 output)")
|
||||
print(" (Using threshold=0.3, consecutive_days=2 - tuned baseline with DOY reset)")
|
||||
refined_results = run_two_step_refinement(ci_data, model, config, scalers, device=device,
|
||||
phase1_threshold=0.3, phase1_consecutive=2)
|
||||
|
||||
|
|
|
|||
|
|
@ -184,18 +184,47 @@ def compute_ci_features(ci_series: pd.Series, doy_series: pd.Series = None) -> p
|
|||
return features.fillna(0)
|
||||
|
||||
|
||||
def extract_features(data_df: pd.DataFrame, feature_names: List[str], ci_column: str = 'FitData') -> np.ndarray:
|
||||
"""Extract and return specified features as numpy array."""
|
||||
def extract_features(data_df: pd.DataFrame, feature_names: List[str], ci_column: str = 'FitData',
|
||||
season_anchor_day: int = None, lookback_start: int = 0) -> np.ndarray:
|
||||
"""
|
||||
Extract and return specified features as numpy array.
|
||||
|
||||
Args:
|
||||
data_df: DataFrame with Date and CI data (may be a window after a harvest)
|
||||
feature_names: List of feature names to extract
|
||||
ci_column: Name of CI column
|
||||
season_anchor_day: Day in FULL sequence where this season started (for DOY reset)
|
||||
DOY will be recalculated as: 1, 2, 3, ... from this point
|
||||
lookback_start: Starting index in original full data (for season reset calculation)
|
||||
|
||||
Returns:
|
||||
NumPy array of shape (len(data_df), len(feature_names))
|
||||
"""
|
||||
# Compute all CI features
|
||||
ci_series = data_df[ci_column].astype(float)
|
||||
doy_series = pd.Series(range(len(data_df)), index=data_df.index) % 365 if 'DOY_normalized' in feature_names else None
|
||||
|
||||
# Compute DOY (age/days since season start) - NOT day-of-year!
|
||||
# DOY is a continuous counter: 1, 2, 3, ..., 475 (doesn't cycle at 365)
|
||||
# It only resets to 1 after a harvest is detected (new season)
|
||||
doy_series = None
|
||||
if 'DOY_normalized' in feature_names:
|
||||
if season_anchor_day is not None and lookback_start >= season_anchor_day:
|
||||
# Season was reset after harvest. Recalculate DOY as simple counter from 1
|
||||
# This is a window starting at or after harvest, so DOY should be: 1, 2, 3, ...
|
||||
doy_series = pd.Series(np.arange(1, len(data_df) + 1), index=data_df.index)
|
||||
elif 'DOY' in data_df.columns:
|
||||
# Use DOY directly from CSV - already calculated as continuous age counter
|
||||
doy_series = pd.Series(data_df['DOY'].astype(float).values, index=data_df.index)
|
||||
else:
|
||||
# Fallback: create continuous age counter (1, 2, 3, ...)
|
||||
doy_series = pd.Series(np.arange(1, len(data_df) + 1), index=data_df.index)
|
||||
|
||||
all_features = compute_ci_features(ci_series, doy_series)
|
||||
|
||||
# Select requested features
|
||||
requested = [f for f in feature_names if f in all_features.columns]
|
||||
if not requested:
|
||||
raise ValueError(f"No valid features found. Requested: {feature_names}")
|
||||
raise ValueError(f"No valid features found. Requested: {feature_names}, Available: {all_features.columns.tolist()}")
|
||||
|
||||
return all_features[requested].values
|
||||
|
||||
|
|
@ -274,42 +303,63 @@ def load_harvest_data(data_file: Path) -> pd.DataFrame:
|
|||
def run_phase1_growing_window(field_data, model, config, scalers, ci_column, device,
|
||||
threshold=0.45, consecutive_days=2):
|
||||
"""
|
||||
Phase 1: Growing window detection with threshold crossing.
|
||||
Expand window day-by-day, check last timestep's detected_prob.
|
||||
When N consecutive days have prob > threshold, harvest detected.
|
||||
Phase 1: Growing window detection with DOY season reset.
|
||||
|
||||
For each detected harvest, reset DOY counter for the next season.
|
||||
This allows the model to detect multiple consecutive harvests in multi-year data.
|
||||
|
||||
Algorithm:
|
||||
1. Start with season_anchor_day = 0 (DOY 1 at day 0)
|
||||
2. Expand window: [0:1], [0:2], [0:3], ... until threshold crossed
|
||||
3. When harvest detected: record date, set new season_anchor = day after harvest
|
||||
4. Continue from next season start
|
||||
|
||||
Args:
|
||||
threshold (float): Probability threshold (default 0.45, tuned for Model 307)
|
||||
consecutive_days (int): Required consecutive days above threshold (default 2, reduced from 3 for robustness)
|
||||
threshold (float): Probability threshold (default 0.45)
|
||||
consecutive_days (int): Required consecutive days above threshold (default 2)
|
||||
|
||||
Returns list of (harvest_date, harvest_idx) tuples.
|
||||
"""
|
||||
harvest_dates = []
|
||||
season_anchor_day = 0 # DOY 1 starts at day 0
|
||||
current_pos = 0
|
||||
|
||||
while current_pos < len(field_data):
|
||||
consecutive_above_threshold = 0
|
||||
min_window_size = 120 # Need at least 120 days (~4 months) for patterns to establish
|
||||
|
||||
for window_end in range(current_pos + 1, len(field_data) + 1):
|
||||
window_data = field_data.iloc[current_pos:window_end].copy().reset_index(drop=True)
|
||||
|
||||
# Skip if window is too small (model needs long sequences for pattern learning)
|
||||
if len(window_data) < min_window_size:
|
||||
continue
|
||||
|
||||
try:
|
||||
features = extract_features(window_data, config['features'], ci_column=ci_column)
|
||||
# CRITICAL: Pass season_anchor_day so DOY resets after harvest
|
||||
features = extract_features(
|
||||
window_data,
|
||||
config['features'],
|
||||
ci_column=ci_column,
|
||||
season_anchor_day=season_anchor_day,
|
||||
lookback_start=current_pos
|
||||
)
|
||||
|
||||
# Apply scalers
|
||||
features_scaled = features.copy().astype(float)
|
||||
for fi, scaler in enumerate(scalers):
|
||||
try:
|
||||
features[:, fi] = scaler.transform(features[:, fi].reshape(-1, 1)).flatten()
|
||||
features_scaled[:, fi] = scaler.transform(features[:, fi].reshape(-1, 1)).flatten()
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
# Run model
|
||||
# Run model on expanding window
|
||||
with torch.no_grad():
|
||||
x_tensor = torch.tensor(features, dtype=torch.float32).unsqueeze(0).to(device)
|
||||
x_tensor = torch.tensor(features_scaled, dtype=torch.float32).unsqueeze(0).to(device)
|
||||
imminent_probs, detected_probs = model(x_tensor)
|
||||
|
||||
# Check LAST timestep
|
||||
last_prob = detected_probs[-1]
|
||||
# Check LAST timestep only
|
||||
last_prob = detected_probs[0, -1].item()
|
||||
|
||||
if last_prob > threshold:
|
||||
consecutive_above_threshold += 1
|
||||
|
|
@ -318,16 +368,21 @@ def run_phase1_growing_window(field_data, model, config, scalers, ci_column, dev
|
|||
|
||||
# Harvest detected: N consecutive days above threshold
|
||||
if consecutive_above_threshold >= consecutive_days:
|
||||
harvest_date = field_data.iloc[current_pos + window_end - consecutive_days]['Date']
|
||||
harvest_dates.append((harvest_date, current_pos + window_end - consecutive_days))
|
||||
harvest_idx = current_pos + window_end - consecutive_days
|
||||
harvest_date = field_data.iloc[harvest_idx]['Date']
|
||||
harvest_dates.append((harvest_date, harvest_idx))
|
||||
|
||||
# Reset to next day after harvest
|
||||
current_pos = current_pos + window_end - consecutive_days + 1
|
||||
# CRITICAL: Reset season anchor for next season
|
||||
# DOY 1 starts at day after harvest
|
||||
season_anchor_day = harvest_idx + 1
|
||||
current_pos = harvest_idx + 1
|
||||
break
|
||||
|
||||
except Exception:
|
||||
except Exception as e:
|
||||
# Skip window on error
|
||||
continue
|
||||
else:
|
||||
# No more harvests found
|
||||
break
|
||||
|
||||
return harvest_dates
|
||||
|
|
@ -413,7 +468,9 @@ def run_two_step_refinement(df: pd.DataFrame, model, config, scalers, device=Non
|
|||
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
||||
|
||||
results = []
|
||||
ci_column = config['data']['ci_column']
|
||||
# CI column is 'FitData' (interpolated CI) - NOT 'value' (raw with NAs)
|
||||
# 'FitData' is already gap-filled by R stage 03, ready for ML
|
||||
ci_column = 'FitData'
|
||||
|
||||
# Group by field and count total fields for progress
|
||||
field_groups = list(df.groupby('field'))
|
||||
|
|
|
|||
|
|
@ -0,0 +1,65 @@
|
|||
"""
|
||||
Test DOY reset logic for harvest detection.
|
||||
Verify that DOY resets to 1, 2, 3, ... after harvest is detected.
|
||||
"""
|
||||
|
||||
import sys
|
||||
from pathlib import Path
|
||||
sys.path.insert(0, str(Path.cwd()))
|
||||
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
from harvest_date_pred_utils import extract_features, load_model_and_config
|
||||
import torch
|
||||
|
||||
# Load sample data
|
||||
ci_data = pd.read_csv('../laravel_app/storage/app/angata/Data/extracted_ci/ci_data_for_python/ci_data_for_python.csv')
|
||||
ci_data['Date'] = pd.to_datetime(ci_data['Date'])
|
||||
|
||||
# Get field 779 data
|
||||
field_779 = ci_data[ci_data['field'] == '779'].reset_index(drop=True)
|
||||
print(f"Field 779: {len(field_779)} days of data")
|
||||
print(f"Date range: {field_779['Date'].min().date()} to {field_779['Date'].max().date()}\n")
|
||||
|
||||
# Load model config
|
||||
model, config, scalers = load_model_and_config(Path.cwd())
|
||||
|
||||
# Test 1: First season (season_anchor_day = 0)
|
||||
print("=" * 80)
|
||||
print("TEST 1: First season (season_anchor_day=0, lookback_start=0)")
|
||||
print("=" * 80)
|
||||
window = field_779.iloc[0:20].copy().reset_index(drop=True)
|
||||
features = extract_features(window, config['features'], ci_column='FitData',
|
||||
season_anchor_day=0, lookback_start=0)
|
||||
|
||||
# Extract DOY_normalized column (index 13 or find it)
|
||||
feature_names = config['features']
|
||||
doy_idx = feature_names.index('DOY_normalized') if 'DOY_normalized' in feature_names else -1
|
||||
if doy_idx >= 0:
|
||||
doy_values = (features[:, doy_idx] * 450).astype(int) # Denormalize
|
||||
print(f"Window size: {len(window)} days")
|
||||
print(f"DOY values: {doy_values[:10]}")
|
||||
print(f"Expected: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]")
|
||||
assert list(doy_values[:10]) == [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], "DOY not incrementing correctly!"
|
||||
print("✓ DOY incrementing correctly for first season\n")
|
||||
|
||||
# Test 2: After harvest detected at day 100, next season starts
|
||||
print("=" * 80)
|
||||
print("TEST 2: After harvest at day 100, new season starts (season_anchor_day=101, lookback_start=101)")
|
||||
print("=" * 80)
|
||||
harvest_day = 100
|
||||
window = field_779.iloc[harvest_day:harvest_day+20].copy().reset_index(drop=True)
|
||||
features = extract_features(window, config['features'], ci_column='FitData',
|
||||
season_anchor_day=harvest_day+1, lookback_start=harvest_day+1)
|
||||
|
||||
if doy_idx >= 0:
|
||||
doy_values = (features[:, doy_idx] * 450).astype(int) # Denormalize
|
||||
print(f"Window size: {len(window)} days (starting at day {harvest_day})")
|
||||
print(f"DOY values: {doy_values[:10]}")
|
||||
print(f"Expected: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10] (fresh season)")
|
||||
assert list(doy_values[:10]) == [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], "DOY not reset after harvest!"
|
||||
print("✓ DOY reset correctly for new season\n")
|
||||
|
||||
print("=" * 80)
|
||||
print("ALL TESTS PASSED! DOY logic is correct.")
|
||||
print("=" * 80)
|
||||
|
|
@ -0,0 +1,73 @@
|
|||
#!/usr/bin/env python3
|
||||
"""
|
||||
Quick test: Verify feature extraction works
|
||||
"""
|
||||
|
||||
import sys
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
from pathlib import Path
|
||||
|
||||
sys.path.insert(0, str(Path(__file__).parent))
|
||||
|
||||
from harvest_date_pred_utils import extract_features, load_model_and_config
|
||||
|
||||
project_name = "angata"
|
||||
base_storage = Path("../laravel_app/storage/app") / project_name / "Data"
|
||||
CI_DATA_FILE = base_storage / "extracted_ci" / "ci_data_for_python" / "ci_data_for_python.csv"
|
||||
|
||||
print("="*80)
|
||||
print("DEBUG: Feature Extraction Test")
|
||||
print("="*80)
|
||||
|
||||
# Load model config
|
||||
print("\n[1] Loading model config...")
|
||||
model, config, scalers = load_model_and_config(Path("."))
|
||||
print(f" Config features: {config['features']}")
|
||||
print(f" Number of features: {len(config['features'])}")
|
||||
|
||||
# Load CI data
|
||||
print("\n[2] Loading CI data...")
|
||||
ci_data = pd.read_csv(CI_DATA_FILE, dtype={'field': str})
|
||||
ci_data['Date'] = pd.to_datetime(ci_data['Date'])
|
||||
print(f" Columns: {ci_data.columns.tolist()}")
|
||||
print(f" Total rows: {len(ci_data)}")
|
||||
|
||||
# Test on a single field
|
||||
test_field = "1"
|
||||
field_data = ci_data[ci_data['field'] == test_field].sort_values('Date').reset_index(drop=True)
|
||||
print(f"\n[3] Testing on field {test_field}...")
|
||||
print(f" Data points: {len(field_data)}")
|
||||
print(f" Date range: {field_data['Date'].min().date()} to {field_data['Date'].max().date()}")
|
||||
print(f" Columns in field data: {field_data.columns.tolist()}")
|
||||
print(f" Sample values:")
|
||||
print(field_data[['Date', 'value']].head())
|
||||
|
||||
# Test feature extraction on first 50 days
|
||||
print(f"\n[4] Extracting features for first 50 days...")
|
||||
try:
|
||||
subset = field_data.iloc[:50].copy()
|
||||
features = extract_features(subset, config['features'], ci_column='value')
|
||||
print(f" ✓ Success!")
|
||||
print(f" Feature shape: {features.shape}")
|
||||
print(f" Expected shape: (50, {len(config['features'])})")
|
||||
print(f" Feature values sample (first 5 days):")
|
||||
for i in range(min(5, features.shape[0])):
|
||||
print(f" Day {i}: {features[i]}")
|
||||
except Exception as e:
|
||||
print(f" ✗ Error: {e}")
|
||||
import traceback
|
||||
traceback.print_exc()
|
||||
|
||||
print("\n[5] Testing on growing windows...")
|
||||
try:
|
||||
for window_size in [10, 20, 30, 50]:
|
||||
window_data = field_data.iloc[:window_size].copy()
|
||||
features = extract_features(window_data, config['features'], ci_column='value')
|
||||
print(f" Window size {window_size}: shape={features.shape}, min={features.min():.4f}, max={features.max():.4f}")
|
||||
except Exception as e:
|
||||
print(f" ✗ Error: {e}")
|
||||
import traceback
|
||||
traceback.print_exc()
|
||||
|
||||
print("\n✓ Feature extraction test complete")
|
||||
|
|
@ -0,0 +1,141 @@
|
|||
#!/usr/bin/env python3
|
||||
"""
|
||||
Test ONLY the growing window method (what production actually uses)
|
||||
Never run model on full sequence - only on expanding windows
|
||||
|
||||
This matches real deployment where data arrives daily
|
||||
"""
|
||||
|
||||
import sys
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
import torch
|
||||
from pathlib import Path
|
||||
|
||||
sys.path.insert(0, str(Path(__file__).parent.parent.parent))
|
||||
|
||||
from harvest_date_pred_utils import (
|
||||
load_model_and_config,
|
||||
extract_features,
|
||||
)
|
||||
|
||||
project_name = "angata"
|
||||
|
||||
# Find root by walking up until we find laravel_app
|
||||
script_dir = Path(__file__).parent
|
||||
root = script_dir
|
||||
while root != root.parent: # Stop at filesystem root
|
||||
if (root / "laravel_app").exists():
|
||||
break
|
||||
root = root.parent
|
||||
|
||||
base_storage = root / "laravel_app" / "storage" / "app" / project_name / "Data"
|
||||
CI_DATA_FILE = base_storage / "extracted_ci" / "ci_data_for_python" / "ci_data_for_python.csv"
|
||||
MODEL_DIR = root / "python_app"
|
||||
|
||||
print("="*80)
|
||||
print("GROWING WINDOW METHOD ONLY (Real Production Simulation)")
|
||||
print("="*80)
|
||||
|
||||
# Load model
|
||||
print("\n[1] Loading model...")
|
||||
model, config, scalers = load_model_and_config(MODEL_DIR)
|
||||
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
||||
|
||||
# Load CI data
|
||||
print("\n[2] Loading CI data...")
|
||||
ci_data = pd.read_csv(CI_DATA_FILE, dtype={'field': str})
|
||||
ci_data['Date'] = pd.to_datetime(ci_data['Date'])
|
||||
|
||||
# Test on field 779
|
||||
test_field = "779"
|
||||
field_data = ci_data[ci_data['field'] == test_field].sort_values('Date').reset_index(drop=True)
|
||||
|
||||
print(f"\n[3] Field {test_field}: {len(field_data)} data points")
|
||||
print(f" Date range: {field_data['Date'].min().date()} to {field_data['Date'].max().date()}")
|
||||
|
||||
# Simulate growing window (real production)
|
||||
print(f"\n[4] Simulating growing window (expanding daily)...")
|
||||
|
||||
harvest_dates = []
|
||||
current_pos = 0
|
||||
consecutive_above = 0
|
||||
threshold = 0.3
|
||||
consecutive_days = 2
|
||||
model_runs = 0
|
||||
|
||||
while current_pos < len(field_data):
|
||||
consecutive_above = 0
|
||||
found_harvest = False
|
||||
|
||||
for window_end in range(current_pos + 1, len(field_data) + 1):
|
||||
# Expand window: current_pos to window_end
|
||||
window_data = field_data.iloc[current_pos:window_end].copy().reset_index(drop=True)
|
||||
|
||||
try:
|
||||
# Extract features for THIS window only
|
||||
features = extract_features(window_data, config['features'], ci_column='value')
|
||||
|
||||
# Normalize
|
||||
features_scaled = features.copy().astype(float)
|
||||
for fi, scaler in enumerate(scalers):
|
||||
features_scaled[:, fi] = scaler.transform(features[:, fi].reshape(-1, 1)).flatten()
|
||||
|
||||
# Run model on expanding window
|
||||
with torch.no_grad():
|
||||
x_tensor = torch.tensor(features_scaled, dtype=torch.float32).unsqueeze(0).to(device)
|
||||
imminent_probs, detected_probs = model(x_tensor)
|
||||
|
||||
model_runs += 1
|
||||
|
||||
# Check LAST timestep only
|
||||
last_prob = detected_probs[0, -1].item()
|
||||
last_date = window_data.iloc[-1]['Date'].date()
|
||||
|
||||
# Print every 50th window to track progress
|
||||
if window_end % 50 == 0 or window_end < 10:
|
||||
print(f" Window [{current_pos:3d}:{window_end:3d}] ({last_date}): prob={last_prob:.4f}", end="")
|
||||
if last_prob > threshold:
|
||||
print(" ✓ ABOVE", end="")
|
||||
print()
|
||||
|
||||
# Check threshold
|
||||
if last_prob > threshold:
|
||||
consecutive_above += 1
|
||||
else:
|
||||
consecutive_above = 0
|
||||
|
||||
# Harvest detected
|
||||
if consecutive_above >= consecutive_days:
|
||||
harvest_idx = current_pos + window_end - consecutive_days
|
||||
harvest_date = field_data.iloc[harvest_idx]['Date']
|
||||
harvest_dates.append((harvest_date, harvest_idx, last_prob))
|
||||
print(f"\n ✓ HARVEST DETECTED at {harvest_date.date()} (index {harvest_idx})")
|
||||
print(f" {consecutive_days} consecutive days above {threshold}")
|
||||
|
||||
# Jump past this harvest
|
||||
current_pos = harvest_idx + 1
|
||||
found_harvest = True
|
||||
break
|
||||
|
||||
except Exception as e:
|
||||
print(f" ERROR at window [{current_pos}:{window_end}]: {e}")
|
||||
continue
|
||||
|
||||
if not found_harvest:
|
||||
break
|
||||
|
||||
print(f"\n[5] Results:")
|
||||
print(f" Total model runs: {model_runs}")
|
||||
print(f" Harvests found: {len(harvest_dates)}")
|
||||
|
||||
if harvest_dates:
|
||||
print(f"\n Harvest dates:")
|
||||
for date, idx, prob in harvest_dates:
|
||||
print(f" {date.date()}: index {idx}, last_prob={prob:.4f}")
|
||||
else:
|
||||
print(f"\n No harvests detected")
|
||||
|
||||
print(f"\n[6] Analysis:")
|
||||
print(f" Model runs per day: {model_runs / len(field_data):.2f}x")
|
||||
print(f" Expected: ~{len(field_data):.0f} runs (one per day)")
|
||||
|
|
@ -0,0 +1,123 @@
|
|||
#!/usr/bin/env python3
|
||||
"""
|
||||
Complete test: Feature extraction + Model inference + Phase 1 detection
|
||||
"""
|
||||
|
||||
import sys
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
import torch
|
||||
from pathlib import Path
|
||||
|
||||
sys.path.insert(0, str(Path(__file__).parent))
|
||||
|
||||
from harvest_date_pred_utils import (
|
||||
load_model_and_config,
|
||||
extract_features,
|
||||
run_phase1_growing_window
|
||||
)
|
||||
|
||||
project_name = "angata"
|
||||
base_storage = Path("../laravel_app/storage/app") / project_name / "Data"
|
||||
CI_DATA_FILE = base_storage / "extracted_ci" / "ci_data_for_python" / "ci_data_for_python.csv"
|
||||
|
||||
print("="*80)
|
||||
print("DEBUG: Model Inference + Phase 1 Detection")
|
||||
print("="*80)
|
||||
|
||||
# Load model
|
||||
print("\n[1] Loading model...")
|
||||
model, config, scalers = load_model_and_config(Path("."))
|
||||
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
||||
print(f" Device: {device}")
|
||||
print(f" Scalers type: {type(scalers)}")
|
||||
print(f" Number of scalers: {len(scalers) if isinstance(scalers, list) else 'N/A (dict/object)'}")
|
||||
|
||||
# Load CI data
|
||||
print("\n[2] Loading CI data...")
|
||||
ci_data = pd.read_csv(CI_DATA_FILE, dtype={'field': str})
|
||||
ci_data['Date'] = pd.to_datetime(ci_data['Date'])
|
||||
|
||||
# Test on a known field (field 1)
|
||||
test_field = "1"
|
||||
field_data = ci_data[ci_data['field'] == test_field].sort_values('Date').reset_index(drop=True)
|
||||
print(f"\n[3] Testing on field {test_field}...")
|
||||
print(f" Data points: {len(field_data)}")
|
||||
|
||||
# Test with first 100 days
|
||||
subset_100 = field_data.iloc[:100].copy().reset_index(drop=True)
|
||||
print(f"\n[4] Testing model inference on first 100 days...")
|
||||
|
||||
try:
|
||||
features = extract_features(subset_100, config['features'], ci_column='value')
|
||||
print(f" Features shape: {features.shape}")
|
||||
print(f" Features dtype: {features.dtype}")
|
||||
|
||||
# Apply scalers
|
||||
features_scaled = features.copy().astype(float)
|
||||
print(f" Applying {len(scalers)} scalers...")
|
||||
|
||||
for fi, scaler in enumerate(scalers):
|
||||
try:
|
||||
col_data = features[:, fi].reshape(-1, 1)
|
||||
scaled_col = scaler.transform(col_data)
|
||||
features_scaled[:, fi] = scaled_col.flatten()
|
||||
if fi < 3: # Show first 3 scalers
|
||||
print(f" Scaler {fi}: transformed {features[0, fi]:.4f} → {features_scaled[0, fi]:.4f}")
|
||||
except Exception as e:
|
||||
print(f" ERROR in scaler {fi}: {e}")
|
||||
raise
|
||||
|
||||
# Run model
|
||||
print(f"\n Running model inference...")
|
||||
x_tensor = torch.tensor(features_scaled, dtype=torch.float32).unsqueeze(0).to(device)
|
||||
print(f" Tensor shape: {x_tensor.shape}")
|
||||
|
||||
with torch.no_grad():
|
||||
imminent_probs, detected_probs = model(x_tensor)
|
||||
|
||||
print(f" Imminent probs shape: {imminent_probs.shape}")
|
||||
print(f" Detected probs shape: {detected_probs.shape}")
|
||||
print(f" Detected probs dtype: {detected_probs.dtype}")
|
||||
|
||||
# Analyze detected probs
|
||||
detected_np = detected_probs[0].cpu().numpy() # Get first (only) batch
|
||||
print(f"\n Detected head analysis:")
|
||||
print(f" Min: {detected_np.min():.4f}")
|
||||
print(f" Max: {detected_np.max():.4f}")
|
||||
print(f" Mean: {detected_np.mean():.4f}")
|
||||
print(f" Median: {np.median(detected_np):.4f}")
|
||||
print(f" > 0.1: {(detected_np > 0.1).sum()} days")
|
||||
print(f" > 0.3: {(detected_np > 0.3).sum()} days")
|
||||
print(f" > 0.5: {(detected_np > 0.5).sum()} days")
|
||||
|
||||
# Show top 5 peaks
|
||||
top_indices = np.argsort(detected_np)[-5:][::-1]
|
||||
print(f"\n Top 5 detected peaks:")
|
||||
for idx in top_indices:
|
||||
date = subset_100.iloc[idx]['Date'].date()
|
||||
prob = detected_np[idx]
|
||||
print(f" Day {idx} ({date}): {prob:.4f}")
|
||||
|
||||
except Exception as e:
|
||||
print(f" ERROR: {e}")
|
||||
import traceback
|
||||
traceback.print_exc()
|
||||
sys.exit(1)
|
||||
|
||||
# Test Phase 1 growing window
|
||||
print(f"\n[5] Testing Phase 1 growing window (threshold=0.3, consecutive=2)...")
|
||||
try:
|
||||
phase1_results = run_phase1_growing_window(
|
||||
subset_100, model, config, scalers, 'value', device,
|
||||
threshold=0.3, consecutive_days=2
|
||||
)
|
||||
print(f" ✓ Phase 1 found {len(phase1_results)} harvest(s):")
|
||||
for harvest_date, harvest_idx in phase1_results:
|
||||
print(f" {harvest_date.date()}: index {harvest_idx}")
|
||||
except Exception as e:
|
||||
print(f" ERROR: {e}")
|
||||
import traceback
|
||||
traceback.print_exc()
|
||||
|
||||
print("\n✓ Model inference test complete")
|
||||
|
|
@ -0,0 +1,178 @@
|
|||
#!/usr/bin/env python3
|
||||
"""
|
||||
Debug script: Test if script 22 logic is working
|
||||
Tests the two-step refinement on a single known field
|
||||
"""
|
||||
|
||||
import sys
|
||||
import time
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
import torch
|
||||
from pathlib import Path
|
||||
|
||||
sys.path.insert(0, str(Path(__file__).parent.parent.parent))
|
||||
|
||||
from harvest_date_pred_utils import (
|
||||
load_model_and_config,
|
||||
extract_features,
|
||||
run_phase1_growing_window,
|
||||
)
|
||||
|
||||
project_name = "angata"
|
||||
|
||||
# Find the workspace root by looking for laravel_app folder
|
||||
script_dir = Path(__file__).parent
|
||||
root = script_dir
|
||||
while root != root.parent:
|
||||
if (root / "laravel_app").exists():
|
||||
break
|
||||
root = root.parent
|
||||
|
||||
base_storage = root / "laravel_app" / "storage" / "app" / project_name / "Data"
|
||||
CI_DATA_FILE = base_storage / "extracted_ci" / "ci_data_for_python" / "ci_data_for_python.csv"
|
||||
MODEL_DIR = root / "python_app"
|
||||
|
||||
print("="*80)
|
||||
print("DEBUG: Script 22 Two-Step Refinement Logic")
|
||||
print("="*80)
|
||||
|
||||
# Load model
|
||||
print("\n[1] Loading model...")
|
||||
model, config, scalers = load_model_and_config(MODEL_DIR)
|
||||
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
||||
print(f" Device: {device}")
|
||||
print(f" Model features: {config['features']}")
|
||||
|
||||
# Load CI data
|
||||
print("\n[2] Loading CI data...")
|
||||
ci_data = pd.read_csv(CI_DATA_FILE, dtype={'field': str})
|
||||
ci_data['Date'] = pd.to_datetime(ci_data['Date'])
|
||||
print(f" Total rows: {len(ci_data)}")
|
||||
print(f" Fields: {ci_data['field'].nunique()}")
|
||||
print(f" Date range: {ci_data['Date'].min().date()} to {ci_data['Date'].max().date()}")
|
||||
|
||||
# Test on a known field (field 779 from our previous tests)
|
||||
test_field = "779"
|
||||
field_data = ci_data[ci_data['field'] == test_field].sort_values('Date').reset_index(drop=True)
|
||||
|
||||
print(f"\n[3] Testing on field {test_field}...")
|
||||
print(f" Data points: {len(field_data)}")
|
||||
print(f" Date range: {field_data['Date'].min().date()} to {field_data['Date'].max().date()}")
|
||||
|
||||
if len(field_data) == 0:
|
||||
print(f" ERROR: No data for field {test_field}")
|
||||
sys.exit(1)
|
||||
|
||||
# Extract features
|
||||
print(f"\n[4] Extracting features for field {test_field}...")
|
||||
try:
|
||||
features = extract_features(field_data.reset_index(drop=True), config['features'], ci_column='value')
|
||||
print(f" Features shape: {features.shape}")
|
||||
print(f" Features dtype: {features.dtype}")
|
||||
except Exception as e:
|
||||
print(f" ERROR: Could not extract features: {e}")
|
||||
sys.exit(1)
|
||||
|
||||
# Normalize and run model
|
||||
print(f"\n[5] Running Phase 1 GROWING WINDOW method (threshold=0.5, consecutive=3)...")
|
||||
print(f" This simulates real production: expanding windows, checking each day")
|
||||
print(f" Expected: ~477 model runs for 477 days (SLOW)")
|
||||
|
||||
import time
|
||||
start_time = time.time()
|
||||
|
||||
# Add instrumentation to see how many model runs are happening
|
||||
original_run = run_phase1_growing_window
|
||||
|
||||
def instrumented_run(*args, **kwargs):
|
||||
import sys
|
||||
from harvest_date_pred_utils import extract_features
|
||||
|
||||
field_data = args[0]
|
||||
model = args[1]
|
||||
config = args[2]
|
||||
scalers = args[3]
|
||||
ci_column = args[4]
|
||||
device = args[5]
|
||||
threshold = kwargs.get('threshold', 0.3)
|
||||
consecutive_days = kwargs.get('consecutive_days', 2)
|
||||
|
||||
harvest_dates = []
|
||||
current_pos = 0
|
||||
model_runs = 0
|
||||
|
||||
print(f" Starting growing window loop...")
|
||||
|
||||
while current_pos < len(field_data):
|
||||
consecutive_above_threshold = 0
|
||||
loop_start = current_pos
|
||||
|
||||
for window_end in range(current_pos + 1, len(field_data) + 1):
|
||||
window_data = field_data.iloc[current_pos:window_end].copy().reset_index(drop=True)
|
||||
|
||||
try:
|
||||
features = extract_features(window_data, config['features'], ci_column=ci_column)
|
||||
|
||||
features_scaled = features.copy().astype(float)
|
||||
for fi, scaler in enumerate(scalers):
|
||||
try:
|
||||
features_scaled[:, fi] = scaler.transform(features[:, fi].reshape(-1, 1)).flatten()
|
||||
except Exception as e:
|
||||
raise ValueError(f"Scaler {fi} failed: {e}")
|
||||
|
||||
import torch
|
||||
with torch.no_grad():
|
||||
x_tensor = torch.tensor(features_scaled, dtype=torch.float32).unsqueeze(0).to(device)
|
||||
imminent_probs, detected_probs = model(x_tensor)
|
||||
|
||||
model_runs += 1
|
||||
last_prob = detected_probs[0, -1].item()
|
||||
|
||||
if last_prob > threshold:
|
||||
consecutive_above_threshold += 1
|
||||
else:
|
||||
consecutive_above_threshold = 0
|
||||
|
||||
if consecutive_above_threshold >= consecutive_days:
|
||||
harvest_date = field_data.iloc[current_pos + window_end - consecutive_days]['Date']
|
||||
harvest_dates.append((harvest_date, current_pos + window_end - consecutive_days))
|
||||
current_pos = current_pos + window_end - consecutive_days + 1
|
||||
break
|
||||
|
||||
except Exception as e:
|
||||
pass
|
||||
else:
|
||||
break
|
||||
|
||||
print(f" Model runs performed: {model_runs}")
|
||||
return harvest_dates
|
||||
|
||||
phase1_results = instrumented_run(
|
||||
field_data.reset_index(drop=True),
|
||||
model, config, scalers, 'value', device,
|
||||
threshold=0.5,
|
||||
consecutive_days=3
|
||||
)
|
||||
|
||||
elapsed = time.time() - start_time
|
||||
print(f"\n Time elapsed: {elapsed:.2f}s")
|
||||
|
||||
if phase1_results:
|
||||
print(f" ✓ Phase 1 detected {len(phase1_results)} harvest(s):")
|
||||
|
||||
# Get probabilities for display by running model once on full field
|
||||
with torch.no_grad():
|
||||
X = features.reshape(1, -1, len(config['features']))
|
||||
X_normalized = np.zeros_like(X)
|
||||
for fi, scaler in enumerate(scalers):
|
||||
X_normalized[0, :, fi] = scaler.transform(X[0, :, fi].reshape(-1, 1)).flatten()
|
||||
X_tensor = torch.from_numpy(X_normalized).float().to(device)
|
||||
_, detected_probs = model(X_tensor)
|
||||
detected_np = detected_probs[0].cpu().numpy()
|
||||
|
||||
for harvest_date, harvest_idx in phase1_results:
|
||||
prob = detected_np[harvest_idx] if harvest_idx < len(detected_np) else 0.0
|
||||
print(f" {harvest_date.date()}: index {harvest_idx}, probability={prob:.4f}")
|
||||
else:
|
||||
print(f" ✗ Phase 1: No harvest detected")
|
||||
|
|
@ -14,8 +14,8 @@
|
|||
# - tile_size: Tile size in km (default: 5, only used if use_tiles=TRUE)
|
||||
#
|
||||
# Examples:
|
||||
# Rscript 04_mosaic_creation.R 2025-12-21 7 angata
|
||||
# Rscript 04_mosaic_creation.R 2025-12-21 7 angata week_51.tif TRUE 5 [tile-based]
|
||||
|
||||
# & "C:\Program Files\R\R-4.4.3\bin\x64\Rscript.exe" r_app/40_mosaic_creation.R 2026-01-12 7 angata
|
||||
#
|
||||
|
||||
# 1. Load required packages
|
||||
|
|
@ -142,15 +142,35 @@ main <- function() {
|
|||
if (use_tile_mosaic) {
|
||||
# TILE-BASED APPROACH: Create per-tile weekly MAX mosaics
|
||||
# This is used for projects like Angata with large ROIs requiring spatial partitioning
|
||||
# Input data comes from merged_final_tif/{grid_size}/{DATE}/{DATE}_XX.tif (5-band tiles from script 20)
|
||||
tryCatch({
|
||||
safe_log("Starting per-tile mosaic creation (tile-based approach)...")
|
||||
|
||||
# Set output directory for per-tile mosaics
|
||||
tile_output_base <- file.path(laravel_storage, "weekly_tile_max")
|
||||
# Detect grid size from merged_final_tif folder structure
|
||||
# Expected: merged_final_tif/5x5/ or merged_final_tif/10x10/ etc.
|
||||
merged_final_base <- file.path(laravel_storage, "merged_final_tif")
|
||||
grid_subfolders <- list.dirs(merged_final_base, full.names = FALSE, recursive = FALSE)
|
||||
# Look for grid size patterns like "5x5", "10x10", "20x20"
|
||||
grid_patterns <- grep("^\\d+x\\d+$", grid_subfolders, value = TRUE)
|
||||
|
||||
if (length(grid_patterns) == 0) {
|
||||
stop("No grid size subfolder found in merged_final_tif/ (expected: 5x5, 10x10, etc.)")
|
||||
}
|
||||
|
||||
grid_size <- grid_patterns[1] # Use first grid size found
|
||||
safe_log(paste("Detected grid size:", grid_size))
|
||||
|
||||
# Point to the grid-specific merged_final_tif directory
|
||||
merged_final_with_grid <- file.path(merged_final_base, grid_size)
|
||||
|
||||
# Set output directory for per-tile mosaics, organized by grid size
|
||||
# Output: weekly_tile_max/{grid_size}/week_WW_YYYY_TT.tif
|
||||
tile_output_base <- file.path(laravel_storage, "weekly_tile_max", grid_size)
|
||||
dir.create(tile_output_base, recursive = TRUE, showWarnings = FALSE)
|
||||
|
||||
created_tile_files <- create_weekly_mosaic_from_tiles(
|
||||
dates = dates,
|
||||
merged_final_dir = merged_final,
|
||||
merged_final_dir = merged_final_with_grid,
|
||||
tile_output_dir = tile_output_base,
|
||||
field_boundaries = field_boundaries
|
||||
)
|
||||
|
|
|
|||
|
|
@ -354,18 +354,17 @@ create_mosaic <- function(tif_files, cloud_coverage_stats, field_boundaries = NU
|
|||
}
|
||||
|
||||
# Get filenames of best-coverage images
|
||||
# Match by extracting tile ID from both cloud stats and TIF filenames
|
||||
# Match by full filename from cloud stats to TIF files
|
||||
rasters_to_use <- character()
|
||||
for (idx in best_coverage) {
|
||||
# Extract tile ID from cloud_coverage_stats filename (e.g., "tile_18" → 18)
|
||||
# Get the full filename from cloud coverage stats
|
||||
cc_filename <- cloud_coverage_stats$filename[idx]
|
||||
cc_tile_id <- gsub(".*_([0-9]+).*", "\\1", cc_filename)
|
||||
|
||||
# Find matching TIF file by matching tile ID
|
||||
# Find matching TIF file by full filename
|
||||
matching_tif <- NULL
|
||||
for (tif_file in tif_files) {
|
||||
tif_tile_id <- gsub(".*_([0-9]+)\\.tif", "\\1", basename(tif_file))
|
||||
if (tif_tile_id == cc_tile_id) {
|
||||
tif_basename <- basename(tif_file)
|
||||
if (tif_basename == cc_filename) {
|
||||
matching_tif <- tif_file
|
||||
break
|
||||
}
|
||||
|
|
@ -373,6 +372,8 @@ create_mosaic <- function(tif_files, cloud_coverage_stats, field_boundaries = NU
|
|||
|
||||
if (!is.null(matching_tif)) {
|
||||
rasters_to_use <- c(rasters_to_use, matching_tif)
|
||||
} else {
|
||||
safe_log(paste("Warning: Could not find TIF file matching cloud stats entry:", cc_filename), "WARNING")
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -420,42 +421,60 @@ create_mosaic <- function(tif_files, cloud_coverage_stats, field_boundaries = NU
|
|||
mosaic <- tryCatch({
|
||||
safe_log(paste("Creating max composite from", length(all_rasters), "images to fill clouds"))
|
||||
|
||||
# Get extent from field boundaries if available, otherwise use raster intersection
|
||||
if (!is.null(field_boundaries)) {
|
||||
crop_extent <- terra::ext(field_boundaries)
|
||||
safe_log("Using field boundaries extent for consistent area across all dates")
|
||||
# Check if all rasters have identical grids (extent and resolution)
|
||||
# This is likely for per-tile mosaics from the same tiling scheme
|
||||
reference_raster <- all_rasters[[1]]
|
||||
ref_ext <- terra::ext(reference_raster)
|
||||
ref_res <- terra::res(reference_raster)
|
||||
|
||||
grids_match <- all(sapply(all_rasters[-1], function(r) {
|
||||
isTRUE(all.equal(terra::ext(r), ref_ext, tolerance = 1e-6)) &&
|
||||
isTRUE(all.equal(terra::res(r), ref_res, tolerance = 1e-6))
|
||||
}))
|
||||
|
||||
if (grids_match) {
|
||||
# All rasters have matching grids - no cropping/resampling needed!
|
||||
safe_log("All rasters have identical grids - stacking directly for max composite")
|
||||
raster_collection <- terra::sprc(all_rasters)
|
||||
max_mosaic <- terra::mosaic(raster_collection, fun = "max")
|
||||
} else {
|
||||
# Fallback: use intersection of all raster extents
|
||||
crop_extent <- terra::ext(all_rasters[[1]])
|
||||
for (i in 2:length(all_rasters)) {
|
||||
crop_extent <- terra::intersect(crop_extent, terra::ext(all_rasters[[i]]))
|
||||
# Grids don't match - need to crop and resample
|
||||
safe_log("Rasters have different grids - cropping and resampling to common extent")
|
||||
|
||||
# Get extent from field boundaries if available, otherwise use raster union
|
||||
if (!is.null(field_boundaries)) {
|
||||
crop_extent <- terra::ext(field_boundaries)
|
||||
safe_log("Using field boundaries extent for consistent area across all dates")
|
||||
} else {
|
||||
# Use union of all extents (covers all data)
|
||||
crop_extent <- terra::ext(all_rasters[[1]])
|
||||
for (i in 2:length(all_rasters)) {
|
||||
crop_extent <- terra::union(crop_extent, terra::ext(all_rasters[[i]]))
|
||||
}
|
||||
safe_log("Using raster union extent")
|
||||
}
|
||||
safe_log("Using raster intersection extent")
|
||||
|
||||
# Crop all rasters to common extent
|
||||
cropped_rasters <- lapply(all_rasters, function(r) {
|
||||
terra::crop(r, crop_extent)
|
||||
})
|
||||
|
||||
# Resample all cropped rasters to match the first one's grid
|
||||
reference_grid <- cropped_rasters[[1]]
|
||||
|
||||
aligned_rasters <- lapply(cropped_rasters, function(r) {
|
||||
if (isTRUE(all.equal(terra::ext(r), terra::ext(reference_grid), tolerance = 1e-6)) &&
|
||||
isTRUE(all.equal(terra::res(r), terra::res(reference_grid), tolerance = 1e-6))) {
|
||||
return(r) # Already aligned
|
||||
}
|
||||
terra::resample(r, reference_grid, method = "near")
|
||||
})
|
||||
|
||||
# Create max composite using mosaic on aligned rasters
|
||||
raster_collection <- terra::sprc(aligned_rasters)
|
||||
max_mosaic <- terra::mosaic(raster_collection, fun = "max")
|
||||
}
|
||||
|
||||
# Crop all rasters to common extent
|
||||
cropped_rasters <- lapply(all_rasters, function(r) {
|
||||
terra::crop(r, crop_extent)
|
||||
})
|
||||
|
||||
# Resample all cropped rasters to match the first one's grid
|
||||
# This handles pixel grid misalignment from Python's dynamic extent adjustment
|
||||
reference_grid <- cropped_rasters[[1]]
|
||||
safe_log("Resampling rasters to common grid for stacking")
|
||||
|
||||
aligned_rasters <- lapply(cropped_rasters, function(r) {
|
||||
if (identical(terra::ext(r), terra::ext(reference_grid)) &&
|
||||
identical(terra::res(r), terra::res(reference_grid))) {
|
||||
return(r) # Already aligned
|
||||
}
|
||||
terra::resample(r, reference_grid, method = "near")
|
||||
})
|
||||
|
||||
# Create max composite using mosaic on aligned rasters
|
||||
# Resample ensures all rasters have matching grids (no resolution mismatch)
|
||||
raster_collection <- terra::sprc(aligned_rasters)
|
||||
max_mosaic <- terra::mosaic(raster_collection, fun = "max")
|
||||
|
||||
max_mosaic
|
||||
}, error = function(e) {
|
||||
safe_log(paste("Max composite creation failed:", e$message), "WARNING")
|
||||
|
|
@ -686,6 +705,16 @@ create_weekly_mosaic_from_tiles <- function(dates, merged_final_dir, tile_output
|
|||
next
|
||||
}
|
||||
|
||||
# DEBUG: Check mosaic content before saving
|
||||
safe_log(paste(" DEBUG: Mosaic tile", tile_id, "dimensions:", nrow(tile_mosaic), "x", ncol(tile_mosaic)))
|
||||
safe_log(paste(" DEBUG: Mosaic tile", tile_id, "bands:", terra::nlyr(tile_mosaic)))
|
||||
|
||||
# Check first band values
|
||||
band1 <- tile_mosaic[[1]]
|
||||
band1_min <- terra::global(band1, fun = "min", na.rm = TRUE)$min
|
||||
band1_max <- terra::global(band1, fun = "max", na.rm = TRUE)$max
|
||||
safe_log(paste(" DEBUG: Band 1 MIN=", round(band1_min, 2), "MAX=", round(band1_max, 2)))
|
||||
|
||||
# Step 2c: Save this tile's weekly MAX mosaic
|
||||
# Filename format: week_WW_YYYY_TT.tif (e.g., week_02_2026_01.tif for week 2, 2026, tile 1)
|
||||
tile_filename <- paste0("week_", sprintf("%02d", dates$week), "_", dates$year, "_",
|
||||
|
|
@ -763,7 +792,7 @@ count_cloud_coverage_for_tile <- function(tile_files, field_boundaries = NULL) {
|
|||
missing_pct <- round(100 - ((total_notna / total_pixels) * 100))
|
||||
|
||||
aggregated_results[[idx]] <- data.frame(
|
||||
filename = paste0("tile_", sprintf("%02d", as.integer(gsub(".*_([0-9]+)\\.tif", "\\1", basename(tile_file))))),
|
||||
filename = basename(tile_file), # Keep full filename: 2026-01-07_03.tif
|
||||
notNA = total_notna,
|
||||
total_pixels = total_pixels,
|
||||
missing_pixels_percentage = missing_pct,
|
||||
|
|
|
|||
|
|
@ -20,6 +20,8 @@
|
|||
# Option 2: Rscript 80_calculate_kpis.R 2026-01-14 angata 7
|
||||
# Arguments: [end_date] [project_dir] [offset_days]
|
||||
#
|
||||
# & "C:\Program Files\R\R-4.4.3\bin\x64\Rscript.exe" r_app/80_calculate_kpis.R 2026-01-12 angata 7
|
||||
#
|
||||
# Usage in run_full_pipeline.R:
|
||||
# source("r_app/80_calculate_kpis.R")
|
||||
# main()
|
||||
|
|
@ -122,7 +124,7 @@ STATUS_TRIGGERS <- data.frame(
|
|||
# TILE-AWARE HELPER FUNCTIONS
|
||||
# ============================================================================
|
||||
|
||||
get_tile_ids_for_field <- function(field_geom, tile_grid) {
|
||||
get_tile_ids_for_field <- function(field_geom, tile_grid, field_id = NULL) {
|
||||
if (inherits(field_geom, "sf")) {
|
||||
field_bbox <- sf::st_bbox(field_geom)
|
||||
field_xmin <- field_bbox["xmin"]
|
||||
|
|
@ -139,6 +141,17 @@ get_tile_ids_for_field <- function(field_geom, tile_grid) {
|
|||
stop("field_geom must be sf or terra::vect object")
|
||||
}
|
||||
|
||||
# DEBUG: Print bbox info for first field
|
||||
if (!is.null(field_id) && field_id == "1391") {
|
||||
message(paste("[DEBUG get_tile_ids] Field bbox - xmin:", field_xmin, "xmax:", field_xmax,
|
||||
"ymin:", field_ymin, "ymax:", field_ymax))
|
||||
message(paste("[DEBUG get_tile_ids] tile_grid sample: id=", tile_grid$id[1],
|
||||
"xmin=", tile_grid$xmin[1], "xmax=", tile_grid$xmax[1],
|
||||
"ymin=", tile_grid$ymin[1], "ymax=", tile_grid$ymax[1]))
|
||||
message(paste("[DEBUG get_tile_ids] tile_grid CRS:", sf::st_crs(tile_grid)))
|
||||
message(paste("[DEBUG get_tile_ids] field CRS:", sf::st_crs(field_geom)))
|
||||
}
|
||||
|
||||
intersecting_tiles <- tile_grid$id[
|
||||
!(tile_grid$xmax < field_xmin |
|
||||
tile_grid$xmin > field_xmax |
|
||||
|
|
@ -189,6 +202,21 @@ load_tiles_for_field <- function(field_geom, tile_ids, week_num, year, mosaic_di
|
|||
}
|
||||
|
||||
build_tile_grid <- function(mosaic_dir, week_num, year) {
|
||||
# Handle grid-size subdirectories (e.g., weekly_tile_max/5x5/)
|
||||
# First check if mosaic_dir contains grid-size subdirectories
|
||||
detected_grid_size <- NA
|
||||
if (dir.exists(mosaic_dir)) {
|
||||
subfolders <- list.dirs(mosaic_dir, full.names = FALSE, recursive = FALSE)
|
||||
grid_patterns <- grep("^\\d+x\\d+$", subfolders, value = TRUE)
|
||||
|
||||
if (length(grid_patterns) > 0) {
|
||||
# Use the first grid-size subdirectory found
|
||||
detected_grid_size <- grid_patterns[1]
|
||||
mosaic_dir <- file.path(mosaic_dir, detected_grid_size)
|
||||
message(paste(" Using grid-size subdirectory:", detected_grid_size))
|
||||
}
|
||||
}
|
||||
|
||||
tile_pattern <- sprintf("week_%02d_%d_([0-9]{2})\\.tif", week_num, year)
|
||||
tile_files <- list.files(mosaic_dir, pattern = tile_pattern, full.names = TRUE)
|
||||
|
||||
|
|
@ -230,7 +258,12 @@ build_tile_grid <- function(mosaic_dir, week_num, year) {
|
|||
stop("Could not extract extents from any tile files")
|
||||
}
|
||||
|
||||
return(tile_grid)
|
||||
# RETURN BOTH the grid AND the corrected mosaic directory path
|
||||
return(list(
|
||||
tile_grid = tile_grid,
|
||||
mosaic_dir = mosaic_dir,
|
||||
grid_size = detected_grid_size
|
||||
))
|
||||
}
|
||||
|
||||
# ============================================================================
|
||||
|
|
@ -399,14 +432,20 @@ load_historical_field_data <- function(project_dir, current_week, reports_dir, n
|
|||
USE_UNIFORM_AGE <- TRUE
|
||||
UNIFORM_PLANTING_DATE <- as.Date("2025-01-01")
|
||||
|
||||
extract_planting_dates <- function(harvesting_data) {
|
||||
extract_planting_dates <- function(harvesting_data, field_boundaries_sf = NULL) {
|
||||
if (USE_UNIFORM_AGE) {
|
||||
message(paste("Using uniform planting date for all fields:", UNIFORM_PLANTING_DATE))
|
||||
return(data.frame(
|
||||
field_id = character(),
|
||||
planting_date = as.Date(character()),
|
||||
stringsAsFactors = FALSE
|
||||
))
|
||||
# Return a data frame with all field IDs mapped to uniform planting date
|
||||
if (!is.null(field_boundaries_sf)) {
|
||||
return(data.frame(
|
||||
field_id = field_boundaries_sf$field,
|
||||
date = rep(UNIFORM_PLANTING_DATE, nrow(field_boundaries_sf)),
|
||||
stringsAsFactors = FALSE
|
||||
))
|
||||
} else {
|
||||
# Fallback if field_boundaries_sf not provided
|
||||
return(NULL)
|
||||
}
|
||||
}
|
||||
|
||||
if (is.null(harvesting_data) || nrow(harvesting_data) == 0) {
|
||||
|
|
@ -449,6 +488,11 @@ analyze_single_field <- function(field_idx, field_boundaries_sf, tile_grid, week
|
|||
}
|
||||
field_name <- field_id
|
||||
|
||||
# DEBUG: Print for first few fields
|
||||
if (field_idx <= 3) {
|
||||
message(paste("[DEBUG] Field", field_idx, ":", field_id))
|
||||
}
|
||||
|
||||
field_sf <- field_boundaries_sf[field_idx, ]
|
||||
if (sf::st_is_empty(field_sf) || any(is.na(sf::st_geometry(field_sf)))) {
|
||||
return(data.frame(
|
||||
|
|
@ -460,7 +504,14 @@ analyze_single_field <- function(field_idx, field_boundaries_sf, tile_grid, week
|
|||
field_area_ha <- as.numeric(sf::st_area(field_sf)) / 10000
|
||||
field_area_acres <- field_area_ha / 0.404686
|
||||
|
||||
tile_ids <- get_tile_ids_for_field(field_sf, tile_grid)
|
||||
tile_ids <- get_tile_ids_for_field(field_sf, tile_grid, field_id = field_id)
|
||||
|
||||
# DEBUG: Print tile IDs for first field
|
||||
if (field_idx == 1) {
|
||||
message(paste("[DEBUG] First field tile_ids:", paste(tile_ids, collapse=",")))
|
||||
message(paste("[DEBUG] tile_grid nrows:", nrow(tile_grid), "ncols:", ncol(tile_grid)))
|
||||
message(paste("[DEBUG] mosaic_dir:", mosaic_dir))
|
||||
}
|
||||
|
||||
current_ci <- load_tiles_for_field(field_sf, tile_ids, week_num, year, mosaic_dir)
|
||||
|
||||
|
|
@ -471,10 +522,26 @@ analyze_single_field <- function(field_idx, field_boundaries_sf, tile_grid, week
|
|||
))
|
||||
}
|
||||
|
||||
field_vect <- terra::vect(sf::as_Spatial(field_sf))
|
||||
terra::crs(field_vect) <- terra::crs(current_ci)
|
||||
# Extract CI values: EXACTLY LIKE SCRIPT 20
|
||||
# Crop to field bounding box first, then extract with sf directly (not terra::vect conversion)
|
||||
field_bbox <- sf::st_bbox(field_sf)
|
||||
ci_cropped <- terra::crop(current_ci, terra::ext(field_bbox), snap = "out")
|
||||
extracted_vals <- terra::extract(ci_cropped, field_sf, fun = "mean", na.rm = TRUE)
|
||||
|
||||
all_extracted <- terra::extract(current_ci, field_vect)[, 2]
|
||||
# extracted_vals is a data.frame with ID column (field index) + mean value
|
||||
mean_ci_current <- as.numeric(extracted_vals[1, 2])
|
||||
|
||||
if (is.na(mean_ci_current)) {
|
||||
return(data.frame(
|
||||
Field_id = field_id,
|
||||
error = "No CI values extracted from tiles"
|
||||
))
|
||||
}
|
||||
|
||||
# For per-tile extraction, we only have mean from the aggregation function
|
||||
# To get variance/CV, we need to extract all pixels without the fun parameter
|
||||
# But for farm-level purposes, the mean CI is sufficient
|
||||
all_extracted <- terra::extract(ci_cropped, field_sf)[, 2]
|
||||
current_ci_vals <- all_extracted[!is.na(all_extracted)]
|
||||
|
||||
num_total <- length(all_extracted)
|
||||
|
|
@ -509,7 +576,9 @@ analyze_single_field <- function(field_idx, field_boundaries_sf, tile_grid, week
|
|||
tryCatch({
|
||||
previous_ci <- load_tiles_for_field(field_sf, tile_ids, week_num - 1, year, mosaic_dir)
|
||||
if (!is.null(previous_ci)) {
|
||||
prev_extracted <- terra::extract(previous_ci, field_vect)[, 2]
|
||||
prev_bbox <- sf::st_bbox(field_sf)
|
||||
prev_ci_cropped <- terra::crop(previous_ci, terra::ext(prev_bbox), snap = "out")
|
||||
prev_extracted <- terra::extract(prev_ci_cropped, field_sf)[, 2]
|
||||
previous_ci_vals <- prev_extracted[!is.na(prev_extracted)]
|
||||
if (length(previous_ci_vals) > 0) {
|
||||
mean_ci_previous <- mean(previous_ci_vals, na.rm = TRUE)
|
||||
|
|
@ -763,6 +832,13 @@ generate_field_analysis_summary <- function(field_df) {
|
|||
export_field_analysis_excel <- function(field_df, summary_df, project_dir, current_week, reports_dir) {
|
||||
message("Exporting per-field analysis to Excel, CSV, and RDS...")
|
||||
|
||||
# Round all numeric columns to 2 decimals
|
||||
field_df_rounded <- field_df %>%
|
||||
mutate(across(where(is.numeric), ~ round(., 2)))
|
||||
|
||||
summary_df_rounded <- summary_df %>%
|
||||
mutate(across(where(is.numeric), ~ round(., 2)))
|
||||
|
||||
output_subdir <- file.path(reports_dir, "kpis", "field_analysis")
|
||||
if (!dir.exists(output_subdir)) {
|
||||
dir.create(output_subdir, recursive = TRUE)
|
||||
|
|
@ -773,16 +849,16 @@ export_field_analysis_excel <- function(field_df, summary_df, project_dir, curre
|
|||
excel_path <- normalizePath(excel_path, winslash = "\\", mustWork = FALSE)
|
||||
|
||||
sheets <- list(
|
||||
"Field Data" = field_df,
|
||||
"Summary" = summary_df
|
||||
"Field Data" = field_df_rounded,
|
||||
"Summary" = summary_df_rounded
|
||||
)
|
||||
|
||||
write_xlsx(sheets, excel_path)
|
||||
message(paste("✓ Field analysis Excel exported to:", excel_path))
|
||||
|
||||
kpi_data <- list(
|
||||
field_analysis = field_df,
|
||||
field_analysis_summary = summary_df,
|
||||
field_analysis = field_df_rounded,
|
||||
field_analysis_summary = summary_df_rounded,
|
||||
metadata = list(
|
||||
current_week = current_week,
|
||||
project = project_dir,
|
||||
|
|
@ -798,84 +874,254 @@ export_field_analysis_excel <- function(field_df, summary_df, project_dir, curre
|
|||
|
||||
csv_filename <- paste0(project_dir, "_field_analysis_week", sprintf("%02d", current_week), ".csv")
|
||||
csv_path <- file.path(output_subdir, csv_filename)
|
||||
write_csv(field_df, csv_path)
|
||||
write_csv(field_df_rounded, csv_path)
|
||||
message(paste("✓ Field analysis CSV exported to:", csv_path))
|
||||
|
||||
return(list(excel = excel_path, rds = rds_path, csv = csv_path))
|
||||
}
|
||||
|
||||
# ============================================================================
|
||||
# FARM-LEVEL KPI CALCULATION (FROM OLD 09_CALCULATE_KPIS.R)
|
||||
# TILE-BASED KPI EXTRACTION FUNCTION
|
||||
# ============================================================================
|
||||
|
||||
calculate_field_kpis_from_tiles <- function(tile_dir, week_num, year, field_boundaries_sf, tile_grid) {
|
||||
# Loop through tiles, extract KPI statistics per field per tile
|
||||
# Follows the same pattern as extract_ci_from_tiles in CI extraction
|
||||
|
||||
message("Calculating field-level KPI statistics from tiles...")
|
||||
|
||||
# Get all tile files for this week
|
||||
tile_pattern <- sprintf("week_%02d_%d_([0-9]{2})\\.tif", week_num, year)
|
||||
tile_files <- list.files(tile_dir, pattern = tile_pattern, full.names = TRUE)
|
||||
|
||||
if (length(tile_files) == 0) {
|
||||
message("No tiles found for week", week_num, year)
|
||||
return(NULL)
|
||||
}
|
||||
|
||||
# Process tiles in parallel using furrr (same as CI extraction)
|
||||
message(paste("Processing", length(tile_files), "tiles in parallel..."))
|
||||
|
||||
field_kpi_list <- furrr::future_map(
|
||||
tile_files,
|
||||
~ process_single_kpi_tile(
|
||||
tile_file = .,
|
||||
field_boundaries_sf = field_boundaries_sf,
|
||||
tile_grid = tile_grid
|
||||
),
|
||||
.progress = TRUE,
|
||||
.options = furrr::furrr_options(seed = TRUE)
|
||||
)
|
||||
|
||||
# Combine results from all tiles
|
||||
field_kpi_stats <- dplyr::bind_rows(field_kpi_list)
|
||||
|
||||
if (nrow(field_kpi_stats) == 0) {
|
||||
message(" No KPI data extracted from tiles")
|
||||
return(NULL)
|
||||
}
|
||||
|
||||
message(paste(" Extracted KPI stats for", length(unique(field_kpi_stats$field)), "unique fields"))
|
||||
return(field_kpi_stats)
|
||||
}
|
||||
|
||||
# Helper function to process a single tile (like process_single_tile in CI extraction)
|
||||
process_single_kpi_tile <- function(tile_file, field_boundaries_sf, tile_grid) {
|
||||
tryCatch({
|
||||
tile_basename <- basename(tile_file)
|
||||
# Load tile raster
|
||||
tile_raster <- terra::rast(tile_file)
|
||||
|
||||
# Get first band (CI band for weekly mosaics)
|
||||
ci_band <- tile_raster[[1]]
|
||||
|
||||
# EXACTLY LIKE SCRIPT 20: Crop to field bounding box first, then extract with sf directly
|
||||
field_bbox <- sf::st_bbox(field_boundaries_sf)
|
||||
ci_cropped <- terra::crop(ci_band, terra::ext(field_bbox), snap = "out")
|
||||
|
||||
# Extract CI values for ALL fields at once using sf object directly (NOT terra::vect)
|
||||
# terra::extract() works with sf objects and handles geometries properly
|
||||
extracted_vals <- terra::extract(ci_cropped, field_boundaries_sf, fun = "mean", na.rm = TRUE)
|
||||
|
||||
# Initialize results for this tile
|
||||
tile_results <- data.frame()
|
||||
|
||||
# Get tile ID from filename
|
||||
tile_id_match <- as.numeric(sub(".*_(\\d{2})\\.tif$", "\\1", tile_basename))
|
||||
|
||||
# Process each field: extracted_vals is a data.frame with ID column (field indices) + extracted values
|
||||
for (field_idx in seq_len(nrow(field_boundaries_sf))) {
|
||||
field_id <- field_boundaries_sf$field[field_idx]
|
||||
|
||||
# extracted_vals columns: 1=ID, 2=mean_CI (since we used fun="mean")
|
||||
mean_ci <- extracted_vals[field_idx, 2]
|
||||
|
||||
# Skip if no data for this field in this tile
|
||||
if (is.na(mean_ci)) {
|
||||
next
|
||||
}
|
||||
|
||||
# For tile-level stats, we only have mean from extraction (no variance without all pixels)
|
||||
# Add to results
|
||||
tile_results <- rbind(tile_results, data.frame(
|
||||
field = field_id,
|
||||
tile_id = tile_id_match,
|
||||
tile_file = tile_basename,
|
||||
mean_ci = round(mean_ci, 4),
|
||||
stringsAsFactors = FALSE
|
||||
))
|
||||
}
|
||||
|
||||
return(tile_results)
|
||||
|
||||
}, error = function(e) {
|
||||
message(paste(" Warning: Error processing tile", basename(tile_file), ":", e$message))
|
||||
return(data.frame())
|
||||
})
|
||||
}
|
||||
|
||||
calculate_and_export_farm_kpis <- function(report_date, project_dir, field_boundaries_sf,
|
||||
harvesting_data, cumulative_CI_vals_dir,
|
||||
weekly_CI_mosaic, reports_dir) {
|
||||
weekly_CI_mosaic, reports_dir, current_week, year,
|
||||
tile_grid, use_tile_mosaic = FALSE, tile_grid_size = "5x5") {
|
||||
message("\n=== CALCULATING FARM-LEVEL KPIs ===")
|
||||
message("(6 high-level KPI metrics)")
|
||||
|
||||
tryCatch({
|
||||
source(here("r_app", "kpi_utils.R"))
|
||||
}, error = function(e) {
|
||||
message(paste("Warning: Could not load kpi_utils.R:", e$message))
|
||||
message("Farm-level KPIs will be skipped")
|
||||
return(NULL)
|
||||
})
|
||||
|
||||
if (!exists("calculate_all_kpis")) {
|
||||
message("Warning: calculate_all_kpis() function not found in kpi_utils.R")
|
||||
return(NULL)
|
||||
}
|
||||
message("(6 high-level KPI metrics with tile-based extraction)")
|
||||
|
||||
output_dir <- file.path(reports_dir, "kpis")
|
||||
if (!dir.exists(output_dir)) {
|
||||
dir.create(output_dir, recursive = TRUE)
|
||||
}
|
||||
|
||||
tryCatch({
|
||||
kpi_results <- calculate_all_kpis(
|
||||
report_date = report_date,
|
||||
output_dir = output_dir,
|
||||
field_boundaries_sf = field_boundaries_sf,
|
||||
harvesting_data = harvesting_data,
|
||||
cumulative_CI_vals_dir = cumulative_CI_vals_dir,
|
||||
weekly_CI_mosaic = weekly_CI_mosaic,
|
||||
reports_dir = reports_dir,
|
||||
project_dir = project_dir
|
||||
# Get mosaic directory with grid size if using tiles
|
||||
mosaic_dir <- if (use_tile_mosaic && !is.null(tile_grid_size)) {
|
||||
file.path(weekly_CI_mosaic, tile_grid_size)
|
||||
} else {
|
||||
weekly_CI_mosaic
|
||||
}
|
||||
|
||||
# Extract field-level KPI statistics from tiles
|
||||
field_kpi_stats <- calculate_field_kpis_from_tiles(
|
||||
tile_dir = mosaic_dir,
|
||||
week_num = current_week,
|
||||
year = year,
|
||||
field_boundaries_sf = field_boundaries_sf,
|
||||
tile_grid = tile_grid
|
||||
)
|
||||
|
||||
if (is.null(field_kpi_stats) || nrow(field_kpi_stats) == 0) {
|
||||
message("Warning: No field KPI statistics extracted from tiles")
|
||||
return(NULL)
|
||||
}
|
||||
|
||||
# Aggregate tile-based statistics by field (average across tiles for each field)
|
||||
field_summary_stats <- field_kpi_stats %>%
|
||||
dplyr::group_by(field) %>%
|
||||
dplyr::summarise(
|
||||
mean_ci = mean(mean_ci, na.rm = TRUE),
|
||||
cv_ci = mean(cv_ci, na.rm = TRUE),
|
||||
min_ci = min(min_ci, na.rm = TRUE),
|
||||
max_ci = max(max_ci, na.rm = TRUE),
|
||||
total_pixels = sum(n_pixels, na.rm = TRUE),
|
||||
num_tiles = n_distinct(tile_id),
|
||||
.groups = 'drop'
|
||||
)
|
||||
|
||||
# Print KPI summary
|
||||
cat("\n=== FARM-LEVEL KPI SUMMARY ===\n")
|
||||
cat("Report Date:", as.character(kpi_results$metadata$report_date), "\n")
|
||||
cat("Current Week:", kpi_results$metadata$current_week, "\n")
|
||||
cat("Previous Week:", kpi_results$metadata$previous_week, "\n")
|
||||
cat("Total Fields Analyzed:", kpi_results$metadata$total_fields, "\n")
|
||||
cat("Calculation Time:", as.character(kpi_results$metadata$calculation_time), "\n")
|
||||
# Create results list
|
||||
kpi_results <- list(
|
||||
field_kpi_stats = field_kpi_stats,
|
||||
field_summary_stats = field_summary_stats,
|
||||
metadata = list(
|
||||
report_date = report_date,
|
||||
current_week = current_week,
|
||||
year = year,
|
||||
calculation_method = "tile_based_extraction",
|
||||
num_fields_processed = length(unique(field_kpi_stats$field)),
|
||||
num_tiles_processed = length(unique(field_kpi_stats$tile_id))
|
||||
)
|
||||
)
|
||||
|
||||
cat("\n--- KPI Metrics ---\n")
|
||||
cat("Field Uniformity Summary:\n")
|
||||
print(kpi_results$field_uniformity_summary)
|
||||
# Save results
|
||||
rds_filename <- paste0(project_dir, "_farm_kpi_stats_week", sprintf("%02d", current_week), ".rds")
|
||||
rds_path <- file.path(output_dir, rds_filename)
|
||||
saveRDS(kpi_results, rds_path)
|
||||
message(paste("✓ Farm-level KPI stats exported to:", rds_path))
|
||||
|
||||
cat("\nArea Change Summary:\n")
|
||||
print(kpi_results$area_change)
|
||||
# Print summary
|
||||
cat("\n=== FARM-LEVEL KPI SUMMARY ===\n")
|
||||
cat("Report Date:", as.character(report_date), "\n")
|
||||
cat("Week:", current_week, "Year:", year, "\n")
|
||||
cat("Fields Processed:", length(unique(field_kpi_stats$field)), "\n")
|
||||
cat("Tiles Processed:", length(unique(field_kpi_stats$tile_id)), "\n")
|
||||
cat("\n--- Field Summary Statistics (Mean across tiles) ---\n")
|
||||
print(head(field_summary_stats, 20))
|
||||
|
||||
cat("\nTCH Forecasted:\n")
|
||||
print(kpi_results$tch_forecasted)
|
||||
return(kpi_results)
|
||||
}
|
||||
|
||||
cat("\nGrowth Decline Index:\n")
|
||||
print(kpi_results$growth_decline)
|
||||
# ============================================================================
|
||||
# HELPER: Extract field-level statistics from CI raster (all pixels, single call)
|
||||
# ============================================================================
|
||||
|
||||
cat("\nWeed Presence Score:\n")
|
||||
print(kpi_results$weed_presence)
|
||||
extract_field_statistics_from_ci <- function(ci_band, field_boundaries_sf) {
|
||||
#' Extract CI statistics for all fields from a single CI raster band
|
||||
#'
|
||||
#' This function extracts all pixel values for each field in one terra::extract call,
|
||||
#' then calculates mean, CV, and percentiles from those pixels.
|
||||
#'
|
||||
#' @param ci_band Single CI band from terra raster
|
||||
#' @param field_boundaries_sf SF object with field geometries
|
||||
#' @return Data frame with columns: field_idx, mean_ci, cv, p10, p90, pixel_count
|
||||
|
||||
cat("\nGap Filling Score:\n")
|
||||
print(kpi_results$gap_filling)
|
||||
# Extract all pixels for all fields at once (more efficient than individual calls)
|
||||
all_pixels <- terra::extract(ci_band, field_boundaries_sf)
|
||||
|
||||
return(kpi_results)
|
||||
}, error = function(e) {
|
||||
message(paste("Error calculating farm-level KPIs:", e$message))
|
||||
return(NULL)
|
||||
})
|
||||
# Calculate statistics for each field
|
||||
stats_list <- list()
|
||||
|
||||
for (field_idx in seq_len(nrow(field_boundaries_sf))) {
|
||||
# Extract pixel values for this field (skip ID column 1)
|
||||
pixels <- all_pixels[field_idx, -1, drop = TRUE]
|
||||
pixels <- as.numeric(pixels)
|
||||
pixels <- pixels[!is.na(pixels)]
|
||||
|
||||
# Only calculate stats if we have pixels
|
||||
if (length(pixels) > 0) {
|
||||
mean_val <- mean(pixels, na.rm = TRUE)
|
||||
|
||||
# Only calculate CV if mean > 0 (avoid division by zero)
|
||||
if (mean_val > 0) {
|
||||
cv_val <- sd(pixels, na.rm = TRUE) / mean_val
|
||||
} else {
|
||||
cv_val <- NA
|
||||
}
|
||||
|
||||
p10_val <- quantile(pixels, probs = CI_PERCENTILE_LOW, na.rm = TRUE)[[1]]
|
||||
p90_val <- quantile(pixels, probs = CI_PERCENTILE_HIGH, na.rm = TRUE)[[1]]
|
||||
|
||||
stats_list[[field_idx]] <- data.frame(
|
||||
field_idx = field_idx,
|
||||
mean_ci = mean_val,
|
||||
cv = cv_val,
|
||||
p10 = p10_val,
|
||||
p90 = p90_val,
|
||||
pixel_count = length(pixels),
|
||||
stringsAsFactors = FALSE
|
||||
)
|
||||
} else {
|
||||
# No pixels for this field (doesn't intersect tile)
|
||||
stats_list[[field_idx]] <- data.frame(
|
||||
field_idx = field_idx,
|
||||
mean_ci = NA_real_,
|
||||
cv = NA_real_,
|
||||
p10 = NA_real_,
|
||||
p90 = NA_real_,
|
||||
pixel_count = 0,
|
||||
stringsAsFactors = FALSE
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
return(dplyr::bind_rows(stats_list))
|
||||
}
|
||||
|
||||
# ============================================================================
|
||||
|
|
@ -923,7 +1169,7 @@ main <- function() {
|
|||
message("")
|
||||
|
||||
# Load configuration and utilities
|
||||
source(here("r_app", "crop_messaging_utils.R"))
|
||||
# source(here("r_app", "crop_messaging_utils.R"))
|
||||
|
||||
tryCatch({
|
||||
source(here("r_app", "parameters_project.R"))
|
||||
|
|
@ -950,10 +1196,29 @@ main <- function() {
|
|||
|
||||
message(paste("Week:", current_week, "/ Year:", year))
|
||||
|
||||
message("Building tile grid from available weekly tiles...")
|
||||
tile_grid <- build_tile_grid(weekly_tile_max, current_week, year)
|
||||
message(paste(" Found", nrow(tile_grid), "tiles"))
|
||||
# Find tile files - approach from Script 20
|
||||
message("Finding tile files...")
|
||||
tile_pattern <- sprintf("week_%02d_%d_([0-9]{2})\\.tif", current_week, year)
|
||||
|
||||
# Detect grid size subdirectory
|
||||
detected_grid_size <- NA
|
||||
if (dir.exists(weekly_tile_max)) {
|
||||
subfolders <- list.dirs(weekly_tile_max, full.names = FALSE, recursive = FALSE)
|
||||
grid_patterns <- grep("^\\d+x\\d+$", subfolders, value = TRUE)
|
||||
if (length(grid_patterns) > 0) {
|
||||
detected_grid_size <- grid_patterns[1]
|
||||
mosaic_dir <- file.path(weekly_tile_max, detected_grid_size)
|
||||
message(paste(" Using grid-size subdirectory:", detected_grid_size))
|
||||
}
|
||||
}
|
||||
|
||||
tile_files <- list.files(mosaic_dir, pattern = tile_pattern, full.names = TRUE)
|
||||
if (length(tile_files) == 0) {
|
||||
stop(paste("No tile files found for week", current_week, year, "in", mosaic_dir))
|
||||
}
|
||||
message(paste(" Found", length(tile_files), "tiles"))
|
||||
|
||||
# Load field boundaries
|
||||
tryCatch({
|
||||
boundaries_result <- load_field_boundaries(data_dir)
|
||||
|
||||
|
|
@ -979,38 +1244,217 @@ main <- function() {
|
|||
}
|
||||
historical_data <- load_historical_field_data(project_dir, current_week, reports_dir, num_weeks = num_weeks_to_load)
|
||||
|
||||
planting_dates <- extract_planting_dates(harvesting_data)
|
||||
planting_dates <- extract_planting_dates(harvesting_data, field_boundaries_sf)
|
||||
|
||||
message("Setting up parallel processing...")
|
||||
current_plan <- class(future::plan())[1]
|
||||
if (current_plan == "sequential") {
|
||||
num_workers <- parallel::detectCores() - 1
|
||||
message(paste(" Using", num_workers, "workers"))
|
||||
future::plan(future::multisession, workers = num_workers)
|
||||
# Validate planting_dates
|
||||
if (is.null(planting_dates) || nrow(planting_dates) == 0) {
|
||||
message("WARNING: No planting dates available. Using NA for all fields.")
|
||||
planting_dates <- data.frame(
|
||||
field_id = field_boundaries_sf$field,
|
||||
date = rep(as.Date(NA), nrow(field_boundaries_sf)),
|
||||
stringsAsFactors = FALSE
|
||||
)
|
||||
}
|
||||
|
||||
message("Analyzing", nrow(field_boundaries_sf), "fields in parallel...")
|
||||
# SCRIPT 20 APPROACH: Loop through tiles, extract all fields from each tile
|
||||
message("\nProcessing tiles and extracting field statistics...")
|
||||
all_tile_results <- list()
|
||||
|
||||
field_analysis_list <- furrr::future_map(
|
||||
seq_len(nrow(field_boundaries_sf)),
|
||||
~ analyze_single_field(
|
||||
field_idx = .,
|
||||
field_boundaries_sf = field_boundaries_sf,
|
||||
tile_grid = tile_grid,
|
||||
week_num = current_week,
|
||||
year = year,
|
||||
mosaic_dir = weekly_tile_max,
|
||||
historical_data = historical_data,
|
||||
planting_dates = planting_dates,
|
||||
report_date = end_date,
|
||||
harvest_imminence_data = NULL,
|
||||
harvesting_data = harvesting_data
|
||||
),
|
||||
.progress = TRUE,
|
||||
.options = furrr::furrr_options(seed = TRUE)
|
||||
)
|
||||
for (i in seq_along(tile_files)) {
|
||||
tile_file <- tile_files[i]
|
||||
message(paste(" Processing tile", i, "of", length(tile_files), ":", basename(tile_file)))
|
||||
|
||||
field_analysis_df <- dplyr::bind_rows(field_analysis_list)
|
||||
tryCatch({
|
||||
# Load current tile and previous week tile
|
||||
current_rast <- terra::rast(tile_file)
|
||||
|
||||
# DEBUG: Check tile structure on first tile
|
||||
if (i == 1) {
|
||||
message(paste(" [DEBUG] Tile CRS:", terra::crs(current_rast)))
|
||||
message(paste(" [DEBUG] Tile extent:", paste(terra::ext(current_rast))))
|
||||
message(paste(" [DEBUG] Field boundaries CRS:", sf::st_crs(field_boundaries_sf)))
|
||||
field_bbox <- sf::st_bbox(field_boundaries_sf)
|
||||
message(paste(" [DEBUG] Field bbox:", paste(round(field_bbox, 2))))
|
||||
message(paste(" [DEBUG] Band names:", paste(names(current_rast), collapse=", ")))
|
||||
}
|
||||
|
||||
# Extract CI band by name
|
||||
ci_band <- current_rast[["CI"]]
|
||||
|
||||
# Check if CI band exists - use proper logical checks
|
||||
if (is.null(ci_band) || !inherits(ci_band, "SpatRaster")) {
|
||||
message(paste(" ERROR: CI band not found. Available bands:", paste(names(current_rast), collapse=", ")))
|
||||
next
|
||||
}
|
||||
|
||||
# Check if CI band has any valid data
|
||||
if (tryCatch(all(is.na(values(ci_band))), error = function(e) TRUE)) {
|
||||
message(paste(" ERROR: CI band has no valid data"))
|
||||
next
|
||||
}
|
||||
|
||||
# Load previous week tile if available
|
||||
previous_tile_file <- sub(sprintf("week_%02d", current_week),
|
||||
sprintf("week_%02d", previous_week),
|
||||
tile_file)
|
||||
previous_ci <- NULL
|
||||
if (file.exists(previous_tile_file)) {
|
||||
previous_rast <- terra::rast(previous_tile_file)
|
||||
previous_ci <- previous_rast[["CI"]]
|
||||
}
|
||||
|
||||
# OPTION 1 + 2: Extract all CI statistics from one pixel extraction (single call)
|
||||
current_stats <- extract_field_statistics_from_ci(ci_band, field_boundaries_sf)
|
||||
|
||||
# DEBUG: Check extraction result on first tile
|
||||
if (i == 1) {
|
||||
num_with_data <- sum(!is.na(current_stats$mean_ci))
|
||||
message(paste(" [DEBUG] Extracted", nrow(current_stats), "fields, ", num_with_data, "with non-NA data"))
|
||||
if (num_with_data > 0) {
|
||||
message(paste(" [DEBUG] Sample mean CIs:", paste(head(current_stats$mean_ci[!is.na(current_stats$mean_ci)], 3), collapse=", ")))
|
||||
}
|
||||
}
|
||||
|
||||
# Extract previous week CI statistics if available
|
||||
previous_stats <- NULL
|
||||
if (!is.null(previous_ci)) {
|
||||
previous_stats <- extract_field_statistics_from_ci(previous_ci, field_boundaries_sf)
|
||||
}
|
||||
|
||||
# Process each field that was extracted
|
||||
field_results_this_tile <- list()
|
||||
fields_added <- 0
|
||||
|
||||
for (field_idx in seq_len(nrow(field_boundaries_sf))) {
|
||||
tryCatch({
|
||||
field_id <- field_boundaries_sf$field[field_idx]
|
||||
field_sf <- field_boundaries_sf[field_idx, ]
|
||||
|
||||
# Get statistics from helper function results
|
||||
# current_stats should have same number of rows as field_boundaries_sf
|
||||
if (field_idx > nrow(current_stats)) {
|
||||
message(paste(" [ERROR] field_idx", field_idx, "> nrow(current_stats)", nrow(current_stats)))
|
||||
next
|
||||
}
|
||||
|
||||
mean_ci_current <- current_stats$mean_ci[field_idx]
|
||||
pixel_count <- current_stats$pixel_count[field_idx]
|
||||
|
||||
# SKIP fields with no data in this tile (they don't intersect this tile)
|
||||
if (is.na(pixel_count) || pixel_count == 0) {
|
||||
next
|
||||
}
|
||||
ci_cv_current <- current_stats$cv[field_idx]
|
||||
ci_percentile_low <- current_stats$p10[field_idx]
|
||||
ci_percentile_high <- current_stats$p90[field_idx]
|
||||
|
||||
# If field doesn't intersect this tile, mean_ci_current will be NA
|
||||
if (is.na(mean_ci_current)) {
|
||||
next # Skip this field - doesn't intersect this tile
|
||||
}
|
||||
|
||||
field_area_ha <- as.numeric(sf::st_area(field_sf)) / 10000
|
||||
field_area_acres <- field_area_ha / 0.404686
|
||||
|
||||
# Extract previous week CI if available
|
||||
mean_ci_previous <- NA
|
||||
ci_change <- NA
|
||||
if (!is.null(previous_stats)) {
|
||||
mean_ci_previous <- previous_stats$mean_ci[field_idx]
|
||||
if (!is.na(mean_ci_previous)) {
|
||||
ci_change <- mean_ci_current - mean_ci_previous
|
||||
}
|
||||
}
|
||||
|
||||
# Reconstruct pixel values for status trigger (we need the actual pixel array)
|
||||
# Use the percentiles and mean to create a synthetic distribution for status_trigger
|
||||
# For now, use mean CI repeated by pixel count for testing
|
||||
# TODO: Consider extracting pixels directly if needed for more complex triggers
|
||||
pixel_count <- current_stats$pixel_count[field_idx]
|
||||
ci_vals_current <- if (pixel_count > 0) {
|
||||
rep(mean_ci_current, pixel_count) # Simplified: use mean value repeated
|
||||
} else {
|
||||
numeric(0)
|
||||
}
|
||||
|
||||
# Calculate age
|
||||
age_weeks <- if (!is.null(planting_dates) && nrow(planting_dates) > 0 && field_idx <= nrow(planting_dates)) {
|
||||
planting_date <- planting_dates$date[field_idx]
|
||||
if (!is.na(planting_date)) {
|
||||
as.numeric(difftime(end_date, planting_date, units = "weeks"))
|
||||
} else {
|
||||
0
|
||||
}
|
||||
} else {
|
||||
0
|
||||
}
|
||||
|
||||
# Get phase and status
|
||||
phase <- get_phase_by_age(age_weeks)
|
||||
status_trigger <- get_status_trigger(ci_vals_current, ci_change, age_weeks)
|
||||
|
||||
# Cloud coverage categorization based on CI value
|
||||
# No data = No image available
|
||||
# CI 0.01 to 95 = Partial coverage
|
||||
# CI >= 95 = Clear view
|
||||
if (is.na(mean_ci_current) || mean_ci_current == 0) {
|
||||
cloud_category <- "No image available"
|
||||
# Set all CI metrics to NA since no valid data
|
||||
ci_change <- NA
|
||||
ci_cv_current <- NA
|
||||
ci_percentile_low <- NA
|
||||
ci_percentile_high <- NA
|
||||
} else if (mean_ci_current >= 95) {
|
||||
cloud_category <- "Clear view"
|
||||
} else {
|
||||
cloud_category <- "Partial coverage"
|
||||
}
|
||||
|
||||
# Build result row
|
||||
result_row <- data.frame(
|
||||
Field_id = field_id,
|
||||
Acreage = field_area_acres,
|
||||
Mean_CI = mean_ci_current,
|
||||
Mean_CI_prev = mean_ci_previous,
|
||||
CI_change = ci_change,
|
||||
CI_CV = ci_cv_current,
|
||||
CI_percentile_low = ci_percentile_low,
|
||||
CI_percentile_high = ci_percentile_high,
|
||||
Age_weeks = age_weeks,
|
||||
Phase = phase,
|
||||
Status_trigger = status_trigger,
|
||||
Cloud_category = cloud_category,
|
||||
stringsAsFactors = FALSE
|
||||
)
|
||||
|
||||
field_results_this_tile[[as.character(field_id)]] <- result_row
|
||||
fields_added <- fields_added + 1
|
||||
|
||||
}, error = function(e) {
|
||||
# Show error for debugging
|
||||
message(paste(" [FIELD ERROR] Field", field_idx, ":", e$message))
|
||||
})
|
||||
}
|
||||
|
||||
if (length(field_results_this_tile) > 0) {
|
||||
all_tile_results[[basename(tile_file)]] <- dplyr::bind_rows(field_results_this_tile)
|
||||
message(paste(" Extracted", length(field_results_this_tile), "fields from tile (processed", fields_added, "fields total)"))
|
||||
} else {
|
||||
message(paste(" WARNING: No fields extracted from this tile (processed", fields_added, "fields, all either NA or errored)"))
|
||||
}
|
||||
|
||||
}, error = function(e) {
|
||||
message(paste(" Error processing tile", basename(tile_file), ":", e$message))
|
||||
})
|
||||
}
|
||||
|
||||
# Combine all tile results, keeping unique fields (may appear in multiple tiles)
|
||||
if (length(all_tile_results) == 0) {
|
||||
stop("No fields extracted from any tiles!")
|
||||
}
|
||||
|
||||
field_analysis_df <- dplyr::bind_rows(all_tile_results) %>%
|
||||
distinct(Field_id, .keep_all = TRUE)
|
||||
|
||||
if (nrow(field_analysis_df) == 0) {
|
||||
stop("No fields analyzed successfully!")
|
||||
|
|
@ -1038,17 +1482,90 @@ main <- function() {
|
|||
cat("\n--- Summary Statistics ---\n")
|
||||
print(summary_statistics_df)
|
||||
|
||||
# ========== FARM-LEVEL KPI CALCULATION ==========
|
||||
# ========== FARM-LEVEL KPI AGGREGATION ==========
|
||||
# Aggregate the per-field analysis into farm-level summary statistics
|
||||
|
||||
farm_kpi_results <- calculate_and_export_farm_kpis(
|
||||
report_date = end_date,
|
||||
project_dir = project_dir,
|
||||
field_boundaries_sf = field_boundaries_sf,
|
||||
harvesting_data = harvesting_data,
|
||||
cumulative_CI_vals_dir = cumulative_CI_vals_dir,
|
||||
weekly_CI_mosaic = weekly_CI_mosaic,
|
||||
reports_dir = reports_dir
|
||||
)
|
||||
cat("\n=== CALCULATING FARM-LEVEL KPI SUMMARY ===\n")
|
||||
|
||||
# Filter to only fields that have actual data (non-NA CI and valid acreage)
|
||||
field_data <- field_analysis_df %>%
|
||||
filter(!is.na(Mean_CI) & !is.na(Acreage)) %>%
|
||||
filter(Acreage > 0)
|
||||
|
||||
if (nrow(field_data) > 0) {
|
||||
|
||||
if (nrow(field_data) > 0) {
|
||||
# Create summary statistics
|
||||
farm_summary <- list()
|
||||
|
||||
# 1. PHASE DISTRIBUTION
|
||||
phase_dist <- field_data %>%
|
||||
group_by(Phase) %>%
|
||||
summarise(
|
||||
num_fields = n(),
|
||||
acreage = sum(Acreage, na.rm = TRUE),
|
||||
.groups = 'drop'
|
||||
) %>%
|
||||
rename(Category = Phase)
|
||||
|
||||
farm_summary$phase_distribution <- phase_dist
|
||||
|
||||
# 2. STATUS TRIGGER DISTRIBUTION
|
||||
status_dist <- field_data %>%
|
||||
group_by(Status_trigger) %>%
|
||||
summarise(
|
||||
num_fields = n(),
|
||||
acreage = sum(Acreage, na.rm = TRUE),
|
||||
.groups = 'drop'
|
||||
) %>%
|
||||
rename(Category = Status_trigger)
|
||||
|
||||
farm_summary$status_distribution <- status_dist
|
||||
|
||||
# 3. CLOUD COVERAGE DISTRIBUTION
|
||||
cloud_dist <- field_data %>%
|
||||
group_by(Cloud_category) %>%
|
||||
summarise(
|
||||
num_fields = n(),
|
||||
acreage = sum(Acreage, na.rm = TRUE),
|
||||
.groups = 'drop'
|
||||
) %>%
|
||||
rename(Category = Cloud_category)
|
||||
|
||||
farm_summary$cloud_distribution <- cloud_dist
|
||||
|
||||
# 4. OVERALL STATISTICS
|
||||
farm_summary$overall_stats <- data.frame(
|
||||
total_fields = nrow(field_data),
|
||||
total_acreage = sum(field_data$Acreage, na.rm = TRUE),
|
||||
mean_ci = round(mean(field_data$Mean_CI, na.rm = TRUE), 2),
|
||||
median_ci = round(median(field_data$Mean_CI, na.rm = TRUE), 2),
|
||||
mean_cv = round(mean(field_data$CI_CV, na.rm = TRUE), 4),
|
||||
week = current_week,
|
||||
year = year,
|
||||
date = as.character(end_date)
|
||||
)
|
||||
|
||||
# Print summaries
|
||||
cat("\n--- PHASE DISTRIBUTION ---\n")
|
||||
print(phase_dist)
|
||||
|
||||
cat("\n--- STATUS TRIGGER DISTRIBUTION ---\n")
|
||||
print(status_dist)
|
||||
|
||||
cat("\n--- CLOUD COVERAGE DISTRIBUTION ---\n")
|
||||
print(cloud_dist)
|
||||
|
||||
cat("\n--- OVERALL FARM STATISTICS ---\n")
|
||||
print(farm_summary$overall_stats)
|
||||
|
||||
farm_kpi_results <- farm_summary
|
||||
} else {
|
||||
farm_kpi_results <- NULL
|
||||
}
|
||||
} else {
|
||||
farm_kpi_results <- NULL
|
||||
}
|
||||
|
||||
# ========== FINAL SUMMARY ==========
|
||||
|
||||
|
|
@ -1063,7 +1580,7 @@ main <- function() {
|
|||
if (!is.null(farm_kpi_results)) {
|
||||
cat("\nFarm-level KPIs: CALCULATED\n")
|
||||
} else {
|
||||
cat("\nFarm-level KPIs: SKIPPED (kpi_utils.R not available)\n")
|
||||
cat("\nFarm-level KPIs: SKIPPED (no valid tile data extracted)\n")
|
||||
}
|
||||
|
||||
cat("\n✓ Consolidated KPI calculation complete!\n")
|
||||
|
|
|
|||
|
|
@ -50,7 +50,7 @@ detect_mosaic_mode <- function(merged_final_tif_dir, daily_tiles_split_dir = NUL
|
|||
}
|
||||
|
||||
# PRIORITY 2: File-based detection (fallback if metadata not found)
|
||||
# Check if merged_final_tif/ contains tile-named files
|
||||
# Check if merged_final_tif/ contains tile-named files OR grid-size subdirectories
|
||||
|
||||
if (!dir.exists(merged_final_tif_dir)) {
|
||||
return(list(
|
||||
|
|
@ -61,6 +61,30 @@ detect_mosaic_mode <- function(merged_final_tif_dir, daily_tiles_split_dir = NUL
|
|||
))
|
||||
}
|
||||
|
||||
# First check if there are grid-size subdirectories (5x5, 10x10, etc.)
|
||||
# This indicates the tiles are organized: merged_final_tif/{grid_size}/{DATE}/{DATE}_XX.tif
|
||||
grid_subfolders <- list.dirs(merged_final_tif_dir, full.names = FALSE, recursive = FALSE)
|
||||
grid_patterns <- grep("^\\d+x\\d+$", grid_subfolders, value = TRUE)
|
||||
|
||||
if (length(grid_patterns) > 0) {
|
||||
# Found grid-size subdirectories - tiles exist!
|
||||
grid_size <- grid_patterns[1]
|
||||
grid_dir <- file.path(merged_final_tif_dir, grid_size)
|
||||
|
||||
# List sample tile files from the grid directory
|
||||
sample_tiles <- list.files(grid_dir, pattern = "\\.tif$", recursive = TRUE)[1:3]
|
||||
|
||||
return(list(
|
||||
has_tiles = TRUE,
|
||||
detected_tiles = sample_tiles,
|
||||
total_files = length(sample_tiles),
|
||||
source = "grid_subdirectory_detection",
|
||||
grid_size = grid_size,
|
||||
grid_path = grid_dir
|
||||
))
|
||||
}
|
||||
|
||||
# Fall back to checking for tile-named files directly in merged_final_tif
|
||||
# List all .tif files in merged_final_tif
|
||||
tif_files <- list.files(merged_final_tif_dir, pattern = "\\.tif$", full.names = FALSE)
|
||||
|
||||
|
|
|
|||
Loading…
Reference in a new issue