Fix: Revert run_phase1_growing_window to original GitHub version (off-by-one fix) and move test scripts to harvest_detection_experiments/tests
This commit is contained in:
parent
458b8247be
commit
7b347ddba6
|
|
@ -107,9 +107,9 @@ def main():
|
|||
|
||||
# [3/4] Run model predictions with two-step detection
|
||||
print("\n[3/4] Running two-step harvest detection...")
|
||||
print(" (Using threshold=0.45, consecutive_days=2 - tuned for Model 307 output)")
|
||||
print(" (Using threshold=0.3, consecutive_days=2 - tuned for Model 307 output)")
|
||||
refined_results = run_two_step_refinement(ci_data, model, config, scalers, device=device,
|
||||
phase1_threshold=0.45, phase1_consecutive=2)
|
||||
phase1_threshold=0.3, phase1_consecutive=2)
|
||||
|
||||
# Build and export
|
||||
print("\nBuilding production harvest table...")
|
||||
|
|
|
|||
|
|
@ -307,7 +307,6 @@ def run_phase1_growing_window(field_data, model, config, scalers, ci_column, dev
|
|||
with torch.no_grad():
|
||||
x_tensor = torch.tensor(features, dtype=torch.float32).unsqueeze(0).to(device)
|
||||
imminent_probs, detected_probs = model(x_tensor)
|
||||
detected_probs = detected_probs.squeeze(0).cpu().numpy()
|
||||
|
||||
# Check LAST timestep
|
||||
last_prob = detected_probs[-1]
|
||||
|
|
|
|||
|
|
@ -0,0 +1,175 @@
|
|||
"""
|
||||
BATCH TEST: Sample multiple fields to understand model output distribution
|
||||
Purpose: Determine optimal threshold and consecutive_days parameters
|
||||
|
||||
This runs the model on 10 random fields and summarizes the results,
|
||||
helping us decide what parameters to use in the production script.
|
||||
"""
|
||||
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
import torch
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from harvest_date_pred_utils import (
|
||||
load_model_and_config,
|
||||
extract_features,
|
||||
)
|
||||
|
||||
|
||||
def test_field(ci_data, field_id, model, config, scalers, device):
    """Run the model on a single field and return summary statistics.

    Returns a dict of probability statistics plus, for each candidate
    threshold, the longest run of consecutive days above it; returns None
    when the field has no rows or any pipeline stage fails.
    """

    field_data = ci_data[ci_data['field'] == field_id].sort_values('Date').reset_index(drop=True)
    if len(field_data) == 0:
        return None

    try:
        ci_column = config['data']['ci_column']
        features = extract_features(field_data, config['features'], ci_column=ci_column)

        # Apply scalers (same per-feature scaling as the production pipeline)
        for fi, scaler in enumerate(scalers):
            try:
                features[:, fi] = scaler.transform(features[:, fi].reshape(-1, 1)).flatten()
            except Exception:
                # Narrowed from a bare `except:`. Best-effort: a feature whose
                # scaler fails is left unscaled, matching production behavior.
                pass

        # Run model
        features_tensor = torch.FloatTensor(features).unsqueeze(0).to(device)
        with torch.no_grad():
            output = model(features_tensor)

        # Two-headed model returns (imminent, detected); use the DETECTED head.
        if isinstance(output, tuple):
            detected_probs = output[1].cpu().numpy().flatten()
        else:
            detected_probs = output.cpu().numpy().flatten()

        # Analyze overall distribution
        max_prob = detected_probs.max()
        mean_prob = detected_probs.mean()
        median_prob = np.median(detected_probs)

        # Longest run of consecutive days above each candidate threshold.
        consecutive_above = {}
        for thresh in [0.2, 0.3, 0.4, 0.5]:
            above = (detected_probs > thresh).astype(int)
            # Zero-padding both ends so runs touching either boundary count.
            changes = np.diff(np.concatenate(([0], above, [0])))
            starts = np.where(changes == 1)[0]
            ends = np.where(changes == -1)[0]
            runs = ends - starts if len(starts) > 0 else []
            consecutive_above[thresh] = np.max(runs) if len(runs) > 0 else 0

        return {
            'field': field_id,
            'max_prob': max_prob,
            'mean_prob': mean_prob,
            'median_prob': median_prob,
            'consecutive_0.2': consecutive_above[0.2],
            'consecutive_0.3': consecutive_above[0.3],
            'consecutive_0.4': consecutive_above[0.4],
            'consecutive_0.5': consecutive_above[0.5],
            'num_days': len(field_data),
        }
    except Exception:
        # Dropped the unused `as e` binding; any failure for one field is
        # treated as "no result" so the batch can continue.
        return None
|
||||
|
||||
|
||||
def main():
    """Sample random fields, run the model on each, and print threshold stats.

    CLI: python <script> [project_name] [num_samples]
    """
    project_name = sys.argv[1] if len(sys.argv) > 1 else "angata"
    num_samples = int(sys.argv[2]) if len(sys.argv) > 2 else 10

    # Load data
    base_storage = Path("../laravel_app/storage/app") / project_name / "Data"
    ci_data_dir = base_storage / "extracted_ci" / "ci_data_for_python"
    CI_DATA_FILE = ci_data_dir / "ci_data_for_python.csv"

    if not CI_DATA_FILE.exists():
        print(f"ERROR: {CI_DATA_FILE} not found")
        return

    print(f"Loading CI data from {CI_DATA_FILE}...")
    # field is read as str so IDs keep leading zeros and match lookups.
    ci_data = pd.read_csv(CI_DATA_FILE, dtype={'field': str})
    ci_data['Date'] = pd.to_datetime(ci_data['Date'])

    # Get random sample of fields (seeded for reproducibility across runs)
    all_fields = sorted(ci_data['field'].unique())
    np.random.seed(42)
    sample_fields = np.random.choice(all_fields, size=min(num_samples, len(all_fields)), replace=False)
    print(f"Testing {len(sample_fields)} random fields...")

    # Load model (load_model_and_config is imported at module level; the
    # redundant in-function re-import was removed)
    print(f"Loading model...")
    model, config, scalers = load_model_and_config(Path("."))
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Test each field
    results = []
    for idx, field_id in enumerate(sample_fields, 1):
        result = test_field(ci_data, field_id, model, config, scalers, device)
        if result:
            results.append(result)
            print(f" [{idx:2d}/{len(sample_fields)}] Field {field_id:>6s}: "
                  f"max={result['max_prob']:.3f} mean={result['mean_prob']:.4f} "
                  f"consec@0.3={result['consecutive_0.3']:2d} consec@0.2={result['consecutive_0.2']:2d}")

    # Guard: with zero successful fields, the summary below would raise on
    # an empty DataFrame (columns like 'max_prob' would not exist).
    if not results:
        print("\nNo fields produced results - nothing to summarize.")
        return

    # Summary statistics
    results_df = pd.DataFrame(results)

    print(f"\n{'='*80}")
    print(f"SUMMARY STATISTICS ({len(results)} fields tested):")
    print(f"{'='*80}")
    print(f"\nMax probability (per field):")
    print(f" Mean: {results_df['max_prob'].mean():.4f}")
    print(f" Median: {results_df['max_prob'].median():.4f}")
    print(f" Min: {results_df['max_prob'].min():.4f}")
    print(f" Max: {results_df['max_prob'].max():.4f}")

    print(f"\nMean probability (per field):")
    print(f" Mean: {results_df['mean_prob'].mean():.4f}")
    print(f" Median: {results_df['mean_prob'].median():.4f}")

    print(f"\nConsecutive days above threshold=0.3:")
    print(f" Mean: {results_df['consecutive_0.3'].mean():.1f}")
    print(f" Median: {results_df['consecutive_0.3'].median():.1f}")
    print(f" Min: {results_df['consecutive_0.3'].min():.0f}")
    print(f" Max: {results_df['consecutive_0.3'].max():.0f}")

    print(f"\nConsecutive days above threshold=0.2:")
    print(f" Mean: {results_df['consecutive_0.2'].mean():.1f}")
    print(f" Median: {results_df['consecutive_0.2'].median():.1f}")
    print(f" Min: {results_df['consecutive_0.2'].min():.0f}")
    print(f" Max: {results_df['consecutive_0.2'].max():.0f}")

    print(f"\n{'='*80}")
    print(f"RECOMMENDATION:")
    print(f"{'='*80}")

    # Calculate recommendations based on data: prefer the strictest
    # (threshold, consecutive_days) pair the observed runs can satisfy.
    median_consec_0_3 = results_df['consecutive_0.3'].median()
    median_consec_0_2 = results_df['consecutive_0.2'].median()

    if median_consec_0_3 >= 3:
        print(f"✓ Threshold=0.3 with consecutive_days=3 should work")
        print(f" (median consecutive days: {median_consec_0_3:.0f})")
    elif median_consec_0_3 >= 2:
        print(f"✓ Threshold=0.3 with consecutive_days=2 recommended")
        print(f" (median consecutive days: {median_consec_0_3:.0f})")
    elif median_consec_0_2 >= 3:
        print(f"✓ Threshold=0.2 with consecutive_days=3 recommended")
        print(f" (median consecutive days: {median_consec_0_2:.0f})")
    else:
        print(f"✓ Threshold=0.2 with consecutive_days=2 recommended")
        print(f" (median consecutive days @ 0.2: {median_consec_0_2:.0f})")

    print(f"\nCurrent production settings: threshold=0.45, consecutive_days=2")
    print(f" → These are likely TOO STRICT (only 289 fields detected in batch run)")
    print(f"\nSuggested adjustment:")
    print(f" → Lower threshold to 0.2-0.3")
    print(f" → Reduce consecutive_days to 1-2")
    print(f" → Re-run batch to get ~1000+ fields detected")


if __name__ == "__main__":
    main()
|
||||
|
|
@ -0,0 +1,272 @@
|
|||
"""
|
||||
COMPARISON TEST: Growing Window vs Single Run approach
|
||||
Purpose: Compare harvest dates detected by two different methods
|
||||
|
||||
Method 1 (Growing Window - Current):
|
||||
- Day 1: Run model on [day1]
|
||||
- Day 2: Run model on [day1:2]
|
||||
- Day 3: Run model on [day1:3]
|
||||
- ... up to day 477
|
||||
- This matches real-time production where data arrives daily
|
||||
- Takes ~477 model runs per field (SLOW)
|
||||
|
||||
Method 2 (Single Run - Proposed):
|
||||
- Run model ONCE on full [day1:477] sequence
|
||||
- Use these probabilities to scan for harvests
|
||||
- This is 477x faster but assumes different LSTM context
|
||||
- May produce different harvest dates
|
||||
|
||||
Question: Do these methods detect similar harvest dates or different ones?
|
||||
"""
|
||||
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
import torch
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from harvest_date_pred_utils import (
|
||||
load_model_and_config,
|
||||
extract_features,
|
||||
)
|
||||
import time
|
||||
|
||||
|
||||
def method_growing_window(field_data, model, config, scalers, ci_column, device, threshold=0.3, consecutive_days=2):
    """Original method: expanding window, run model multiple times.

    Replays the real-time pipeline: the model is re-run on an expanding
    window [current_pos:window_end] and only the LAST timestep's detected
    probability is used, so each day is scored with exactly the context it
    would have had live. Returns a list of (Date, index, probability).
    """
    harvest_dates = []
    current_pos = 0  # absolute index of the start of the current season window

    while current_pos < len(field_data):
        consecutive_above_threshold = 0

        # window_end is an EXCLUSIVE absolute end index (ranges from
        # current_pos+1 up to len(field_data)), so the last row scored is
        # field_data.iloc[window_end - 1].
        for window_end in range(current_pos + 1, len(field_data) + 1):
            window_data = field_data.iloc[current_pos:window_end].copy().reset_index(drop=True)

            try:
                features = extract_features(window_data, config['features'], ci_column=ci_column)

                # Apply scalers; a failing scaler leaves that feature unscaled
                for fi, scaler in enumerate(scalers):
                    try:
                        features[:, fi] = scaler.transform(features[:, fi].reshape(-1, 1)).flatten()
                    except Exception:
                        pass

                # Run model on the whole window; batch dim added/removed
                with torch.no_grad():
                    x_tensor = torch.tensor(features, dtype=torch.float32).unsqueeze(0).to(device)
                    output = model(x_tensor)

                # Two-headed model returns (imminent, detected); keep detected
                if isinstance(output, tuple):
                    _, detected_probs = output
                    detected_probs = detected_probs.squeeze(0).cpu().numpy()
                else:
                    detected_probs = output.squeeze(0).cpu().numpy()

                # Check LAST timestep only (newest day in the window)
                last_prob = detected_probs[-1]

                if last_prob > threshold:
                    consecutive_above_threshold += 1
                else:
                    consecutive_above_threshold = 0

                # Harvest detected
                if consecutive_above_threshold >= consecutive_days:
                    # NOTE(review): window_end is already an absolute index
                    # (the range starts at current_pos + 1), so adding
                    # current_pos again looks like a double offset once
                    # current_pos > 0. This mirrors what it compares against;
                    # confirm against production run_phase1_growing_window
                    # before changing.
                    harvest_idx = current_pos + window_end - consecutive_days - 1
                    harvest_date = field_data.iloc[harvest_idx]['Date']
                    harvest_dates.append((harvest_date, harvest_idx, last_prob))

                    # Reset to next day after harvest (same double-offset
                    # caveat as harvest_idx above)
                    current_pos = current_pos + window_end - consecutive_days
                    break

            except Exception:
                # Any failure for this window size is skipped; the window
                # keeps growing
                continue
        else:
            # for-loop exhausted without detecting a harvest: no more
            # harvests in the remaining data, stop scanning
            break

    return harvest_dates
|
||||
|
||||
|
||||
def method_single_run(field_data, model, config, scalers, ci_column, device, threshold=0.3, consecutive_days=2):
    """Proposed method: run model once on full sequence.

    The model sees the entire history in a single forward pass (one model
    run instead of one per day), then the resulting per-day probabilities
    are scanned for runs of `consecutive_days` above `threshold`.
    Returns a list of (Date, index, probability).
    """
    harvest_dates = []
    current_pos = 0  # index where the current scan resumes

    try:
        # Extract features ONCE for full dataset
        features = extract_features(field_data, config['features'], ci_column=ci_column)

        # Apply scalers; a failing scaler leaves that feature unscaled
        for fi, scaler in enumerate(scalers):
            try:
                features[:, fi] = scaler.transform(features[:, fi].reshape(-1, 1)).flatten()
            except Exception:
                pass

        # Run model ONCE to get all probabilities
        with torch.no_grad():
            x_tensor = torch.tensor(features, dtype=torch.float32).unsqueeze(0).to(device)
            output = model(x_tensor)

        # Two-headed model returns (imminent, detected); keep detected
        if isinstance(output, tuple):
            _, detected_probs = output
            detected_probs = detected_probs.squeeze(0).cpu().numpy()
        else:
            detected_probs = output.squeeze(0).cpu().numpy()

        # Scan forward looking for harvests
        while current_pos < len(field_data):
            consecutive_above_threshold = 0

            for pos in range(current_pos, len(field_data)):
                prob = detected_probs[pos]

                if prob > threshold:
                    consecutive_above_threshold += 1
                else:
                    consecutive_above_threshold = 0

                # Harvest detected: report the FIRST day of the run
                if consecutive_above_threshold >= consecutive_days:
                    harvest_idx = pos - consecutive_days + 1
                    harvest_date = field_data.iloc[harvest_idx]['Date']
                    harvest_dates.append((harvest_date, harvest_idx, prob))

                    # Move anchor point past this harvest.
                    # NOTE(review): this restarts one day INTO the detected
                    # run; a single long above-threshold run can therefore be
                    # reported as several back-to-back harvests. Confirm this
                    # matches the growing-window semantics being compared.
                    current_pos = harvest_idx + 1
                    break
            else:
                # No harvest found in remaining data
                break

    except Exception as e:
        # Best-effort: on any failure return whatever was found so far
        pass

    return harvest_dates
|
||||
|
||||
|
||||
def compare_field(field_id, ci_data, model, config, scalers, device, threshold=0.3):
    """Run both detection methods on one field and report timings.

    Returns a dict with each method's harvests, wall-clock times and the
    speedup ratio, or None when the field has fewer than 10 days of data.
    """
    field_data = (
        ci_data[ci_data['field'] == field_id]
        .sort_values('Date')
        .reset_index(drop=True)
    )

    if len(field_data) < 10:
        return None

    ci_column = config['data']['ci_column']

    def run_timed(detector):
        # Time one detector with identical arguments for a fair comparison.
        began = time.time()
        found = detector(field_data, model, config, scalers, ci_column, device, threshold)
        return found, time.time() - began

    # Method 1: Growing window (SLOW)
    print(f"\n Growing Window method...", end=" ", flush=True)
    growing_harvests, time_growing = run_timed(method_growing_window)
    print(f"({time_growing:.2f}s, {len(field_data)} model runs)")

    # Method 2: Single run (FAST)
    print(f" Single Run method...", end=" ", flush=True)
    single_harvests, time_single = run_timed(method_single_run)
    print(f"({time_single:.2f}s, 1 model run)")

    speedup = time_growing / time_single if time_single > 0 else 0
    return {
        'field': field_id,
        'num_days': len(field_data),
        'growing_harvests': growing_harvests,
        'single_harvests': single_harvests,
        'time_growing': time_growing,
        'time_single': time_single,
        'speedup': speedup,
    }
|
||||
|
||||
|
||||
def main():
    """Compare growing-window vs single-run detection on sampled fields.

    CLI: python <script> [project_name] [num_samples] [threshold]
    """
    project_name = sys.argv[1] if len(sys.argv) > 1 else "angata"
    num_samples = int(sys.argv[2]) if len(sys.argv) > 2 else 3
    threshold = float(sys.argv[3]) if len(sys.argv) > 3 else 0.3

    # Load data
    base_storage = Path("../laravel_app/storage/app") / project_name / "Data"
    ci_data_dir = base_storage / "extracted_ci" / "ci_data_for_python"
    CI_DATA_FILE = ci_data_dir / "ci_data_for_python.csv"

    if not CI_DATA_FILE.exists():
        print(f"ERROR: {CI_DATA_FILE} not found")
        return

    print(f"Loading CI data...")
    # field is read as str so IDs keep leading zeros and match lookups.
    ci_data = pd.read_csv(CI_DATA_FILE, dtype={'field': str})
    ci_data['Date'] = pd.to_datetime(ci_data['Date'])

    # Get sample of fields (seeded for reproducibility)
    all_fields = sorted(ci_data['field'].unique())
    np.random.seed(42)
    sample_fields = np.random.choice(all_fields, size=min(num_samples, len(all_fields)), replace=False)

    # Load model
    print(f"Loading model...")
    model, config, scalers = load_model_and_config(Path("."))
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    print(f"\n{'='*80}")
    print(f"COMPARING METHODS (threshold={threshold}, consecutive_days=2)")
    print(f"{'='*80}")

    all_results = []
    for idx, field_id in enumerate(sample_fields, 1):
        print(f"\n[{idx}/{len(sample_fields)}] Field {field_id}")
        result = compare_field(field_id, ci_data, model, config, scalers, device, threshold)

        if result:
            all_results.append(result)

            # The harvest tuples are (date, index, prob); the index is unused
            # here and is bound to `_` so it no longer shadows the outer
            # enumerate counter `idx` (previous code reused the name).
            print(f"\n Growing Window found {len(result['growing_harvests'])} harvests:")
            for date, _, prob in result['growing_harvests']:
                print(f" - {date.date()}: prob={prob:.4f}")

            print(f"\n Single Run found {len(result['single_harvests'])} harvests:")
            for date, _, prob in result['single_harvests']:
                print(f" - {date.date()}: prob={prob:.4f}")

            print(f"\n ⏱ Speed comparison:")
            print(f" Growing Window: {result['time_growing']:.2f}s ({result['num_days']} days processed)")
            print(f" Single Run: {result['time_single']:.2f}s (1 run)")
            print(f" Speedup: {result['speedup']:.0f}x faster")

            # Compare harvests
            growing_dates = [d for d, _, _ in result['growing_harvests']]
            single_dates = [d for d, _, _ in result['single_harvests']]

            if growing_dates == single_dates:
                print(f"\n ✓ IDENTICAL: Both methods found the same harvest dates")
            else:
                print(f"\n ✗ DIFFERENT: Methods found different harvest dates")
                print(f" Growing only: {[d for d in growing_dates if d not in single_dates]}")
                print(f" Single only: {[d for d in single_dates if d not in growing_dates]}")

    # Summary
    print(f"\n{'='*80}")
    print(f"SUMMARY ({len(all_results)} fields tested)")
    print(f"{'='*80}")

    identical = sum(1 for r in all_results if [d for d, _, _ in r['growing_harvests']] == [d for d, _, _ in r['single_harvests']])
    different = len(all_results) - identical

    print(f"\nIdentical results: {identical}/{len(all_results)}")
    print(f"Different results: {different}/{len(all_results)}")

    if all_results:
        avg_speedup = np.mean([r['speedup'] for r in all_results])
        print(f"\nAverage speedup: {avg_speedup:.0f}x")
        print(f"\nConclusion:")
        if identical == len(all_results):
            print(f" ✓ Methods are EQUIVALENT - Single run approach is safe to use")
            print(f" Recommend switching to single run for {avg_speedup:.0f}x faster execution")
        else:
            print(f" ✗ Methods produce DIFFERENT results - Need to understand why")
            print(f" Growing window is slower but may be more accurate for live deployment")


if __name__ == "__main__":
    main()
|
||||
|
|
@ -0,0 +1,276 @@
|
|||
"""
|
||||
TEST SCRIPT: Inspect raw model output distributions for a single field
|
||||
Purpose: Diagnose why thresholding is failing and harvest dates are wrong
|
||||
|
||||
This shows:
|
||||
1. Distribution of harvest probability scores (0-1 range)
|
||||
2. How often consecutive_days=3 is actually achievable
|
||||
3. Actual season boundaries detected
|
||||
4. Recommendations for threshold adjustment
|
||||
|
||||
Usage:
|
||||
python test_model_output_distribution.py angata [field_id]
|
||||
|
||||
Examples:
|
||||
python test_model_output_distribution.py angata 1
|
||||
python test_model_output_distribution.py angata 10042
|
||||
"""
|
||||
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
import torch
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from harvest_date_pred_utils import (
|
||||
load_model_and_config,
|
||||
extract_features,
|
||||
)
|
||||
|
||||
|
||||
def analyze_single_field(ci_data, field_id, model, config, scalers, device):
    """Analyze raw model outputs for a single field.

    Prints a diagnostic report: probability statistics, threshold/run
    analysis, per-day probabilities with CI values, candidate season
    boundaries, and which (threshold, consecutive) pairs would fire.
    Purely diagnostic - returns None, all output goes to stdout.
    """

    # Filter to field
    field_data = ci_data[ci_data['field'] == field_id].sort_values('Date').reset_index(drop=True)

    if len(field_data) == 0:
        print(f"ERROR: No data for field {field_id}")
        return

    print(f"\n{'='*80}")
    print(f"FIELD: {field_id}")
    print(f"{'='*80}")
    print(f"Date range: {field_data['Date'].min()} to {field_data['Date'].max()}")
    print(f"Total days: {len(field_data)}")

    # Extract features (same as main pipeline)
    try:
        ci_column = config['data']['ci_column']
        features = extract_features(field_data, config['features'], ci_column=ci_column)
        if features is None or len(features) == 0:
            print(f"ERROR: extract_features returned empty for field {field_id}")
            return
    except Exception as e:
        print(f"ERROR extracting features: {e}")
        import traceback
        traceback.print_exc()
        return

    # Apply scalers (CRITICAL - same as production code)
    try:
        for fi, scaler in enumerate(scalers):
            try:
                features[:, fi] = scaler.transform(features[:, fi].reshape(-1, 1)).flatten()
            except Exception as e:
                # A failing scaler leaves that feature unscaled (warned, not fatal)
                print(f" Warning: Scaler {fi} failed: {e}")
                pass
    except Exception as e:
        print(f"ERROR applying scalers: {e}")
        import traceback
        traceback.print_exc()
        return

    # Run model (single forward pass over the full sequence)
    features_tensor = torch.FloatTensor(features).unsqueeze(0).to(device)
    with torch.no_grad():
        output = model(features_tensor)

    # Convert to numpy (handle different output formats)
    # Model has TWO heads: imminent_probs, detected_probs
    if isinstance(output, tuple):
        imminent_probs = output[0].cpu().numpy().flatten()
        detected_probs = output[1].cpu().numpy().flatten()
        probs = detected_probs  # Use DETECTED head (current harvest), not imminent (future)
        print(f"Model outputs TWO heads: imminent + detected")
        print(f" Using DETECTED head for harvest detection (sparse spikes at harvest)")
    else:
        # Fallback for single output
        probs = output.cpu().numpy().flatten()
        print(f"Model output: single head")

    print(f"\nModel output shape: {probs.shape}")
    print(f"Output range: {probs.min():.4f} to {probs.max():.4f}")

    # Statistics
    print(f"\nHarvest probability statistics:")
    print(f" Mean: {probs.mean():.4f}")
    print(f" Median: {np.median(probs):.4f}")
    print(f" Std Dev: {probs.std():.4f}")
    print(f" Min: {probs.min():.4f}")
    print(f" Max: {probs.max():.4f}")

    # Distribution by threshold: how many days clear each candidate value
    print(f"\nDistribution by threshold:")
    thresholds = [0.2, 0.3, 0.4, 0.45, 0.5, 0.6, 0.7]
    for thresh in thresholds:
        count = np.sum(probs > thresh)
        pct = 100 * count / len(probs)
        print(f" > {thresh}: {count:4d} days ({pct:5.1f}%)")

    # Consecutive days analysis (key metric)
    print(f"\nConsecutive days above threshold (Phase 1 requirement):")
    for thresh in [0.3, 0.4, 0.45, 0.5]:
        above = (probs > thresh).astype(int)
        # Find consecutive runs via the zero-padded diff trick:
        # +1 marks a run start, -1 marks one-past-the-end
        changes = np.diff(np.concatenate(([0], above, [0])))
        starts = np.where(changes == 1)[0]
        ends = np.where(changes == -1)[0]
        runs = ends - starts

        if len(runs) > 0:
            max_run = np.max(runs)
            print(f" Threshold {thresh}: max consecutive = {max_run} days (need 3 for Phase 1)")
            if len(runs) > 5:
                print(f" {len(runs)} separate runs detected (seasons?)")
        else:
            print(f" Threshold {thresh}: no runs detected")

    # Show top predictions (10 highest-probability days, descending)
    print(f"\nTop 10 harvest probability peaks:")
    top_indices = np.argsort(probs)[-10:][::-1]
    for rank, idx in enumerate(top_indices, 1):
        date = field_data.iloc[idx]['Date']
        prob = probs[idx]
        print(f" {rank:2d}. Day {idx:4d} ({date}): {prob:.4f}")

    # Show timeline
    # NOTE(review): the stride is len(probs)//20 (~20 sample points), not a
    # fixed 10 days as the header text says - confirm which is intended.
    print(f"\nTimeline of probabilities (sampling every 10 days):")
    for idx in range(0, len(probs), max(1, len(probs) // 20)):
        date_str = field_data.iloc[idx]['Date'].strftime("%Y-%m-%d")
        ci_value = field_data.iloc[idx]['FitData']
        prob = probs[idx]
        bar = '-' * int(prob * 35)
        try:
            print(f" {date_str} CI={ci_value:.4f} Prob={prob:.4f} {bar}")
        except UnicodeEncodeError:
            print(f" {date_str} CI={ci_value:.4f} Prob={prob:.4f}")

    # FULL DAILY PROBABILITIES WITH CI VALUES (every day, no sampling)
    print(f"\n{'='*80}")
    print(f"FULL DAILY PROBABILITIES WITH CI VALUES ({len(probs)} days):")
    print(f"{'='*80}")
    print(f"{'Day':>4} {'Date':<12} {'CI':>8} {'Probability':>12} {'Visual':<40}")
    print(f"{'-'*100}")

    for idx in range(len(probs)):
        date_str = field_data.iloc[idx]['Date'].strftime("%Y-%m-%d")
        ci_value = field_data.iloc[idx]['FitData']
        prob = probs[idx]
        bar = '-' * int(prob * 35)  # Use dashes instead of█ for Unicode safety
        try:
            print(f"{idx:4d} {date_str} {ci_value:8.4f} {prob:12.4f} {bar}")
        except UnicodeEncodeError:
            # Fallback for Windows encoding issues
            print(f"{idx:4d} {date_str} {ci_value:8.4f} {prob:12.4f}")

    print(f"{'-'*100}")

    # Find valleys (days with low probabilities that could indicate season boundaries)
    print(f"\nDays with LOWEST probabilities (potential season boundaries):")
    valleys_threshold = 0.5  # Days below this might be season breaks
    valley_indices = np.where(probs < valleys_threshold)[0]

    if len(valley_indices) > 0:
        print(f" Found {len(valley_indices)} days below {valleys_threshold}")
        # Get valleys sorted by probability
        valley_data = [(idx, probs[idx], field_data.iloc[idx]['Date']) for idx in valley_indices]
        valley_data.sort(key=lambda x: x[1])  # Sort by probability (lowest first)

        print(f"\n Bottom 20 lowest-probability days:")
        for rank, (idx, prob, date) in enumerate(valley_data[:20], 1):
            print(f" {rank:2d}. Day {idx:3d} ({date}): {prob:.4f}")
    else:
        print(f" None - all days above {valleys_threshold}")

    # Identify likely harvest dates by finding local minima (valleys between growing periods)
    print(f"\nLikely season boundaries (local minima approach):")
    # Find indices where probability suddenly drops (derivative)
    if len(probs) > 7:
        # 7-day centered rolling mean smooths noise before differencing
        smoothed = pd.Series(probs).rolling(window=7, center=True).mean()
        derivatives = smoothed.diff().fillna(0)

        # Find big drops (where derivative is very negative)
        drops = np.where(derivatives < -0.2)[0]  # Significant downward moves

        if len(drops) > 0:
            print(f" Found {len(drops)} significant drops (prob falling by 0.2+):")
            for idx in drops[:10]:  # Show first 10
                date = field_data.iloc[idx]['Date']
                # NOTE(review): before/after are raw probs, not the smoothed
                # values the drop was detected on - values may look inconsistent
                before = probs[max(0, idx-1)]
                after = probs[idx]
                print(f" Day {idx:3d} ({date}): {before:.4f} → {after:.4f}")
        else:
            print(f" No significant drops detected (probabilities don't dip much)")

    # Show which harvest dates would be detected at different thresholds
    print(f"\nHarvest detection (first day where prob > threshold for N consecutive days):")
    for thresh in [0.2, 0.3, 0.4, 0.5, 0.6]:
        for consec in [1, 2, 3]:
            above = (probs > thresh).astype(int)
            changes = np.diff(np.concatenate(([0], above, [0])))
            starts = np.where(changes == 1)[0]

            if len(starts) > 0:
                # For each harvest start, find where it would trigger with consecutive_days
                detected_harvests = []
                for start_idx in starts:
                    # Check if we have enough consecutive days
                    if start_idx + consec - 1 < len(probs):
                        if all(probs[start_idx:start_idx + consec] > thresh):
                            harvest_date = field_data.iloc[start_idx]['Date']
                            detected_harvests.append((start_idx, harvest_date))

                if detected_harvests:
                    first_idx, first_date = detected_harvests[0]
                    print(f" Threshold={thresh}, consecutive={consec}: {first_date} (day {first_idx})")
            else:
                print(f" Threshold={thresh}, consecutive={consec}: None detected")
|
||||
|
||||
|
||||
def main():
    """Load data and model, then print the diagnostic report for one field.

    CLI: python test_model_output_distribution.py [project_name] [field_id]
    Falls back to the first field when no field_id is given.
    """
    project_name = sys.argv[1] if len(sys.argv) > 1 else "angata"
    field_id = sys.argv[2] if len(sys.argv) > 2 else None

    # Paths
    base_storage = Path("../laravel_app/storage/app") / project_name / "Data"
    ci_data_dir = base_storage / "extracted_ci" / "ci_data_for_python"
    CI_DATA_FILE = ci_data_dir / "ci_data_for_python.csv"

    if not CI_DATA_FILE.exists():
        print(f"ERROR: {CI_DATA_FILE} not found")
        return

    print(f"Loading CI data from {CI_DATA_FILE}...")
    # field is read as str so IDs keep leading zeros and match lookups.
    ci_data = pd.read_csv(CI_DATA_FILE, dtype={'field': str})
    ci_data['Date'] = pd.to_datetime(ci_data['Date'])

    if field_id is None:
        # Show first 10 fields if none specified, and default to the first
        fields = sorted(ci_data['field'].unique())[:10]
        field_id = fields[0]
        print(f"No field specified. Testing first field: {field_id}")
        print(f"Available fields: {', '.join(fields)}")

    # Load model (load_model_and_config is imported at module level; the
    # redundant in-function re-import was removed)
    print(f"\nLoading model...")
    model, config, scalers = load_model_and_config(Path("."))
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Device: {device}")

    # Analyze (str() guards against a non-string CLI value)
    analyze_single_field(ci_data, str(field_id), model, config, scalers, device)

    print(f"\n{'='*80}")
    print("INTERPRETATION:")
    print(" If max consecutive < 3 for threshold=0.5:")
    print(" → Lower threshold to 0.3-0.4, or reduce consecutive_days to 1-2")
    print(" If multiple runs detected but harvest_date == first date:")
    print(" → Season detection is failing (check extract_features)")
    print(" If peaks are scattered randomly:")
    print(" → Model may need retraining or data validation")
    print("="*80)


if __name__ == "__main__":
    main()
|
||||
Loading…
Reference in a new issue