Fix: Revert run_phase1_growing_window to original GitHub version (off-by-one fix) and move test scripts to harvest_detection_experiments/tests

This commit is contained in:
Timon 2026-01-15 09:08:07 +01:00
parent 458b8247be
commit 7b347ddba6
5 changed files with 725 additions and 3 deletions

View file

@ -107,9 +107,9 @@ def main():
# [3/4] Run model predictions with two-step detection
print("\n[3/4] Running two-step harvest detection...")
print(" (Using threshold=0.45, consecutive_days=2 - tuned for Model 307 output)")
print(" (Using threshold=0.3, consecutive_days=2 - tuned for Model 307 output)")
refined_results = run_two_step_refinement(ci_data, model, config, scalers, device=device,
phase1_threshold=0.45, phase1_consecutive=2)
phase1_threshold=0.3, phase1_consecutive=2)
# Build and export
print("\nBuilding production harvest table...")

View file

@ -307,7 +307,6 @@ def run_phase1_growing_window(field_data, model, config, scalers, ci_column, dev
with torch.no_grad():
x_tensor = torch.tensor(features, dtype=torch.float32).unsqueeze(0).to(device)
imminent_probs, detected_probs = model(x_tensor)
detected_probs = detected_probs.squeeze(0).cpu().numpy()
# Check LAST timestep
last_prob = detected_probs[-1]

View file

@ -0,0 +1,175 @@
"""
BATCH TEST: Sample multiple fields to understand model output distribution
Purpose: Determine optimal threshold and consecutive_days parameters
This runs the model on 10 random fields and summarizes the results,
helping us decide what parameters to use in the production script.
"""
import pandas as pd
import numpy as np
import torch
import sys
from pathlib import Path
from harvest_date_pred_utils import (
load_model_and_config,
extract_features,
)
def test_field(ci_data, field_id, model, config, scalers, device):
    """Run the model on one field and return summary statistics.

    Args:
        ci_data: DataFrame with at least 'field' and 'Date' columns.
        field_id: Field identifier (string) selecting the rows to test.
        model: Loaded PyTorch model; may return a (imminent, detected) tuple.
        config: Config dict with 'data' and 'features' sections.
        scalers: Per-feature scaler objects (sklearn-style transform API).
        device: torch.device to run inference on.

    Returns:
        dict of summary statistics, or None when the field has no rows or
        processing fails (the failure is reported, not raised).
    """
    field_data = ci_data[ci_data['field'] == field_id].sort_values('Date').reset_index(drop=True)
    if len(field_data) == 0:
        return None
    try:
        ci_column = config['data']['ci_column']
        features = extract_features(field_data, config['features'], ci_column=ci_column)
        # Apply per-feature scalers; a failing scaler leaves that column unscaled.
        # (was a bare `except:` — narrowed to Exception so KeyboardInterrupt etc. propagate)
        for fi, scaler in enumerate(scalers):
            try:
                features[:, fi] = scaler.transform(features[:, fi].reshape(-1, 1)).flatten()
            except Exception:
                pass
        # Run model once on the full sequence.
        features_tensor = torch.FloatTensor(features).unsqueeze(0).to(device)
        with torch.no_grad():
            output = model(features_tensor)
        # Two-head models return (imminent, detected); use the detected head.
        if isinstance(output, tuple):
            detected_probs = output[1].cpu().numpy().flatten()
        else:
            detected_probs = output.cpu().numpy().flatten()
        # Summary statistics of the probability trace.
        max_prob = detected_probs.max()
        mean_prob = detected_probs.mean()
        median_prob = np.median(detected_probs)
        # Longest run of consecutive days above each candidate threshold.
        consecutive_above = {}
        for thresh in [0.2, 0.3, 0.4, 0.5]:
            above = (detected_probs > thresh).astype(int)
            # Run-length trick: diff of a 0-padded indicator marks run starts (+1) and ends (-1).
            changes = np.diff(np.concatenate(([0], above, [0])))
            starts = np.where(changes == 1)[0]
            ends = np.where(changes == -1)[0]
            runs = ends - starts if len(starts) > 0 else []
            consecutive_above[thresh] = np.max(runs) if len(runs) > 0 else 0
        return {
            'field': field_id,
            'max_prob': max_prob,
            'mean_prob': mean_prob,
            'median_prob': median_prob,
            'consecutive_0.2': consecutive_above[0.2],
            'consecutive_0.3': consecutive_above[0.3],
            'consecutive_0.4': consecutive_above[0.4],
            'consecutive_0.5': consecutive_above[0.5],
            'num_days': len(field_data),
        }
    except Exception as e:
        # Was `return None` with the error dropped — report it so skipped
        # fields are explainable in the batch output.
        print(f"  Warning: field {field_id} failed: {e}")
        return None
def main():
    """Sample random fields, run the harvest model on each, and print summary
    statistics plus a threshold/consecutive-days recommendation.

    CLI: python <script> [project_name] [num_samples]
    """
    project_name = sys.argv[1] if len(sys.argv) > 1 else "angata"
    num_samples = int(sys.argv[2]) if len(sys.argv) > 2 else 10
    # Locate the CI input CSV for the requested project.
    base_storage = Path("../laravel_app/storage/app") / project_name / "Data"
    ci_data_dir = base_storage / "extracted_ci" / "ci_data_for_python"
    CI_DATA_FILE = ci_data_dir / "ci_data_for_python.csv"
    if not CI_DATA_FILE.exists():
        print(f"ERROR: {CI_DATA_FILE} not found")
        return
    print(f"Loading CI data from {CI_DATA_FILE}...")
    ci_data = pd.read_csv(CI_DATA_FILE, dtype={'field': str})
    ci_data['Date'] = pd.to_datetime(ci_data['Date'])
    # Fixed seed so repeated runs test the same random sample of fields.
    all_fields = sorted(ci_data['field'].unique())
    np.random.seed(42)
    sample_fields = np.random.choice(all_fields, size=min(num_samples, len(all_fields)), replace=False)
    print(f"Testing {len(sample_fields)} random fields...")
    # Load model (load_model_and_config is imported at module level; the
    # previous in-function re-import was redundant and has been removed).
    print(f"Loading model...")
    model, config, scalers = load_model_and_config(Path("."))
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # Test each field
    results = []
    for idx, field_id in enumerate(sample_fields, 1):
        result = test_field(ci_data, field_id, model, config, scalers, device)
        if result:
            results.append(result)
            print(f" [{idx:2d}/{len(sample_fields)}] Field {field_id:>6s}: "
                  f"max={result['max_prob']:.3f} mean={result['mean_prob']:.4f} "
                  f"consec@0.3={result['consecutive_0.3']:2d} consec@0.2={result['consecutive_0.2']:2d}")
    # Guard: with no usable results the DataFrame summaries below would fail.
    if not results:
        print("\nNo fields produced usable results; nothing to summarize.")
        return
    # Summary statistics
    results_df = pd.DataFrame(results)
    print(f"\n{'='*80}")
    print(f"SUMMARY STATISTICS ({len(results)} fields tested):")
    print(f"{'='*80}")
    print(f"\nMax probability (per field):")
    print(f" Mean: {results_df['max_prob'].mean():.4f}")
    print(f" Median: {results_df['max_prob'].median():.4f}")
    print(f" Min: {results_df['max_prob'].min():.4f}")
    print(f" Max: {results_df['max_prob'].max():.4f}")
    print(f"\nMean probability (per field):")
    print(f" Mean: {results_df['mean_prob'].mean():.4f}")
    print(f" Median: {results_df['mean_prob'].median():.4f}")
    print(f"\nConsecutive days above threshold=0.3:")
    print(f" Mean: {results_df['consecutive_0.3'].mean():.1f}")
    print(f" Median: {results_df['consecutive_0.3'].median():.1f}")
    print(f" Min: {results_df['consecutive_0.3'].min():.0f}")
    print(f" Max: {results_df['consecutive_0.3'].max():.0f}")
    print(f"\nConsecutive days above threshold=0.2:")
    print(f" Mean: {results_df['consecutive_0.2'].mean():.1f}")
    print(f" Median: {results_df['consecutive_0.2'].median():.1f}")
    print(f" Min: {results_df['consecutive_0.2'].min():.0f}")
    print(f" Max: {results_df['consecutive_0.2'].max():.0f}")
    print(f"\n{'='*80}")
    print(f"RECOMMENDATION:")
    print(f"{'='*80}")
    # Pick the loosest setting whose median run length still satisfies it.
    median_consec_0_3 = results_df['consecutive_0.3'].median()
    median_consec_0_2 = results_df['consecutive_0.2'].median()
    if median_consec_0_3 >= 3:
        print(f"✓ Threshold=0.3 with consecutive_days=3 should work")
        print(f" (median consecutive days: {median_consec_0_3:.0f})")
    elif median_consec_0_3 >= 2:
        print(f"✓ Threshold=0.3 with consecutive_days=2 recommended")
        print(f" (median consecutive days: {median_consec_0_3:.0f})")
    elif median_consec_0_2 >= 3:
        print(f"✓ Threshold=0.2 with consecutive_days=3 recommended")
        print(f" (median consecutive days: {median_consec_0_2:.0f})")
    else:
        print(f"✓ Threshold=0.2 with consecutive_days=2 recommended")
        print(f" (median consecutive days @ 0.2: {median_consec_0_2:.0f})")
    print(f"\nCurrent production settings: threshold=0.45, consecutive_days=2")
    print(f" → These are likely TOO STRICT (only 289 fields detected in batch run)")
    print(f"\nSuggested adjustment:")
    print(f" → Lower threshold to 0.2-0.3")
    print(f" → Reduce consecutive_days to 1-2")
    print(f" → Re-run batch to get ~1000+ fields detected")
if __name__ == "__main__":
    main()

View file

@ -0,0 +1,272 @@
"""
COMPARISON TEST: Growing Window vs Single Run approach
Purpose: Compare harvest dates detected by two different methods
Method 1 (Growing Window - Current):
- Day 1: Run model on [day1]
- Day 2: Run model on [day1:2]
- Day 3: Run model on [day1:3]
- ... up to day 477
- This matches real-time production where data arrives daily
- Takes ~477 model runs per field (SLOW)
Method 2 (Single Run - Proposed):
- Run model ONCE on full [day1:477] sequence
- Use these probabilities to scan for harvests
- This is 477x faster but assumes different LSTM context
- May produce different harvest dates
Question: Do these methods detect similar harvest dates or different ones?
"""
import pandas as pd
import numpy as np
import torch
import sys
from pathlib import Path
from harvest_date_pred_utils import (
load_model_and_config,
extract_features,
)
import time
def method_growing_window(field_data, model, config, scalers, ci_column, device, threshold=0.3, consecutive_days=2):
    """Original method: expanding window, run model multiple times

    Replays real-time deployment: the model is re-run on the growing window
    [current_pos:window_end] one day at a time, and only the probability of
    the window's LAST timestep is used.  A harvest is declared once that
    probability exceeds `threshold` for `consecutive_days` windows in a row.

    Returns:
        list of (harvest_date, harvest_idx, last_prob) tuples.
    """
    harvest_dates = []
    current_pos = 0  # absolute index of the current season's first day
    while current_pos < len(field_data):
        consecutive_above_threshold = 0
        # window_end is an ABSOLUTE (exclusive) end index into field_data.
        for window_end in range(current_pos + 1, len(field_data) + 1):
            window_data = field_data.iloc[current_pos:window_end].copy().reset_index(drop=True)
            try:
                features = extract_features(window_data, config['features'], ci_column=ci_column)
                # Apply scalers
                for fi, scaler in enumerate(scalers):
                    try:
                        features[:, fi] = scaler.transform(features[:, fi].reshape(-1, 1)).flatten()
                    except Exception:
                        pass
                # Run model
                with torch.no_grad():
                    x_tensor = torch.tensor(features, dtype=torch.float32).unsqueeze(0).to(device)
                    output = model(x_tensor)
                    # Two-head models return (imminent, detected); use detected.
                    if isinstance(output, tuple):
                        _, detected_probs = output
                        detected_probs = detected_probs.squeeze(0).cpu().numpy()
                    else:
                        detected_probs = output.squeeze(0).cpu().numpy()
                # Check LAST timestep
                last_prob = detected_probs[-1]
                if last_prob > threshold:
                    consecutive_above_threshold += 1
                else:
                    consecutive_above_threshold = 0
                # Harvest detected
                if consecutive_above_threshold >= consecutive_days:
                    # NOTE(review): window_end is already absolute, so adding
                    # current_pos here looks like it double-counts the season
                    # offset (compare method_single_run, which uses
                    # pos - consecutive_days + 1).  Kept as-is because this
                    # script is meant to replicate production behavior —
                    # TODO confirm against the production implementation.
                    harvest_idx = current_pos + window_end - consecutive_days - 1
                    harvest_date = field_data.iloc[harvest_idx]['Date']
                    harvest_dates.append((harvest_date, harvest_idx, last_prob))
                    # Reset to next day after harvest
                    current_pos = current_pos + window_end - consecutive_days
                    break
            except Exception:
                # Best-effort: skip windows whose extraction/inference fails.
                continue
        else:
            # Inner loop exhausted without a detection: no more harvests.
            break
    return harvest_dates
def method_single_run(field_data, model, config, scalers, ci_column, device, threshold=0.3, consecutive_days=2):
    """Proposed method: run the model once on the full sequence, then scan
    the probability trace for harvests.

    A harvest is declared at the first day of any run of `consecutive_days`
    days whose detected probability exceeds `threshold`; scanning then
    restarts just past that run's first day.

    Returns:
        list of (harvest_date, harvest_idx, prob) tuples.
    """
    harvest_dates = []
    current_pos = 0
    try:
        # Extract features ONCE for full dataset
        features = extract_features(field_data, config['features'], ci_column=ci_column)
        # Apply scalers; a failing scaler leaves that column unscaled.
        for fi, scaler in enumerate(scalers):
            try:
                features[:, fi] = scaler.transform(features[:, fi].reshape(-1, 1)).flatten()
            except Exception:
                pass
        # Run model ONCE to get all probabilities
        with torch.no_grad():
            x_tensor = torch.tensor(features, dtype=torch.float32).unsqueeze(0).to(device)
            output = model(x_tensor)
            # Two-head models return (imminent, detected); use detected.
            if isinstance(output, tuple):
                _, detected_probs = output
                detected_probs = detected_probs.squeeze(0).cpu().numpy()
            else:
                detected_probs = output.squeeze(0).cpu().numpy()
        # Scan forward looking for harvests
        while current_pos < len(field_data):
            consecutive_above_threshold = 0
            for pos in range(current_pos, len(field_data)):
                prob = detected_probs[pos]
                if prob > threshold:
                    consecutive_above_threshold += 1
                else:
                    consecutive_above_threshold = 0
                # Harvest detected
                if consecutive_above_threshold >= consecutive_days:
                    # First day of the qualifying run.
                    harvest_idx = pos - consecutive_days + 1
                    harvest_date = field_data.iloc[harvest_idx]['Date']
                    harvest_dates.append((harvest_date, harvest_idx, prob))
                    # Move anchor point past this harvest
                    current_pos = harvest_idx + 1
                    break
            else:
                # No harvest found in remaining data
                break
    except Exception as e:
        # Was a silent `pass` — surface the failure so an empty result list
        # is distinguishable from "no harvests detected".
        print(f"  Warning: single-run method failed: {e}")
    return harvest_dates
def compare_field(field_id, ci_data, model, config, scalers, device, threshold=0.3):
    """Run both detection methods on one field, timing each, and return a
    comparison record (or None when the field has fewer than 10 days)."""
    rows = ci_data[ci_data['field'] == field_id].sort_values('Date').reset_index(drop=True)
    if len(rows) < 10:
        return None
    ci_col = config['data']['ci_column']

    def _timed(method):
        # Run one detection method and measure its wall-clock duration.
        t0 = time.time()
        found = method(rows, model, config, scalers, ci_col, device, threshold)
        return found, time.time() - t0

    # Method 1: growing window — one model run per day (slow).
    print(f"\n Growing Window method...", end=" ", flush=True)
    growing_harvests, time_growing = _timed(method_growing_window)
    print(f"({time_growing:.2f}s, {len(rows)} model runs)")
    # Method 2: single pass over the full sequence (fast).
    print(f" Single Run method...", end=" ", flush=True)
    single_harvests, time_single = _timed(method_single_run)
    print(f"({time_single:.2f}s, 1 model run)")
    return {
        'field': field_id,
        'num_days': len(rows),
        'growing_harvests': growing_harvests,
        'single_harvests': single_harvests,
        'time_growing': time_growing,
        'time_single': time_single,
        'speedup': time_growing / time_single if time_single > 0 else 0,
    }
def main():
    """Compare growing-window vs single-run harvest detection on a sample of
    fields and summarize agreement and speedup.

    CLI: python <script> [project_name] [num_samples] [threshold]
    """
    project_name = sys.argv[1] if len(sys.argv) > 1 else "angata"
    num_samples = int(sys.argv[2]) if len(sys.argv) > 2 else 3
    threshold = float(sys.argv[3]) if len(sys.argv) > 3 else 0.3
    # Load data
    base_storage = Path("../laravel_app/storage/app") / project_name / "Data"
    ci_data_dir = base_storage / "extracted_ci" / "ci_data_for_python"
    CI_DATA_FILE = ci_data_dir / "ci_data_for_python.csv"
    if not CI_DATA_FILE.exists():
        print(f"ERROR: {CI_DATA_FILE} not found")
        return
    print(f"Loading CI data...")
    ci_data = pd.read_csv(CI_DATA_FILE, dtype={'field': str})
    ci_data['Date'] = pd.to_datetime(ci_data['Date'])
    # Fixed seed so repeated runs compare the same sample of fields.
    all_fields = sorted(ci_data['field'].unique())
    np.random.seed(42)
    sample_fields = np.random.choice(all_fields, size=min(num_samples, len(all_fields)), replace=False)
    # Load model
    print(f"Loading model...")
    model, config, scalers = load_model_and_config(Path("."))
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"\n{'='*80}")
    print(f"COMPARING METHODS (threshold={threshold}, consecutive_days=2)")
    print(f"{'='*80}")
    all_results = []
    for idx, field_id in enumerate(sample_fields, 1):
        print(f"\n[{idx}/{len(sample_fields)}] Field {field_id}")
        result = compare_field(field_id, ci_data, model, config, scalers, device, threshold)
        if result:
            all_results.append(result)
            print(f"\n Growing Window found {len(result['growing_harvests'])} harvests:")
            # Bugfix: the harvest-index loop variable was named `idx`,
            # shadowing the outer enumerate counter; it is unused here.
            for date, _, prob in result['growing_harvests']:
                print(f" - {date.date()}: prob={prob:.4f}")
            print(f"\n Single Run found {len(result['single_harvests'])} harvests:")
            for date, _, prob in result['single_harvests']:
                print(f" - {date.date()}: prob={prob:.4f}")
            print(f"\n ⏱ Speed comparison:")
            print(f" Growing Window: {result['time_growing']:.2f}s ({result['num_days']} days processed)")
            print(f" Single Run: {result['time_single']:.2f}s (1 run)")
            print(f" Speedup: {result['speedup']:.0f}x faster")
            # Compare the two methods' harvest dates for this field.
            growing_dates = [d for d, _, _ in result['growing_harvests']]
            single_dates = [d for d, _, _ in result['single_harvests']]
            if growing_dates == single_dates:
                print(f"\n ✓ IDENTICAL: Both methods found the same harvest dates")
            else:
                print(f"\n ✗ DIFFERENT: Methods found different harvest dates")
                print(f" Growing only: {[d for d in growing_dates if d not in single_dates]}")
                print(f" Single only: {[d for d in single_dates if d not in growing_dates]}")
    # Summary
    print(f"\n{'='*80}")
    print(f"SUMMARY ({len(all_results)} fields tested)")
    print(f"{'='*80}")
    identical = sum(1 for r in all_results if [d for d, _, _ in r['growing_harvests']] == [d for d, _, _ in r['single_harvests']])
    different = len(all_results) - identical
    print(f"\nIdentical results: {identical}/{len(all_results)}")
    print(f"Different results: {different}/{len(all_results)}")
    if all_results:
        avg_speedup = np.mean([r['speedup'] for r in all_results])
        print(f"\nAverage speedup: {avg_speedup:.0f}x")
        print(f"\nConclusion:")
        if identical == len(all_results):
            print(f" ✓ Methods are EQUIVALENT - Single run approach is safe to use")
            print(f" Recommend switching to single run for {avg_speedup:.0f}x faster execution")
        else:
            print(f" ✗ Methods produce DIFFERENT results - Need to understand why")
            print(f" Growing window is slower but may be more accurate for live deployment")
if __name__ == "__main__":
    main()

View file

@ -0,0 +1,276 @@
"""
TEST SCRIPT: Inspect raw model output distributions for a single field
Purpose: Diagnose why thresholding is failing and harvest dates are wrong
This shows:
1. Distribution of harvest probability scores (0-1 range)
2. How often consecutive_days=3 is actually achievable
3. Actual season boundaries detected
4. Recommendations for threshold adjustment
Usage:
python test_model_output_distribution.py angata [field_id]
Examples:
python test_model_output_distribution.py angata 1
python test_model_output_distribution.py angata 10042
"""
import pandas as pd
import numpy as np
import torch
import sys
from pathlib import Path
from harvest_date_pred_utils import (
load_model_and_config,
extract_features,
)
def analyze_single_field(ci_data, field_id, model, config, scalers, device):
    """Analyze raw model outputs for a single field

    Prints (does not return) a diagnostic report: probability statistics,
    threshold/consecutive-day feasibility, top peaks, per-day timelines,
    low-probability valleys, sharp drops, and which harvest dates each
    (threshold, consecutive_days) combination would detect.
    """
    # Filter to field
    field_data = ci_data[ci_data['field'] == field_id].sort_values('Date').reset_index(drop=True)
    if len(field_data) == 0:
        print(f"ERROR: No data for field {field_id}")
        return
    print(f"\n{'='*80}")
    print(f"FIELD: {field_id}")
    print(f"{'='*80}")
    print(f"Date range: {field_data['Date'].min()} to {field_data['Date'].max()}")
    print(f"Total days: {len(field_data)}")
    # Extract features (same as main pipeline)
    try:
        ci_column = config['data']['ci_column']
        features = extract_features(field_data, config['features'], ci_column=ci_column)
        if features is None or len(features) == 0:
            print(f"ERROR: extract_features returned empty for field {field_id}")
            return
    except Exception as e:
        print(f"ERROR extracting features: {e}")
        import traceback
        traceback.print_exc()
        return
    # Apply scalers (CRITICAL - same as production code)
    try:
        # One scaler per feature column; a failed scaler is reported but
        # leaves that column unscaled.
        for fi, scaler in enumerate(scalers):
            try:
                features[:, fi] = scaler.transform(features[:, fi].reshape(-1, 1)).flatten()
            except Exception as e:
                print(f" Warning: Scaler {fi} failed: {e}")
                pass
    except Exception as e:
        print(f"ERROR applying scalers: {e}")
        import traceback
        traceback.print_exc()
        return
    # Run model
    features_tensor = torch.FloatTensor(features).unsqueeze(0).to(device)
    with torch.no_grad():
        output = model(features_tensor)
    # Convert to numpy (handle different output formats)
    # Model has TWO heads: imminent_probs, detected_probs
    if isinstance(output, tuple):
        imminent_probs = output[0].cpu().numpy().flatten()
        detected_probs = output[1].cpu().numpy().flatten()
        probs = detected_probs  # Use DETECTED head (current harvest), not imminent (future)
        print(f"Model outputs TWO heads: imminent + detected")
        print(f" Using DETECTED head for harvest detection (sparse spikes at harvest)")
    else:
        # Fallback for single output
        probs = output.cpu().numpy().flatten()
        print(f"Model output: single head")
    print(f"\nModel output shape: {probs.shape}")
    print(f"Output range: {probs.min():.4f} to {probs.max():.4f}")
    # Statistics
    print(f"\nHarvest probability statistics:")
    print(f" Mean: {probs.mean():.4f}")
    print(f" Median: {np.median(probs):.4f}")
    print(f" Std Dev: {probs.std():.4f}")
    print(f" Min: {probs.min():.4f}")
    print(f" Max: {probs.max():.4f}")
    # Distribution by threshold
    print(f"\nDistribution by threshold:")
    thresholds = [0.2, 0.3, 0.4, 0.45, 0.5, 0.6, 0.7]
    for thresh in thresholds:
        count = np.sum(probs > thresh)
        pct = 100 * count / len(probs)
        print(f" > {thresh}: {count:4d} days ({pct:5.1f}%)")
    # Consecutive days analysis (key metric)
    print(f"\nConsecutive days above threshold (Phase 1 requirement):")
    for thresh in [0.3, 0.4, 0.45, 0.5]:
        above = (probs > thresh).astype(int)
        # Find consecutive runs
        # (diff of a 0-padded indicator: +1 marks run starts, -1 run ends)
        changes = np.diff(np.concatenate(([0], above, [0])))
        starts = np.where(changes == 1)[0]
        ends = np.where(changes == -1)[0]
        runs = ends - starts
        if len(runs) > 0:
            max_run = np.max(runs)
            print(f" Threshold {thresh}: max consecutive = {max_run} days (need 3 for Phase 1)")
            if len(runs) > 5:
                print(f" {len(runs)} separate runs detected (seasons?)")
        else:
            print(f" Threshold {thresh}: no runs detected")
    # Show top predictions
    print(f"\nTop 10 harvest probability peaks:")
    top_indices = np.argsort(probs)[-10:][::-1]
    for rank, idx in enumerate(top_indices, 1):
        date = field_data.iloc[idx]['Date']
        prob = probs[idx]
        print(f" {rank:2d}. Day {idx:4d} ({date}): {prob:.4f}")
    # Show timeline
    # NOTE(review): the message says "every 10 days" but the step is
    # len(probs)//20, i.e. ~20 evenly spaced samples — confirm intent.
    print(f"\nTimeline of probabilities (sampling every 10 days):")
    for idx in range(0, len(probs), max(1, len(probs) // 20)):
        date_str = field_data.iloc[idx]['Date'].strftime("%Y-%m-%d")
        ci_value = field_data.iloc[idx]['FitData']
        prob = probs[idx]
        bar = '-' * int(prob * 35)
        try:
            print(f" {date_str} CI={ci_value:.4f} Prob={prob:.4f} {bar}")
        except UnicodeEncodeError:
            print(f" {date_str} CI={ci_value:.4f} Prob={prob:.4f}")
    # FULL DAILY PROBABILITIES WITH CI VALUES
    print(f"\n{'='*80}")
    print(f"FULL DAILY PROBABILITIES WITH CI VALUES ({len(probs)} days):")
    print(f"{'='*80}")
    print(f"{'Day':>4} {'Date':<12} {'CI':>8} {'Probability':>12} {'Visual':<40}")
    print(f"{'-'*100}")
    for idx in range(len(probs)):
        date_str = field_data.iloc[idx]['Date'].strftime("%Y-%m-%d")
        ci_value = field_data.iloc[idx]['FitData']
        prob = probs[idx]
        bar = '-' * int(prob * 35)  # Use dashes instead of█ for Unicode safety
        try:
            print(f"{idx:4d} {date_str} {ci_value:8.4f} {prob:12.4f} {bar}")
        except UnicodeEncodeError:
            # Fallback for Windows encoding issues
            print(f"{idx:4d} {date_str} {ci_value:8.4f} {prob:12.4f}")
    print(f"{'-'*100}")
    # Find valleys (days with low probabilities that could indicate season boundaries)
    print(f"\nDays with LOWEST probabilities (potential season boundaries):")
    valleys_threshold = 0.5  # Days below this might be season breaks
    valley_indices = np.where(probs < valleys_threshold)[0]
    if len(valley_indices) > 0:
        print(f" Found {len(valley_indices)} days below {valleys_threshold}")
        # Get valleys sorted by probability
        valley_data = [(idx, probs[idx], field_data.iloc[idx]['Date']) for idx in valley_indices]
        valley_data.sort(key=lambda x: x[1])  # Sort by probability (lowest first)
        print(f"\n Bottom 20 lowest-probability days:")
        for rank, (idx, prob, date) in enumerate(valley_data[:20], 1):
            print(f" {rank:2d}. Day {idx:3d} ({date}): {prob:.4f}")
    else:
        print(f" None - all days above {valleys_threshold}")
    # Identify likely harvest dates by finding local minima (valleys between growing periods)
    print(f"\nLikely season boundaries (local minima approach):")
    # Find indices where probability suddenly drops (derivative)
    if len(probs) > 7:
        # 7-day centered rolling mean smooths single-day noise before diffing.
        smoothed = pd.Series(probs).rolling(window=7, center=True).mean()
        derivatives = smoothed.diff().fillna(0)
        # Find big drops (where derivative is very negative)
        drops = np.where(derivatives < -0.2)[0]  # Significant downward moves
        if len(drops) > 0:
            print(f" Found {len(drops)} significant drops (prob falling by 0.2+):")
            for idx in drops[:10]:  # Show first 10
                date = field_data.iloc[idx]['Date']
                before = probs[max(0, idx-1)]
                after = probs[idx]
                print(f" Day {idx:3d} ({date}): {before:.4f} → {after:.4f}")
        else:
            print(f" No significant drops detected (probabilities don't dip much)")
    # Show which harvest dates would be detected at different thresholds
    print(f"\nHarvest detection (first day where prob > threshold for N consecutive days):")
    for thresh in [0.2, 0.3, 0.4, 0.5, 0.6]:
        for consec in [1, 2, 3]:
            above = (probs > thresh).astype(int)
            changes = np.diff(np.concatenate(([0], above, [0])))
            starts = np.where(changes == 1)[0]
            if len(starts) > 0:
                # For each harvest start, find where it would trigger with consecutive_days
                detected_harvests = []
                for start_idx in starts:
                    # Check if we have enough consecutive days
                    if start_idx + consec - 1 < len(probs):
                        if all(probs[start_idx:start_idx + consec] > thresh):
                            harvest_date = field_data.iloc[start_idx]['Date']
                            detected_harvests.append((start_idx, harvest_date))
                if detected_harvests:
                    first_idx, first_date = detected_harvests[0]
                    print(f" Threshold={thresh}, consecutive={consec}: {first_date} (day {first_idx})")
                else:
                    print(f" Threshold={thresh}, consecutive={consec}: None detected")
def main():
    """Load CI data and the model, then print a full diagnostic report for one
    field (the first available field when none is given).

    CLI: python test_model_output_distribution.py [project_name] [field_id]
    """
    project_name = sys.argv[1] if len(sys.argv) > 1 else "angata"
    field_id = sys.argv[2] if len(sys.argv) > 2 else None
    # Paths
    base_storage = Path("../laravel_app/storage/app") / project_name / "Data"
    ci_data_dir = base_storage / "extracted_ci" / "ci_data_for_python"
    CI_DATA_FILE = ci_data_dir / "ci_data_for_python.csv"
    if not CI_DATA_FILE.exists():
        print(f"ERROR: {CI_DATA_FILE} not found")
        return
    print(f"Loading CI data from {CI_DATA_FILE}...")
    ci_data = pd.read_csv(CI_DATA_FILE, dtype={'field': str})
    ci_data['Date'] = pd.to_datetime(ci_data['Date'])
    if field_id is None:
        # Show first 10 fields if none specified
        fields = sorted(ci_data['field'].unique())[:10]
        field_id = fields[0]
        print(f"No field specified. Testing first field: {field_id}")
        print(f"Available fields: {', '.join(fields)}")
    # Load model (load_model_and_config is imported at module level; the
    # previous in-function re-import was redundant and has been removed).
    print(f"\nLoading model...")
    model, config, scalers = load_model_and_config(Path("."))
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Device: {device}")
    # Analyze
    analyze_single_field(ci_data, str(field_id), model, config, scalers, device)
    print(f"\n{'='*80}")
    print("INTERPRETATION:")
    print(" If max consecutive < 3 for threshold=0.5:")
    print(" → Lower threshold to 0.3-0.4, or reduce consecutive_days to 1-2")
    print(" If multiple runs detected but harvest_date == first date:")
    print(" → Season detection is failing (check extract_features)")
    print(" If peaks are scattered randomly:")
    print(" → Model may need retraining or data validation")
    print("="*80)
if __name__ == "__main__":
    main()