From 7b347ddba625086b6833f54f37df9ad1833bd1a2 Mon Sep 17 00:00:00 2001 From: Timon Date: Thu, 15 Jan 2026 09:08:07 +0100 Subject: [PATCH] Fix: Revert run_phase1_growing_window to original GitHub version (off-by-one fix) and move test scripts to harvest_detection_experiments/tests --- python_app/22_harvest_baseline_prediction.py | 4 +- python_app/harvest_date_pred_utils.py | 1 - .../tests/test_batch_model_distributions.py | 175 +++++++++++ .../test_growing_window_vs_single_run.py | 272 +++++++++++++++++ .../tests/test_model_output_distribution.py | 276 ++++++++++++++++++ 5 files changed, 725 insertions(+), 3 deletions(-) create mode 100644 python_app/harvest_detection_experiments/tests/test_batch_model_distributions.py create mode 100644 python_app/harvest_detection_experiments/tests/test_growing_window_vs_single_run.py create mode 100644 python_app/harvest_detection_experiments/tests/test_model_output_distribution.py diff --git a/python_app/22_harvest_baseline_prediction.py b/python_app/22_harvest_baseline_prediction.py index 7a3a1e2..ac7c435 100644 --- a/python_app/22_harvest_baseline_prediction.py +++ b/python_app/22_harvest_baseline_prediction.py @@ -107,9 +107,9 @@ def main(): # [3/4] Run model predictions with two-step detection print("\n[3/4] Running two-step harvest detection...") - print(" (Using threshold=0.45, consecutive_days=2 - tuned for Model 307 output)") + print(" (Using threshold=0.3, consecutive_days=2 - tuned for Model 307 output)") refined_results = run_two_step_refinement(ci_data, model, config, scalers, device=device, - phase1_threshold=0.45, phase1_consecutive=2) + phase1_threshold=0.3, phase1_consecutive=2) # Build and export print("\nBuilding production harvest table...") diff --git a/python_app/harvest_date_pred_utils.py b/python_app/harvest_date_pred_utils.py index f66e58e..ebf77c8 100644 --- a/python_app/harvest_date_pred_utils.py +++ b/python_app/harvest_date_pred_utils.py @@ -307,7 +307,6 @@ def run_phase1_growing_window(field_data, model, config, scalers, ci_column, dev with torch.no_grad(): x_tensor = torch.tensor(features, dtype=torch.float32).unsqueeze(0).to(device) imminent_probs, detected_probs = model(x_tensor) - detected_probs = detected_probs.squeeze(0).cpu().numpy() # Check LAST timestep last_prob = detected_probs[-1] diff --git a/python_app/harvest_detection_experiments/tests/test_batch_model_distributions.py b/python_app/harvest_detection_experiments/tests/test_batch_model_distributions.py new file mode 100644 index 0000000..d9540cb --- /dev/null +++ b/python_app/harvest_detection_experiments/tests/test_batch_model_distributions.py @@ -0,0 +1,175 @@ +""" +BATCH TEST: Sample multiple fields to understand model output distribution +Purpose: Determine optimal threshold and consecutive_days parameters + +This runs the model on 10 random fields and summarizes the results, +helping us decide what parameters to use in the production script. +""" + +import pandas as pd +import numpy as np +import torch +import sys +from pathlib import Path +from harvest_date_pred_utils import ( + load_model_and_config, + extract_features, +) + + +def test_field(ci_data, field_id, model, config, scalers, device): + """Test a single field and return summary statistics""" + + field_data = ci_data[ci_data['field'] == field_id].sort_values('Date').reset_index(drop=True) + if len(field_data) == 0: + return None + + try: + ci_column = config['data']['ci_column'] + features = extract_features(field_data, config['features'], ci_column=ci_column) + + # Apply scalers + for fi, scaler in enumerate(scalers): + try: + features[:, fi] = scaler.transform(features[:, fi].reshape(-1, 1)).flatten() + except: + pass + + # Run model + features_tensor = torch.FloatTensor(features).unsqueeze(0).to(device) + with torch.no_grad(): + output = model(features_tensor) + + if isinstance(output, tuple): + detected_probs = output[1].cpu().numpy().flatten() + else: + detected_probs = output.cpu().numpy().flatten() + + # Analyze + max_prob = detected_probs.max() + mean_prob = detected_probs.mean() + median_prob = np.median(detected_probs) + + # Count consecutive days above thresholds + consecutive_above = {} + for thresh in [0.2, 0.3, 0.4, 0.5]: + above = (detected_probs > thresh).astype(int) + changes = np.diff(np.concatenate(([0], above, [0]))) + starts = np.where(changes == 1)[0] + ends = np.where(changes == -1)[0] + runs = ends - starts if len(starts) > 0 else [] + consecutive_above[thresh] = np.max(runs) if len(runs) > 0 else 0 + + return { + 'field': field_id, + 'max_prob': max_prob, + 'mean_prob': mean_prob, + 'median_prob': median_prob, + 'consecutive_0.2': consecutive_above[0.2], + 'consecutive_0.3': consecutive_above[0.3], + 'consecutive_0.4': consecutive_above[0.4], + 'consecutive_0.5': consecutive_above[0.5], + 'num_days': len(field_data), + } + except Exception as e: + return None + + +def main(): + project_name = sys.argv[1] if len(sys.argv) > 1 else "angata" + num_samples = int(sys.argv[2]) if len(sys.argv) > 2 else 10 + + # Load data + base_storage = Path("../laravel_app/storage/app") / project_name / "Data" + ci_data_dir = base_storage / "extracted_ci" / "ci_data_for_python" + CI_DATA_FILE = ci_data_dir / "ci_data_for_python.csv" + + if not CI_DATA_FILE.exists(): + print(f"ERROR: {CI_DATA_FILE} not found") + return + + print(f"Loading CI data from {CI_DATA_FILE}...") + ci_data = pd.read_csv(CI_DATA_FILE, dtype={'field': str}) + ci_data['Date'] = pd.to_datetime(ci_data['Date']) + + # Get random sample of fields + all_fields = sorted(ci_data['field'].unique()) + np.random.seed(42) + sample_fields = np.random.choice(all_fields, size=min(num_samples, len(all_fields)), replace=False) + print(f"Testing {len(sample_fields)} random fields...") + + # Load model + print(f"Loading model...") + from harvest_date_pred_utils import load_model_and_config + model, config, scalers = load_model_and_config(Path(".")) + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + + # Test each field + results = [] + for idx, field_id in enumerate(sample_fields, 1): + result = test_field(ci_data, field_id, model, config, scalers, device) + if result: + results.append(result) + print(f" [{idx:2d}/{len(sample_fields)}] Field {field_id:>6s}: " + f"max={result['max_prob']:.3f} mean={result['mean_prob']:.4f} " + f"consec@0.3={result['consecutive_0.3']:2d} consec@0.2={result['consecutive_0.2']:2d}") + + # Summary statistics + results_df = pd.DataFrame(results) + + print(f"\n{'='*80}") + print(f"SUMMARY STATISTICS ({len(results)} fields tested):") + print(f"{'='*80}") + print(f"\nMax probability (per field):") + print(f" Mean: {results_df['max_prob'].mean():.4f}") + print(f" Median: {results_df['max_prob'].median():.4f}") + print(f" Min: {results_df['max_prob'].min():.4f}") + print(f" Max: {results_df['max_prob'].max():.4f}") + + print(f"\nMean probability (per field):") + print(f" Mean: {results_df['mean_prob'].mean():.4f}") + print(f" Median: {results_df['mean_prob'].median():.4f}") + + print(f"\nConsecutive days above threshold=0.3:") + print(f" Mean: {results_df['consecutive_0.3'].mean():.1f}") + print(f" Median: {results_df['consecutive_0.3'].median():.1f}") + print(f" Min: {results_df['consecutive_0.3'].min():.0f}") + print(f" Max: {results_df['consecutive_0.3'].max():.0f}") + + print(f"\nConsecutive days above threshold=0.2:") + print(f" Mean: {results_df['consecutive_0.2'].mean():.1f}") + print(f" Median: {results_df['consecutive_0.2'].median():.1f}") + print(f" Min: {results_df['consecutive_0.2'].min():.0f}") + print(f" Max: {results_df['consecutive_0.2'].max():.0f}") + + print(f"\n{'='*80}") + print(f"RECOMMENDATION:") + print(f"{'='*80}") + + # Calculate recommendations based on data + median_consec_0_3 = results_df['consecutive_0.3'].median() + median_consec_0_2 = results_df['consecutive_0.2'].median() + + if median_consec_0_3 >= 3: + print(f"✓ Threshold=0.3 with consecutive_days=3 should work") + print(f" (median consecutive days: {median_consec_0_3:.0f})") + elif median_consec_0_3 >= 2: + print(f"✓ Threshold=0.3 with consecutive_days=2 recommended") + print(f" (median consecutive days: {median_consec_0_3:.0f})") + elif median_consec_0_2 >= 3: + print(f"✓ Threshold=0.2 with consecutive_days=3 recommended") + print(f" (median consecutive days: {median_consec_0_2:.0f})") + else: + print(f"✓ Threshold=0.2 with consecutive_days=2 recommended") + print(f" (median consecutive days @ 0.2: {median_consec_0_2:.0f})") + + print(f"\nCurrent production settings: threshold=0.45, consecutive_days=2") + print(f" → These are likely TOO STRICT (only 289 fields detected in batch run)") + print(f"\nSuggested adjustment:") + print(f" → Lower threshold to 0.2-0.3") + print(f" → Reduce consecutive_days to 1-2") + print(f" → Re-run batch to get ~1000+ fields detected") + + +if __name__ == "__main__": + main() diff --git a/python_app/harvest_detection_experiments/tests/test_growing_window_vs_single_run.py b/python_app/harvest_detection_experiments/tests/test_growing_window_vs_single_run.py new file mode 100644 index 0000000..bcda46d --- /dev/null +++ b/python_app/harvest_detection_experiments/tests/test_growing_window_vs_single_run.py @@ -0,0 +1,272 @@ +""" +COMPARISON TEST: Growing Window vs Single Run approach +Purpose: Compare harvest dates detected by two different methods + +Method 1 (Growing Window - Current): + - Day 1: Run model on [day1] + - Day 2: Run model on [day1:2] + - Day 3: Run model on [day1:3] + - ... up to day 477 + - This matches real-time production where data arrives daily + - Takes ~477 model runs per field (SLOW) + +Method 2 (Single Run - Proposed): + - Run model ONCE on full [day1:477] sequence + - Use these probabilities to scan for harvests + - This is 477x faster but assumes different LSTM context + - May produce different harvest dates + +Question: Do these methods detect similar harvest dates or different ones? +""" + +import pandas as pd +import numpy as np +import torch +import sys +from pathlib import Path +from harvest_date_pred_utils import ( + load_model_and_config, + extract_features, +) +import time + + +def method_growing_window(field_data, model, config, scalers, ci_column, device, threshold=0.3, consecutive_days=2): + """Original method: expanding window, run model multiple times""" + harvest_dates = [] + current_pos = 0 + + while current_pos < len(field_data): + consecutive_above_threshold = 0 + + for window_end in range(current_pos + 1, len(field_data) + 1): + window_data = field_data.iloc[current_pos:window_end].copy().reset_index(drop=True) + + try: + features = extract_features(window_data, config['features'], ci_column=ci_column) + + # Apply scalers + for fi, scaler in enumerate(scalers): + try: + features[:, fi] = scaler.transform(features[:, fi].reshape(-1, 1)).flatten() + except Exception: + pass + + # Run model + with torch.no_grad(): + x_tensor = torch.tensor(features, dtype=torch.float32).unsqueeze(0).to(device) + output = model(x_tensor) + + if isinstance(output, tuple): + _, detected_probs = output + detected_probs = detected_probs.squeeze(0).cpu().numpy() + else: + detected_probs = output.squeeze(0).cpu().numpy() + + # Check LAST timestep + last_prob = detected_probs[-1] + + if last_prob > threshold: + consecutive_above_threshold += 1 + else: + consecutive_above_threshold = 0 + + # Harvest detected + if consecutive_above_threshold >= consecutive_days: + harvest_idx = current_pos + window_end - consecutive_days - 1 + harvest_date = field_data.iloc[harvest_idx]['Date'] + harvest_dates.append((harvest_date, harvest_idx, last_prob)) + + # Reset to next day after harvest + current_pos = current_pos + window_end - consecutive_days + break + + except Exception: + continue + else: + break + + return harvest_dates + + +def method_single_run(field_data, model, config, scalers, ci_column, device, threshold=0.3, consecutive_days=2): + """Proposed method: run model once on full sequence""" + harvest_dates = [] + current_pos = 0 + + try: + # Extract features ONCE for full dataset + features = extract_features(field_data, config['features'], ci_column=ci_column) + + # Apply scalers + for fi, scaler in enumerate(scalers): + try: + features[:, fi] = scaler.transform(features[:, fi].reshape(-1, 1)).flatten() + except Exception: + pass + + # Run model ONCE to get all probabilities + with torch.no_grad(): + x_tensor = torch.tensor(features, dtype=torch.float32).unsqueeze(0).to(device) + output = model(x_tensor) + + if isinstance(output, tuple): + _, detected_probs = output + detected_probs = detected_probs.squeeze(0).cpu().numpy() + else: + detected_probs = output.squeeze(0).cpu().numpy() + + # Scan forward looking for harvests + while current_pos < len(field_data): + consecutive_above_threshold = 0 + + for pos in range(current_pos, len(field_data)): + prob = detected_probs[pos] + + if prob > threshold: + consecutive_above_threshold += 1 + else: + consecutive_above_threshold = 0 + + # Harvest detected + if consecutive_above_threshold >= consecutive_days: + harvest_idx = pos - consecutive_days + 1 + harvest_date = field_data.iloc[harvest_idx]['Date'] + harvest_dates.append((harvest_date, harvest_idx, prob)) + + # Move anchor point past this harvest + current_pos = harvest_idx + 1 + break + else: + # No harvest found in remaining data + break + + except Exception as e: + pass + + return harvest_dates + + +def compare_field(field_id, ci_data, model, config, scalers, device, threshold=0.3): + """Compare both methods for a single field""" + field_data = ci_data[ci_data['field'] == field_id].sort_values('Date').reset_index(drop=True) + + if len(field_data) < 10: + return None + + ci_column = config['data']['ci_column'] + + # Method 1: Growing window (SLOW) + print(f"\n Growing Window method...", end=" ", flush=True) + start = time.time() + growing_harvests = method_growing_window(field_data, model, config, scalers, ci_column, device, threshold) + time_growing = time.time() - start + print(f"({time_growing:.2f}s, {len(field_data)} model runs)") + + # Method 2: Single run (FAST) + print(f" Single Run method...", end=" ", flush=True) + start = time.time() + single_harvests = method_single_run(field_data, model, config, scalers, ci_column, device, threshold) + time_single = time.time() - start + print(f"({time_single:.2f}s, 1 model run)") + + return { + 'field': field_id, + 'num_days': len(field_data), + 'growing_harvests': growing_harvests, + 'single_harvests': single_harvests, + 'time_growing': time_growing, + 'time_single': time_single, + 'speedup': time_growing / time_single if time_single > 0 else 0, + } + + +def main(): + project_name = sys.argv[1] if len(sys.argv) > 1 else "angata" + num_samples = int(sys.argv[2]) if len(sys.argv) > 2 else 3 + threshold = float(sys.argv[3]) if len(sys.argv) > 3 else 0.3 + + # Load data + base_storage = Path("../laravel_app/storage/app") / project_name / "Data" + ci_data_dir = base_storage / "extracted_ci" / "ci_data_for_python" + CI_DATA_FILE = ci_data_dir / "ci_data_for_python.csv" + + if not CI_DATA_FILE.exists(): + print(f"ERROR: {CI_DATA_FILE} not found") + return + + print(f"Loading CI data...") + ci_data = pd.read_csv(CI_DATA_FILE, dtype={'field': str}) + ci_data['Date'] = pd.to_datetime(ci_data['Date']) + + # Get sample of fields + all_fields = sorted(ci_data['field'].unique()) + np.random.seed(42) + sample_fields = np.random.choice(all_fields, size=min(num_samples, len(all_fields)), replace=False) + + # Load model + print(f"Loading model...") + model, config, scalers = load_model_and_config(Path(".")) + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + + print(f"\n{'='*80}") + print(f"COMPARING METHODS (threshold={threshold}, consecutive_days=2)") + print(f"{'='*80}") + + all_results = [] + for idx, field_id in enumerate(sample_fields, 1): + print(f"\n[{idx}/{len(sample_fields)}] Field {field_id}") + result = compare_field(field_id, ci_data, model, config, scalers, device, threshold) + + if result: + all_results.append(result) + + print(f"\n Growing Window found {len(result['growing_harvests'])} harvests:") + for date, idx, prob in result['growing_harvests']: + print(f" - {date.date()}: prob={prob:.4f}") + + print(f"\n Single Run found {len(result['single_harvests'])} harvests:") + for date, idx, prob in result['single_harvests']: + print(f" - {date.date()}: prob={prob:.4f}") + + print(f"\n ⏱ Speed comparison:") + print(f" Growing Window: {result['time_growing']:.2f}s ({result['num_days']} days processed)") + print(f" Single Run: {result['time_single']:.2f}s (1 run)") + print(f" Speedup: {result['speedup']:.0f}x faster") + + # Compare harvests + growing_dates = [d for d, _, _ in result['growing_harvests']] + single_dates = [d for d, _, _ in result['single_harvests']] + + if growing_dates == single_dates: + print(f"\n ✓ IDENTICAL: Both methods found the same harvest dates") + else: + print(f"\n ✗ DIFFERENT: Methods found different harvest dates") + print(f" Growing only: {[d for d in growing_dates if d not in single_dates]}") + print(f" Single only: {[d for d in single_dates if d not in growing_dates]}") + + # Summary + print(f"\n{'='*80}") + print(f"SUMMARY ({len(all_results)} fields tested)") + print(f"{'='*80}") + + identical = sum(1 for r in all_results if [d for d, _, _ in r['growing_harvests']] == [d for d, _, _ in r['single_harvests']]) + different = len(all_results) - identical + + print(f"\nIdentical results: {identical}/{len(all_results)}") + print(f"Different results: {different}/{len(all_results)}") + + if all_results: + avg_speedup = np.mean([r['speedup'] for r in all_results]) + print(f"\nAverage speedup: {avg_speedup:.0f}x") + print(f"\nConclusion:") + if identical == len(all_results): + print(f" ✓ Methods are EQUIVALENT - Single run approach is safe to use") + print(f" Recommend switching to single run for {avg_speedup:.0f}x faster execution") + else: + print(f" ✗ Methods produce DIFFERENT results - Need to understand why") + print(f" Growing window is slower but may be more accurate for live deployment") + + +if __name__ == "__main__": + main() diff --git a/python_app/harvest_detection_experiments/tests/test_model_output_distribution.py b/python_app/harvest_detection_experiments/tests/test_model_output_distribution.py new file mode 100644 index 0000000..571a32c --- /dev/null +++ b/python_app/harvest_detection_experiments/tests/test_model_output_distribution.py @@ -0,0 +1,276 @@ +""" +TEST SCRIPT: Inspect raw model output distributions for a single field +Purpose: Diagnose why thresholding is failing and harvest dates are wrong + +This shows: +1. Distribution of harvest probability scores (0-1 range) +2. How often consecutive_days=3 is actually achievable +3. Actual season boundaries detected +4. Recommendations for threshold adjustment + +Usage: + python test_model_output_distribution.py angata [field_id] + +Examples: + python test_model_output_distribution.py angata 1 + python test_model_output_distribution.py angata 10042 +""" + +import pandas as pd +import numpy as np +import torch +import sys +from pathlib import Path +from harvest_date_pred_utils import ( + load_model_and_config, + extract_features, +) + + +def analyze_single_field(ci_data, field_id, model, config, scalers, device): + """Analyze raw model outputs for a single field""" + + # Filter to field + field_data = ci_data[ci_data['field'] == field_id].sort_values('Date').reset_index(drop=True) + + if len(field_data) == 0: + print(f"ERROR: No data for field {field_id}") + return + + print(f"\n{'='*80}") + print(f"FIELD: {field_id}") + print(f"{'='*80}") + print(f"Date range: {field_data['Date'].min()} to {field_data['Date'].max()}") + print(f"Total days: {len(field_data)}") + + # Extract features (same as main pipeline) + try: + ci_column = config['data']['ci_column'] + features = extract_features(field_data, config['features'], ci_column=ci_column) + if features is None or len(features) == 0: + print(f"ERROR: extract_features returned empty for field {field_id}") + return + except Exception as e: + print(f"ERROR extracting features: {e}") + import traceback + traceback.print_exc() + return + + # Apply scalers (CRITICAL - same as production code) + try: + for fi, scaler in enumerate(scalers): + try: + features[:, fi] = scaler.transform(features[:, fi].reshape(-1, 1)).flatten() + except Exception as e: + print(f" Warning: Scaler {fi} failed: {e}") + pass + except Exception as e: + print(f"ERROR applying scalers: {e}") + import traceback + traceback.print_exc() + return + + # Run model + features_tensor = torch.FloatTensor(features).unsqueeze(0).to(device) + with torch.no_grad(): + output = model(features_tensor) + + # Convert to numpy (handle different output formats) + # Model has TWO heads: imminent_probs, detected_probs + if isinstance(output, tuple): + imminent_probs = output[0].cpu().numpy().flatten() + detected_probs = output[1].cpu().numpy().flatten() + probs = detected_probs # Use DETECTED head (current harvest), not imminent (future) + print(f"Model outputs TWO heads: imminent + detected") + print(f" Using DETECTED head for harvest detection (sparse spikes at harvest)") + else: + # Fallback for single output + probs = output.cpu().numpy().flatten() + print(f"Model output: single head") + + print(f"\nModel output shape: {probs.shape}") + print(f"Output range: {probs.min():.4f} to {probs.max():.4f}") + + # Statistics + print(f"\nHarvest probability statistics:") + print(f" Mean: {probs.mean():.4f}") + print(f" Median: {np.median(probs):.4f}") + print(f" Std Dev: {probs.std():.4f}") + print(f" Min: {probs.min():.4f}") + print(f" Max: {probs.max():.4f}") + + # Distribution by threshold + print(f"\nDistribution by threshold:") + thresholds = [0.2, 0.3, 0.4, 0.45, 0.5, 0.6, 0.7] + for thresh in thresholds: + count = np.sum(probs > thresh) + pct = 100 * count / len(probs) + print(f" > {thresh}: {count:4d} days ({pct:5.1f}%)") + + # Consecutive days analysis (key metric) + print(f"\nConsecutive days above threshold (Phase 1 requirement):") + for thresh in [0.3, 0.4, 0.45, 0.5]: + above = (probs > thresh).astype(int) + # Find consecutive runs + changes = np.diff(np.concatenate(([0], above, [0]))) + starts = np.where(changes == 1)[0] + ends = np.where(changes == -1)[0] + runs = ends - starts + + if len(runs) > 0: + max_run = np.max(runs) + print(f" Threshold {thresh}: max consecutive = {max_run} days (need 3 for Phase 1)") + if len(runs) > 5: + print(f" {len(runs)} separate runs detected (seasons?)") + else: + print(f" Threshold {thresh}: no runs detected") + + # Show top predictions + print(f"\nTop 10 harvest probability peaks:") + top_indices = np.argsort(probs)[-10:][::-1] + for rank, idx in enumerate(top_indices, 1): + date = field_data.iloc[idx]['Date'] + prob = probs[idx] + print(f" {rank:2d}. Day {idx:4d} ({date}): {prob:.4f}") + + # Show timeline + print(f"\nTimeline of probabilities (sampling every 10 days):") + for idx in range(0, len(probs), max(1, len(probs) // 20)): + date_str = field_data.iloc[idx]['Date'].strftime("%Y-%m-%d") + ci_value = field_data.iloc[idx]['FitData'] + prob = probs[idx] + bar = '-' * int(prob * 35) + try: + print(f" {date_str} CI={ci_value:.4f} Prob={prob:.4f} {bar}") + except UnicodeEncodeError: + print(f" {date_str} CI={ci_value:.4f} Prob={prob:.4f}") + + # FULL DAILY PROBABILITIES WITH CI VALUES + print(f"\n{'='*80}") + print(f"FULL DAILY PROBABILITIES WITH CI VALUES ({len(probs)} days):") + print(f"{'='*80}") + print(f"{'Day':>4} {'Date':<12} {'CI':>8} {'Probability':>12} {'Visual':<40}") + print(f"{'-'*100}") + + for idx in range(len(probs)): + date_str = field_data.iloc[idx]['Date'].strftime("%Y-%m-%d") + ci_value = field_data.iloc[idx]['FitData'] + prob = probs[idx] + bar = '-' * int(prob * 35) # Use dashes instead of█ for Unicode safety + try: + print(f"{idx:4d} {date_str} {ci_value:8.4f} {prob:12.4f} {bar}") + except UnicodeEncodeError: + # Fallback for Windows encoding issues + print(f"{idx:4d} {date_str} {ci_value:8.4f} {prob:12.4f}") + + print(f"{'-'*100}") + + # Find valleys (days with low probabilities that could indicate season boundaries) + print(f"\nDays with LOWEST probabilities (potential season boundaries):") + valleys_threshold = 0.5 # Days below this might be season breaks + valley_indices = np.where(probs < valleys_threshold)[0] + + if len(valley_indices) > 0: + print(f" Found {len(valley_indices)} days below {valleys_threshold}") + # Get valleys sorted by probability + valley_data = [(idx, probs[idx], field_data.iloc[idx]['Date']) for idx in valley_indices] + valley_data.sort(key=lambda x: x[1]) # Sort by probability (lowest first) + + print(f"\n Bottom 20 lowest-probability days:") + for rank, (idx, prob, date) in enumerate(valley_data[:20], 1): + print(f" {rank:2d}. Day {idx:3d} ({date}): {prob:.4f}") + else: + print(f" None - all days above {valleys_threshold}") + + # Identify likely harvest dates by finding local minima (valleys between growing periods) + print(f"\nLikely season boundaries (local minima approach):") + # Find indices where probability suddenly drops (derivative) + if len(probs) > 7: + smoothed = pd.Series(probs).rolling(window=7, center=True).mean() + derivatives = smoothed.diff().fillna(0) + + # Find big drops (where derivative is very negative) + drops = np.where(derivatives < -0.2)[0] # Significant downward moves + + if len(drops) > 0: + print(f" Found {len(drops)} significant drops (prob falling by 0.2+):") + for idx in drops[:10]: # Show first 10 + date = field_data.iloc[idx]['Date'] + before = probs[max(0, idx-1)] + after = probs[idx] + print(f" Day {idx:3d} ({date}): {before:.4f} → {after:.4f}") + else: + print(f" No significant drops detected (probabilities don't dip much)") + + # Show which harvest dates would be detected at different thresholds + print(f"\nHarvest detection (first day where prob > threshold for N consecutive days):") + for thresh in [0.2, 0.3, 0.4, 0.5, 0.6]: + for consec in [1, 2, 3]: + above = (probs > thresh).astype(int) + changes = np.diff(np.concatenate(([0], above, [0]))) + starts = np.where(changes == 1)[0] + + if len(starts) > 0: + # For each harvest start, find where it would trigger with consecutive_days + detected_harvests = [] + for start_idx in starts: + # Check if we have enough consecutive days + if start_idx + consec - 1 < len(probs): + if all(probs[start_idx:start_idx + consec] > thresh): + harvest_date = field_data.iloc[start_idx]['Date'] + detected_harvests.append((start_idx, harvest_date)) + + if detected_harvests: + first_idx, first_date = detected_harvests[0] + print(f" Threshold={thresh}, consecutive={consec}: {first_date} (day {first_idx})") + else: + print(f" Threshold={thresh}, consecutive={consec}: None detected") + + +def main(): + project_name = sys.argv[1] if len(sys.argv) > 1 else "angata" + field_id = sys.argv[2] if len(sys.argv) > 2 else None + + # Paths + base_storage = Path("../laravel_app/storage/app") / project_name / "Data" + ci_data_dir = base_storage / "extracted_ci" / "ci_data_for_python" + CI_DATA_FILE = ci_data_dir / "ci_data_for_python.csv" + + if not CI_DATA_FILE.exists(): + print(f"ERROR: {CI_DATA_FILE} not found") + return + + print(f"Loading CI data from {CI_DATA_FILE}...") + ci_data = pd.read_csv(CI_DATA_FILE, dtype={'field': str}) + ci_data['Date'] = pd.to_datetime(ci_data['Date']) + + if field_id is None: + # Show first 10 fields if none specified + fields = sorted(ci_data['field'].unique())[:10] + field_id = fields[0] + print(f"No field specified. Testing first field: {field_id}") + print(f"Available fields: {', '.join(fields)}") + + # Load model + print(f"\nLoading model...") + from harvest_date_pred_utils import load_model_and_config + model, config, scalers = load_model_and_config(Path(".")) + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + print(f"Device: {device}") + + # Analyze + analyze_single_field(ci_data, str(field_id), model, config, scalers, device) + + print(f"\n{'='*80}") + print("INTERPRETATION:") + print(" If max consecutive < 3 for threshold=0.5:") + print(" → Lower threshold to 0.3-0.4, or reduce consecutive_days to 1-2") + print(" If multiple runs detected but harvest_date == first date:") + print(" → Season detection is failing (check extract_features)") + print(" If peaks are scattered randomly:") + print(" → Model may need retraining or data validation") + print("="*80) + + +if __name__ == "__main__": + main()