From 7b347ddba625086b6833f54f37df9ad1833bd1a2 Mon Sep 17 00:00:00 2001 From: Timon Date: Thu, 15 Jan 2026 09:08:07 +0100 Subject: [PATCH 01/15] Fix: Revert run_phase1_growing_window to original GitHub version (off-by-one fix) and move test scripts to harvest_detection_experiments/tests --- python_app/22_harvest_baseline_prediction.py | 4 +- python_app/harvest_date_pred_utils.py | 1 - .../tests/test_batch_model_distributions.py | 175 +++++++++++ .../test_growing_window_vs_single_run.py | 272 +++++++++++++++++ .../tests/test_model_output_distribution.py | 276 ++++++++++++++++++ 5 files changed, 725 insertions(+), 3 deletions(-) create mode 100644 python_app/harvest_detection_experiments/tests/test_batch_model_distributions.py create mode 100644 python_app/harvest_detection_experiments/tests/test_growing_window_vs_single_run.py create mode 100644 python_app/harvest_detection_experiments/tests/test_model_output_distribution.py diff --git a/python_app/22_harvest_baseline_prediction.py b/python_app/22_harvest_baseline_prediction.py index 7a3a1e2..ac7c435 100644 --- a/python_app/22_harvest_baseline_prediction.py +++ b/python_app/22_harvest_baseline_prediction.py @@ -107,9 +107,9 @@ def main(): # [3/4] Run model predictions with two-step detection print("\n[3/4] Running two-step harvest detection...") - print(" (Using threshold=0.45, consecutive_days=2 - tuned for Model 307 output)") + print(" (Using threshold=0.3, consecutive_days=2 - tuned for Model 307 output)") refined_results = run_two_step_refinement(ci_data, model, config, scalers, device=device, - phase1_threshold=0.45, phase1_consecutive=2) + phase1_threshold=0.3, phase1_consecutive=2) # Build and export print("\nBuilding production harvest table...") diff --git a/python_app/harvest_date_pred_utils.py b/python_app/harvest_date_pred_utils.py index f66e58e..ebf77c8 100644 --- a/python_app/harvest_date_pred_utils.py +++ b/python_app/harvest_date_pred_utils.py @@ -307,7 +307,6 @@ def 
run_phase1_growing_window(field_data, model, config, scalers, ci_column, dev with torch.no_grad(): x_tensor = torch.tensor(features, dtype=torch.float32).unsqueeze(0).to(device) imminent_probs, detected_probs = model(x_tensor) - detected_probs = detected_probs.squeeze(0).cpu().numpy() # Check LAST timestep last_prob = detected_probs[-1] diff --git a/python_app/harvest_detection_experiments/tests/test_batch_model_distributions.py b/python_app/harvest_detection_experiments/tests/test_batch_model_distributions.py new file mode 100644 index 0000000..d9540cb --- /dev/null +++ b/python_app/harvest_detection_experiments/tests/test_batch_model_distributions.py @@ -0,0 +1,175 @@ +""" +BATCH TEST: Sample multiple fields to understand model output distribution +Purpose: Determine optimal threshold and consecutive_days parameters + +This runs the model on 10 random fields and summarizes the results, +helping us decide what parameters to use in the production script. +""" + +import pandas as pd +import numpy as np +import torch +import sys +from pathlib import Path +from harvest_date_pred_utils import ( + load_model_and_config, + extract_features, +) + + +def test_field(ci_data, field_id, model, config, scalers, device): + """Test a single field and return summary statistics""" + + field_data = ci_data[ci_data['field'] == field_id].sort_values('Date').reset_index(drop=True) + if len(field_data) == 0: + return None + + try: + ci_column = config['data']['ci_column'] + features = extract_features(field_data, config['features'], ci_column=ci_column) + + # Apply scalers + for fi, scaler in enumerate(scalers): + try: + features[:, fi] = scaler.transform(features[:, fi].reshape(-1, 1)).flatten() + except: + pass + + # Run model + features_tensor = torch.FloatTensor(features).unsqueeze(0).to(device) + with torch.no_grad(): + output = model(features_tensor) + + if isinstance(output, tuple): + detected_probs = output[1].cpu().numpy().flatten() + else: + detected_probs = 
output.cpu().numpy().flatten() + + # Analyze + max_prob = detected_probs.max() + mean_prob = detected_probs.mean() + median_prob = np.median(detected_probs) + + # Count consecutive days above thresholds + consecutive_above = {} + for thresh in [0.2, 0.3, 0.4, 0.5]: + above = (detected_probs > thresh).astype(int) + changes = np.diff(np.concatenate(([0], above, [0]))) + starts = np.where(changes == 1)[0] + ends = np.where(changes == -1)[0] + runs = ends - starts if len(starts) > 0 else [] + consecutive_above[thresh] = np.max(runs) if len(runs) > 0 else 0 + + return { + 'field': field_id, + 'max_prob': max_prob, + 'mean_prob': mean_prob, + 'median_prob': median_prob, + 'consecutive_0.2': consecutive_above[0.2], + 'consecutive_0.3': consecutive_above[0.3], + 'consecutive_0.4': consecutive_above[0.4], + 'consecutive_0.5': consecutive_above[0.5], + 'num_days': len(field_data), + } + except Exception as e: + return None + + +def main(): + project_name = sys.argv[1] if len(sys.argv) > 1 else "angata" + num_samples = int(sys.argv[2]) if len(sys.argv) > 2 else 10 + + # Load data + base_storage = Path("../laravel_app/storage/app") / project_name / "Data" + ci_data_dir = base_storage / "extracted_ci" / "ci_data_for_python" + CI_DATA_FILE = ci_data_dir / "ci_data_for_python.csv" + + if not CI_DATA_FILE.exists(): + print(f"ERROR: {CI_DATA_FILE} not found") + return + + print(f"Loading CI data from {CI_DATA_FILE}...") + ci_data = pd.read_csv(CI_DATA_FILE, dtype={'field': str}) + ci_data['Date'] = pd.to_datetime(ci_data['Date']) + + # Get random sample of fields + all_fields = sorted(ci_data['field'].unique()) + np.random.seed(42) + sample_fields = np.random.choice(all_fields, size=min(num_samples, len(all_fields)), replace=False) + print(f"Testing {len(sample_fields)} random fields...") + + # Load model + print(f"Loading model...") + from harvest_date_pred_utils import load_model_and_config + model, config, scalers = load_model_and_config(Path(".")) + device = torch.device("cuda" 
if torch.cuda.is_available() else "cpu") + + # Test each field + results = [] + for idx, field_id in enumerate(sample_fields, 1): + result = test_field(ci_data, field_id, model, config, scalers, device) + if result: + results.append(result) + print(f" [{idx:2d}/{len(sample_fields)}] Field {field_id:>6s}: " + f"max={result['max_prob']:.3f} mean={result['mean_prob']:.4f} " + f"consec@0.3={result['consecutive_0.3']:2d} consec@0.2={result['consecutive_0.2']:2d}") + + # Summary statistics + results_df = pd.DataFrame(results) + + print(f"\n{'='*80}") + print(f"SUMMARY STATISTICS ({len(results)} fields tested):") + print(f"{'='*80}") + print(f"\nMax probability (per field):") + print(f" Mean: {results_df['max_prob'].mean():.4f}") + print(f" Median: {results_df['max_prob'].median():.4f}") + print(f" Min: {results_df['max_prob'].min():.4f}") + print(f" Max: {results_df['max_prob'].max():.4f}") + + print(f"\nMean probability (per field):") + print(f" Mean: {results_df['mean_prob'].mean():.4f}") + print(f" Median: {results_df['mean_prob'].median():.4f}") + + print(f"\nConsecutive days above threshold=0.3:") + print(f" Mean: {results_df['consecutive_0.3'].mean():.1f}") + print(f" Median: {results_df['consecutive_0.3'].median():.1f}") + print(f" Min: {results_df['consecutive_0.3'].min():.0f}") + print(f" Max: {results_df['consecutive_0.3'].max():.0f}") + + print(f"\nConsecutive days above threshold=0.2:") + print(f" Mean: {results_df['consecutive_0.2'].mean():.1f}") + print(f" Median: {results_df['consecutive_0.2'].median():.1f}") + print(f" Min: {results_df['consecutive_0.2'].min():.0f}") + print(f" Max: {results_df['consecutive_0.2'].max():.0f}") + + print(f"\n{'='*80}") + print(f"RECOMMENDATION:") + print(f"{'='*80}") + + # Calculate recommendations based on data + median_consec_0_3 = results_df['consecutive_0.3'].median() + median_consec_0_2 = results_df['consecutive_0.2'].median() + + if median_consec_0_3 >= 3: + print(f"✓ Threshold=0.3 with consecutive_days=3 should 
work") + print(f" (median consecutive days: {median_consec_0_3:.0f})") + elif median_consec_0_3 >= 2: + print(f"✓ Threshold=0.3 with consecutive_days=2 recommended") + print(f" (median consecutive days: {median_consec_0_3:.0f})") + elif median_consec_0_2 >= 3: + print(f"✓ Threshold=0.2 with consecutive_days=3 recommended") + print(f" (median consecutive days: {median_consec_0_2:.0f})") + else: + print(f"✓ Threshold=0.2 with consecutive_days=2 recommended") + print(f" (median consecutive days @ 0.2: {median_consec_0_2:.0f})") + + print(f"\nCurrent production settings: threshold=0.45, consecutive_days=2") + print(f" → These are likely TOO STRICT (only 289 fields detected in batch run)") + print(f"\nSuggested adjustment:") + print(f" → Lower threshold to 0.2-0.3") + print(f" → Reduce consecutive_days to 1-2") + print(f" → Re-run batch to get ~1000+ fields detected") + + +if __name__ == "__main__": + main() diff --git a/python_app/harvest_detection_experiments/tests/test_growing_window_vs_single_run.py b/python_app/harvest_detection_experiments/tests/test_growing_window_vs_single_run.py new file mode 100644 index 0000000..bcda46d --- /dev/null +++ b/python_app/harvest_detection_experiments/tests/test_growing_window_vs_single_run.py @@ -0,0 +1,272 @@ +""" +COMPARISON TEST: Growing Window vs Single Run approach +Purpose: Compare harvest dates detected by two different methods + +Method 1 (Growing Window - Current): + - Day 1: Run model on [day1] + - Day 2: Run model on [day1:2] + - Day 3: Run model on [day1:3] + - ... up to day 477 + - This matches real-time production where data arrives daily + - Takes ~477 model runs per field (SLOW) + +Method 2 (Single Run - Proposed): + - Run model ONCE on full [day1:477] sequence + - Use these probabilities to scan for harvests + - This is 477x faster but assumes different LSTM context + - May produce different harvest dates + +Question: Do these methods detect similar harvest dates or different ones? 
+""" + +import pandas as pd +import numpy as np +import torch +import sys +from pathlib import Path +from harvest_date_pred_utils import ( + load_model_and_config, + extract_features, +) +import time + + +def method_growing_window(field_data, model, config, scalers, ci_column, device, threshold=0.3, consecutive_days=2): + """Original method: expanding window, run model multiple times""" + harvest_dates = [] + current_pos = 0 + + while current_pos < len(field_data): + consecutive_above_threshold = 0 + + for window_end in range(current_pos + 1, len(field_data) + 1): + window_data = field_data.iloc[current_pos:window_end].copy().reset_index(drop=True) + + try: + features = extract_features(window_data, config['features'], ci_column=ci_column) + + # Apply scalers + for fi, scaler in enumerate(scalers): + try: + features[:, fi] = scaler.transform(features[:, fi].reshape(-1, 1)).flatten() + except Exception: + pass + + # Run model + with torch.no_grad(): + x_tensor = torch.tensor(features, dtype=torch.float32).unsqueeze(0).to(device) + output = model(x_tensor) + + if isinstance(output, tuple): + _, detected_probs = output + detected_probs = detected_probs.squeeze(0).cpu().numpy() + else: + detected_probs = output.squeeze(0).cpu().numpy() + + # Check LAST timestep + last_prob = detected_probs[-1] + + if last_prob > threshold: + consecutive_above_threshold += 1 + else: + consecutive_above_threshold = 0 + + # Harvest detected + if consecutive_above_threshold >= consecutive_days: + harvest_idx = current_pos + window_end - consecutive_days - 1 + harvest_date = field_data.iloc[harvest_idx]['Date'] + harvest_dates.append((harvest_date, harvest_idx, last_prob)) + + # Reset to next day after harvest + current_pos = current_pos + window_end - consecutive_days + break + + except Exception: + continue + else: + break + + return harvest_dates + + +def method_single_run(field_data, model, config, scalers, ci_column, device, threshold=0.3, consecutive_days=2): + """Proposed method: run 
model once on full sequence""" + harvest_dates = [] + current_pos = 0 + + try: + # Extract features ONCE for full dataset + features = extract_features(field_data, config['features'], ci_column=ci_column) + + # Apply scalers + for fi, scaler in enumerate(scalers): + try: + features[:, fi] = scaler.transform(features[:, fi].reshape(-1, 1)).flatten() + except Exception: + pass + + # Run model ONCE to get all probabilities + with torch.no_grad(): + x_tensor = torch.tensor(features, dtype=torch.float32).unsqueeze(0).to(device) + output = model(x_tensor) + + if isinstance(output, tuple): + _, detected_probs = output + detected_probs = detected_probs.squeeze(0).cpu().numpy() + else: + detected_probs = output.squeeze(0).cpu().numpy() + + # Scan forward looking for harvests + while current_pos < len(field_data): + consecutive_above_threshold = 0 + + for pos in range(current_pos, len(field_data)): + prob = detected_probs[pos] + + if prob > threshold: + consecutive_above_threshold += 1 + else: + consecutive_above_threshold = 0 + + # Harvest detected + if consecutive_above_threshold >= consecutive_days: + harvest_idx = pos - consecutive_days + 1 + harvest_date = field_data.iloc[harvest_idx]['Date'] + harvest_dates.append((harvest_date, harvest_idx, prob)) + + # Move anchor point past this harvest + current_pos = harvest_idx + 1 + break + else: + # No harvest found in remaining data + break + + except Exception as e: + pass + + return harvest_dates + + +def compare_field(field_id, ci_data, model, config, scalers, device, threshold=0.3): + """Compare both methods for a single field""" + field_data = ci_data[ci_data['field'] == field_id].sort_values('Date').reset_index(drop=True) + + if len(field_data) < 10: + return None + + ci_column = config['data']['ci_column'] + + # Method 1: Growing window (SLOW) + print(f"\n Growing Window method...", end=" ", flush=True) + start = time.time() + growing_harvests = method_growing_window(field_data, model, config, scalers, ci_column, 
device, threshold) + time_growing = time.time() - start + print(f"({time_growing:.2f}s, {len(field_data)} model runs)") + + # Method 2: Single run (FAST) + print(f" Single Run method...", end=" ", flush=True) + start = time.time() + single_harvests = method_single_run(field_data, model, config, scalers, ci_column, device, threshold) + time_single = time.time() - start + print(f"({time_single:.2f}s, 1 model run)") + + return { + 'field': field_id, + 'num_days': len(field_data), + 'growing_harvests': growing_harvests, + 'single_harvests': single_harvests, + 'time_growing': time_growing, + 'time_single': time_single, + 'speedup': time_growing / time_single if time_single > 0 else 0, + } + + +def main(): + project_name = sys.argv[1] if len(sys.argv) > 1 else "angata" + num_samples = int(sys.argv[2]) if len(sys.argv) > 2 else 3 + threshold = float(sys.argv[3]) if len(sys.argv) > 3 else 0.3 + + # Load data + base_storage = Path("../laravel_app/storage/app") / project_name / "Data" + ci_data_dir = base_storage / "extracted_ci" / "ci_data_for_python" + CI_DATA_FILE = ci_data_dir / "ci_data_for_python.csv" + + if not CI_DATA_FILE.exists(): + print(f"ERROR: {CI_DATA_FILE} not found") + return + + print(f"Loading CI data...") + ci_data = pd.read_csv(CI_DATA_FILE, dtype={'field': str}) + ci_data['Date'] = pd.to_datetime(ci_data['Date']) + + # Get sample of fields + all_fields = sorted(ci_data['field'].unique()) + np.random.seed(42) + sample_fields = np.random.choice(all_fields, size=min(num_samples, len(all_fields)), replace=False) + + # Load model + print(f"Loading model...") + model, config, scalers = load_model_and_config(Path(".")) + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + + print(f"\n{'='*80}") + print(f"COMPARING METHODS (threshold={threshold}, consecutive_days=2)") + print(f"{'='*80}") + + all_results = [] + for idx, field_id in enumerate(sample_fields, 1): + print(f"\n[{idx}/{len(sample_fields)}] Field {field_id}") + result = 
compare_field(field_id, ci_data, model, config, scalers, device, threshold) + + if result: + all_results.append(result) + + print(f"\n Growing Window found {len(result['growing_harvests'])} harvests:") + for date, idx, prob in result['growing_harvests']: + print(f" - {date.date()}: prob={prob:.4f}") + + print(f"\n Single Run found {len(result['single_harvests'])} harvests:") + for date, idx, prob in result['single_harvests']: + print(f" - {date.date()}: prob={prob:.4f}") + + print(f"\n ⏱ Speed comparison:") + print(f" Growing Window: {result['time_growing']:.2f}s ({result['num_days']} days processed)") + print(f" Single Run: {result['time_single']:.2f}s (1 run)") + print(f" Speedup: {result['speedup']:.0f}x faster") + + # Compare harvests + growing_dates = [d for d, _, _ in result['growing_harvests']] + single_dates = [d for d, _, _ in result['single_harvests']] + + if growing_dates == single_dates: + print(f"\n ✓ IDENTICAL: Both methods found the same harvest dates") + else: + print(f"\n ✗ DIFFERENT: Methods found different harvest dates") + print(f" Growing only: {[d for d in growing_dates if d not in single_dates]}") + print(f" Single only: {[d for d in single_dates if d not in growing_dates]}") + + # Summary + print(f"\n{'='*80}") + print(f"SUMMARY ({len(all_results)} fields tested)") + print(f"{'='*80}") + + identical = sum(1 for r in all_results if [d for d, _, _ in r['growing_harvests']] == [d for d, _, _ in r['single_harvests']]) + different = len(all_results) - identical + + print(f"\nIdentical results: {identical}/{len(all_results)}") + print(f"Different results: {different}/{len(all_results)}") + + if all_results: + avg_speedup = np.mean([r['speedup'] for r in all_results]) + print(f"\nAverage speedup: {avg_speedup:.0f}x") + print(f"\nConclusion:") + if identical == len(all_results): + print(f" ✓ Methods are EQUIVALENT - Single run approach is safe to use") + print(f" Recommend switching to single run for {avg_speedup:.0f}x faster execution") + else: + 
print(f" ✗ Methods produce DIFFERENT results - Need to understand why") + print(f" Growing window is slower but may be more accurate for live deployment") + + +if __name__ == "__main__": + main() diff --git a/python_app/harvest_detection_experiments/tests/test_model_output_distribution.py b/python_app/harvest_detection_experiments/tests/test_model_output_distribution.py new file mode 100644 index 0000000..571a32c --- /dev/null +++ b/python_app/harvest_detection_experiments/tests/test_model_output_distribution.py @@ -0,0 +1,276 @@ +""" +TEST SCRIPT: Inspect raw model output distributions for a single field +Purpose: Diagnose why thresholding is failing and harvest dates are wrong + +This shows: +1. Distribution of harvest probability scores (0-1 range) +2. How often consecutive_days=3 is actually achievable +3. Actual season boundaries detected +4. Recommendations for threshold adjustment + +Usage: + python test_model_output_distribution.py angata [field_id] + +Examples: + python test_model_output_distribution.py angata 1 + python test_model_output_distribution.py angata 10042 +""" + +import pandas as pd +import numpy as np +import torch +import sys +from pathlib import Path +from harvest_date_pred_utils import ( + load_model_and_config, + extract_features, +) + + +def analyze_single_field(ci_data, field_id, model, config, scalers, device): + """Analyze raw model outputs for a single field""" + + # Filter to field + field_data = ci_data[ci_data['field'] == field_id].sort_values('Date').reset_index(drop=True) + + if len(field_data) == 0: + print(f"ERROR: No data for field {field_id}") + return + + print(f"\n{'='*80}") + print(f"FIELD: {field_id}") + print(f"{'='*80}") + print(f"Date range: {field_data['Date'].min()} to {field_data['Date'].max()}") + print(f"Total days: {len(field_data)}") + + # Extract features (same as main pipeline) + try: + ci_column = config['data']['ci_column'] + features = extract_features(field_data, config['features'], ci_column=ci_column) + 
if features is None or len(features) == 0: + print(f"ERROR: extract_features returned empty for field {field_id}") + return + except Exception as e: + print(f"ERROR extracting features: {e}") + import traceback + traceback.print_exc() + return + + # Apply scalers (CRITICAL - same as production code) + try: + for fi, scaler in enumerate(scalers): + try: + features[:, fi] = scaler.transform(features[:, fi].reshape(-1, 1)).flatten() + except Exception as e: + print(f" Warning: Scaler {fi} failed: {e}") + pass + except Exception as e: + print(f"ERROR applying scalers: {e}") + import traceback + traceback.print_exc() + return + + # Run model + features_tensor = torch.FloatTensor(features).unsqueeze(0).to(device) + with torch.no_grad(): + output = model(features_tensor) + + # Convert to numpy (handle different output formats) + # Model has TWO heads: imminent_probs, detected_probs + if isinstance(output, tuple): + imminent_probs = output[0].cpu().numpy().flatten() + detected_probs = output[1].cpu().numpy().flatten() + probs = detected_probs # Use DETECTED head (current harvest), not imminent (future) + print(f"Model outputs TWO heads: imminent + detected") + print(f" Using DETECTED head for harvest detection (sparse spikes at harvest)") + else: + # Fallback for single output + probs = output.cpu().numpy().flatten() + print(f"Model output: single head") + + print(f"\nModel output shape: {probs.shape}") + print(f"Output range: {probs.min():.4f} to {probs.max():.4f}") + + # Statistics + print(f"\nHarvest probability statistics:") + print(f" Mean: {probs.mean():.4f}") + print(f" Median: {np.median(probs):.4f}") + print(f" Std Dev: {probs.std():.4f}") + print(f" Min: {probs.min():.4f}") + print(f" Max: {probs.max():.4f}") + + # Distribution by threshold + print(f"\nDistribution by threshold:") + thresholds = [0.2, 0.3, 0.4, 0.45, 0.5, 0.6, 0.7] + for thresh in thresholds: + count = np.sum(probs > thresh) + pct = 100 * count / len(probs) + print(f" > {thresh}: {count:4d} days 
({pct:5.1f}%)") + + # Consecutive days analysis (key metric) + print(f"\nConsecutive days above threshold (Phase 1 requirement):") + for thresh in [0.3, 0.4, 0.45, 0.5]: + above = (probs > thresh).astype(int) + # Find consecutive runs + changes = np.diff(np.concatenate(([0], above, [0]))) + starts = np.where(changes == 1)[0] + ends = np.where(changes == -1)[0] + runs = ends - starts + + if len(runs) > 0: + max_run = np.max(runs) + print(f" Threshold {thresh}: max consecutive = {max_run} days (need 3 for Phase 1)") + if len(runs) > 5: + print(f" {len(runs)} separate runs detected (seasons?)") + else: + print(f" Threshold {thresh}: no runs detected") + + # Show top predictions + print(f"\nTop 10 harvest probability peaks:") + top_indices = np.argsort(probs)[-10:][::-1] + for rank, idx in enumerate(top_indices, 1): + date = field_data.iloc[idx]['Date'] + prob = probs[idx] + print(f" {rank:2d}. Day {idx:4d} ({date}): {prob:.4f}") + + # Show timeline + print(f"\nTimeline of probabilities (sampling every 10 days):") + for idx in range(0, len(probs), max(1, len(probs) // 20)): + date_str = field_data.iloc[idx]['Date'].strftime("%Y-%m-%d") + ci_value = field_data.iloc[idx]['FitData'] + prob = probs[idx] + bar = '-' * int(prob * 35) + try: + print(f" {date_str} CI={ci_value:.4f} Prob={prob:.4f} {bar}") + except UnicodeEncodeError: + print(f" {date_str} CI={ci_value:.4f} Prob={prob:.4f}") + + # FULL DAILY PROBABILITIES WITH CI VALUES + print(f"\n{'='*80}") + print(f"FULL DAILY PROBABILITIES WITH CI VALUES ({len(probs)} days):") + print(f"{'='*80}") + print(f"{'Day':>4} {'Date':<12} {'CI':>8} {'Probability':>12} {'Visual':<40}") + print(f"{'-'*100}") + + for idx in range(len(probs)): + date_str = field_data.iloc[idx]['Date'].strftime("%Y-%m-%d") + ci_value = field_data.iloc[idx]['FitData'] + prob = probs[idx] + bar = '-' * int(prob * 35) # Use dashes instead of█ for Unicode safety + try: + print(f"{idx:4d} {date_str} {ci_value:8.4f} {prob:12.4f} {bar}") + except 
UnicodeEncodeError: + # Fallback for Windows encoding issues + print(f"{idx:4d} {date_str} {ci_value:8.4f} {prob:12.4f}") + + print(f"{'-'*100}") + + # Find valleys (days with low probabilities that could indicate season boundaries) + print(f"\nDays with LOWEST probabilities (potential season boundaries):") + valleys_threshold = 0.5 # Days below this might be season breaks + valley_indices = np.where(probs < valleys_threshold)[0] + + if len(valley_indices) > 0: + print(f" Found {len(valley_indices)} days below {valleys_threshold}") + # Get valleys sorted by probability + valley_data = [(idx, probs[idx], field_data.iloc[idx]['Date']) for idx in valley_indices] + valley_data.sort(key=lambda x: x[1]) # Sort by probability (lowest first) + + print(f"\n Bottom 20 lowest-probability days:") + for rank, (idx, prob, date) in enumerate(valley_data[:20], 1): + print(f" {rank:2d}. Day {idx:3d} ({date}): {prob:.4f}") + else: + print(f" None - all days above {valleys_threshold}") + + # Identify likely harvest dates by finding local minima (valleys between growing periods) + print(f"\nLikely season boundaries (local minima approach):") + # Find indices where probability suddenly drops (derivative) + if len(probs) > 7: + smoothed = pd.Series(probs).rolling(window=7, center=True).mean() + derivatives = smoothed.diff().fillna(0) + + # Find big drops (where derivative is very negative) + drops = np.where(derivatives < -0.2)[0] # Significant downward moves + + if len(drops) > 0: + print(f" Found {len(drops)} significant drops (prob falling by 0.2+):") + for idx in drops[:10]: # Show first 10 + date = field_data.iloc[idx]['Date'] + before = probs[max(0, idx-1)] + after = probs[idx] + print(f" Day {idx:3d} ({date}): {before:.4f} → {after:.4f}") + else: + print(f" No significant drops detected (probabilities don't dip much)") + + # Show which harvest dates would be detected at different thresholds + print(f"\nHarvest detection (first day where prob > threshold for N consecutive days):") 
+ for thresh in [0.2, 0.3, 0.4, 0.5, 0.6]: + for consec in [1, 2, 3]: + above = (probs > thresh).astype(int) + changes = np.diff(np.concatenate(([0], above, [0]))) + starts = np.where(changes == 1)[0] + + if len(starts) > 0: + # For each harvest start, find where it would trigger with consecutive_days + detected_harvests = [] + for start_idx in starts: + # Check if we have enough consecutive days + if start_idx + consec - 1 < len(probs): + if all(probs[start_idx:start_idx + consec] > thresh): + harvest_date = field_data.iloc[start_idx]['Date'] + detected_harvests.append((start_idx, harvest_date)) + + if detected_harvests: + first_idx, first_date = detected_harvests[0] + print(f" Threshold={thresh}, consecutive={consec}: {first_date} (day {first_idx})") + else: + print(f" Threshold={thresh}, consecutive={consec}: None detected") + + +def main(): + project_name = sys.argv[1] if len(sys.argv) > 1 else "angata" + field_id = sys.argv[2] if len(sys.argv) > 2 else None + + # Paths + base_storage = Path("../laravel_app/storage/app") / project_name / "Data" + ci_data_dir = base_storage / "extracted_ci" / "ci_data_for_python" + CI_DATA_FILE = ci_data_dir / "ci_data_for_python.csv" + + if not CI_DATA_FILE.exists(): + print(f"ERROR: {CI_DATA_FILE} not found") + return + + print(f"Loading CI data from {CI_DATA_FILE}...") + ci_data = pd.read_csv(CI_DATA_FILE, dtype={'field': str}) + ci_data['Date'] = pd.to_datetime(ci_data['Date']) + + if field_id is None: + # Show first 10 fields if none specified + fields = sorted(ci_data['field'].unique())[:10] + field_id = fields[0] + print(f"No field specified. 
Testing first field: {field_id}") + print(f"Available fields: {', '.join(fields)}") + + # Load model + print(f"\nLoading model...") + from harvest_date_pred_utils import load_model_and_config + model, config, scalers = load_model_and_config(Path(".")) + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + print(f"Device: {device}") + + # Analyze + analyze_single_field(ci_data, str(field_id), model, config, scalers, device) + + print(f"\n{'='*80}") + print("INTERPRETATION:") + print(" If max consecutive < 3 for threshold=0.5:") + print(" → Lower threshold to 0.3-0.4, or reduce consecutive_days to 1-2") + print(" If multiple runs detected but harvest_date == first date:") + print(" → Season detection is failing (check extract_features)") + print(" If peaks are scattered randomly:") + print(" → Model may need retraining or data validation") + print("="*80) + + +if __name__ == "__main__": + main() From 4c7ca85d290e1d126acc420b045f68d4dc64be71 Mon Sep 17 00:00:00 2001 From: Timon Date: Thu, 15 Jan 2026 09:28:13 +0100 Subject: [PATCH 02/15] Add script 23: Convert harvest format from production to standard, remove diagnostic script --- python_app/23_convert_harvest_format.py | 225 ++++++++++++++++++++++++ 1 file changed, 225 insertions(+) create mode 100644 python_app/23_convert_harvest_format.py diff --git a/python_app/23_convert_harvest_format.py b/python_app/23_convert_harvest_format.py new file mode 100644 index 0000000..bfa138c --- /dev/null +++ b/python_app/23_convert_harvest_format.py @@ -0,0 +1,225 @@ +#!/usr/bin/env python3 +""" +Script 23: Convert Harvest Format from Production to Standard +============================================================== + +Converts harvest_production_export.xlsx (output from script 22) to the standard +harvest.xlsx format used by R scripts 24+. 
+ +INPUT: + - harvest_production_export.xlsx (from script 22) + Columns: field, season (numeric), season_start_date, season_end_date, phase2_harvest_date + Contains detected harvests only + +OUTPUT: + - harvest.xlsx (standard format for R pipeline) + Columns: field, sub_field, year, season, season_start, season_end, age, sub_area, tonnage_ha + +LOGIC: + 1. For each field, group all detections chronologically + 2. Create one row per completed season (has season_end date) + 3. season_start = first CI date (2024-09-25) for first season, then previous harvest + 1 day + 4. season_end = phase2_harvest_date (refined harvest date from script 22) + 5. year = extracted from season_start date + 6. season = "Data{year} : {field}" format + 7. sub_field = field (same as field) + 8. age, sub_area, tonnage_ha = left empty (filled by R scripts later or other data sources) + +Date Format: YYYY-MM-DD +""" + +import pandas as pd +import numpy as np +from datetime import datetime, timedelta +import os +import sys +from pathlib import Path + + +def get_ci_date_range(project_dir): + """ + Get the date range of CI data to establish season_start for first season. 
+ + Returns: (min_date, max_date) as datetime objects + """ + base_storage = Path("../laravel_app/storage/app") / project_dir / "Data" + ci_data_dir = base_storage / "extracted_ci" / "ci_data_for_python" + ci_csv_path = ci_data_dir / "ci_data_for_python.csv" + + if not ci_csv_path.exists(): + # Fallback: assume data starts 2024-09-25 (typical for projects) + print(f"[WARNING] CI data CSV not found at {ci_csv_path}, assuming CI starts 2024-09-25") + return datetime(2024, 9, 25), datetime.now() + + try: + # Read only date column (first column usually has dates) + df = pd.read_csv(ci_csv_path, nrows=1) + columns = df.columns.tolist() + date_col = columns[0] # First column should be dates + + df = pd.read_csv(ci_csv_path, usecols=[date_col]) + df[date_col] = pd.to_datetime(df[date_col]) + + min_date = df[date_col].min() + max_date = df[date_col].max() + + print(f"[INFO] CI data date range: {min_date.date()} to {max_date.date()}") + return min_date, max_date + except Exception as e: + print(f"[WARNING] Error reading CI date range: {e}, using fallback dates") + return datetime(2024, 9, 25), datetime.now() + + +def convert_harvest_format(project_dir="angata"): + """ + Convert harvest_production_export.xlsx to standard harvest.xlsx format. + + Parameters: + ----------- + project_dir : str + Project name (angata, esa, chemba, etc.) 
+ """ + + print(f"\n{'='*80}") + print(f"Script 23: Convert Harvest Format") + print(f"Project: {project_dir}") + print(f"{'='*80}\n") + + # Get paths (same as script 22) + base_storage = Path("../laravel_app/storage/app") / project_dir / "Data" + harvest_data_dir = base_storage / "HarvestData" + source_file = harvest_data_dir / "harvest_production_export.xlsx" + output_file = base_storage / "harvest.xlsx" + + # Check source file exists + if not source_file.exists(): + print(f"[ERROR] Source file not found: {source_file}") + print(f"[ERROR] Please run script 22 first to generate harvest_production_export.xlsx") + return False + + print(f"[INFO] Reading source file: {source_file}") + + try: + # Read production format + df_source = pd.read_excel(source_file) + print(f"[INFO] Loaded {len(df_source)} harvest detections") + print(f"[INFO] Columns: {list(df_source.columns)}") + + # Validate required columns + required_cols = ["field", "season_start_date", "season_end_date", "phase2_harvest_date"] + missing = [c for c in required_cols if c not in df_source.columns] + if missing: + print(f"[ERROR] Missing columns: {missing}") + return False + + # Get CI date range for establishing first season start + ci_min_date, ci_max_date = get_ci_date_range(project_dir) + first_season_start = ci_min_date.strftime("%Y-%m-%d") + + # Convert to datetime for processing + df_source["phase2_harvest_date"] = pd.to_datetime(df_source["phase2_harvest_date"]) + df_source["field"] = df_source["field"].astype(str) + + # Sort by field and harvest date + df_source = df_source.sort_values(["field", "phase2_harvest_date"]).reset_index(drop=True) + + # Build output rows + output_rows = [] + + # Group by field + for field_id, group_df in df_source.groupby("field"): + # Get all harvest dates for this field, sorted chronologically + harvest_dates = group_df["phase2_harvest_date"].dt.strftime("%Y-%m-%d").tolist() + + print(f"[INFO] Field {field_id}: {len(harvest_dates)} harvest detection(s)") + + # First 
season always starts from CI beginning + current_season_start = first_season_start + + for harvest_idx, harvest_date in enumerate(harvest_dates): + # Extract year from current season start + season_start_obj = pd.to_datetime(current_season_start) + year = season_start_obj.year + + # Create season identifier + season_str = f"Data{year} : {field_id}" + + # Create row for completed season + row = { + "field": field_id, + "sub_field": field_id, # Same as field + "year": year, + "season": season_str, + "season_start": current_season_start, + "season_end": harvest_date, # Filled because harvest detected + "age": "", # Empty - will be calculated in R + "sub_area": "", # Empty - will be populated from other data + "tonnage_ha": "" # Empty - will be populated from other data + } + + output_rows.append(row) + + # Next season starts day after this harvest + next_season_start = (pd.to_datetime(harvest_date) + timedelta(days=1)).strftime("%Y-%m-%d") + current_season_start = next_season_start + + # If field has detections, check if we should add a final incomplete season + # Only if we're not at the end of the monitoring period + last_harvest = pd.to_datetime(harvest_dates[-1]) + days_after_last = (ci_max_date - last_harvest).days + + if days_after_last > 30: # More than 30 days of data after last harvest + # Add incomplete season row (season_end empty) + season_start_obj = pd.to_datetime(current_season_start) + year = season_start_obj.year + season_str = f"Data{year} : {field_id}" + + row = { + "field": field_id, + "sub_field": field_id, + "year": year, + "season": season_str, + "season_start": current_season_start, + "season_end": "", # Empty - season still active + "age": "", + "sub_area": "", + "tonnage_ha": "" + } + + output_rows.append(row) + print(f"[INFO] Added incomplete season starting {current_season_start}") + + # Create output DataFrame + df_output = pd.DataFrame(output_rows) + + # Reorder columns to match standard format + column_order = ["field", "sub_field", 
"year", "season", "season_start", "season_end", + "age", "sub_area", "tonnage_ha"] + df_output = df_output[column_order] + + # Write to Excel + print(f"\n[INFO] Writing output file: {output_file}") + df_output.to_excel(output_file, index=False, sheet_name="Harvest") + + # Print summary + print(f"\n[SUCCESS] Conversion complete!") + print(f"[INFO] Output rows: {len(df_output)}") + print(f"[INFO] Unique fields: {df_output['field'].nunique()}") + print(f"\n[INFO] Sample output:") + print(df_output.head(10).to_string(index=False)) + + return True + + except Exception as e: + print(f"[ERROR] Conversion failed: {e}") + import traceback + traceback.print_exc() + return False + + +if __name__ == "__main__": + # Get project from command line or use default + project = sys.argv[1] if len(sys.argv) > 1 else "angata" + + success = convert_harvest_format(project) + sys.exit(0 if success else 1) From d6448b8703da176a5e9c18e741f2521fc7389080 Mon Sep 17 00:00:00 2001 From: Timon Date: Thu, 15 Jan 2026 10:25:38 +0100 Subject: [PATCH 03/15] Refactor: Update system architecture documentation to reflect Python integration and add complete pipeline overview --- python_app/22_harvest_baseline_prediction.py | 1 + .../system_architecture/PIPELINE_OVERVIEW.md | 447 ++++++++++++++++++ .../system_architecture.md | 4 +- 3 files changed, 450 insertions(+), 2 deletions(-) create mode 100644 r_app/system_architecture/PIPELINE_OVERVIEW.md diff --git a/python_app/22_harvest_baseline_prediction.py b/python_app/22_harvest_baseline_prediction.py index ac7c435..ffd8f14 100644 --- a/python_app/22_harvest_baseline_prediction.py +++ b/python_app/22_harvest_baseline_prediction.py @@ -26,6 +26,7 @@ This is your GROUND TRUTH - compare all future predictions against this baseline Usage: python 01_harvest_baseline_prediction.py [project_name] + conda activate pytorch_gpu; cd "C:\Users\timon\Resilience BV\4020 SCane ESA DEMO - Documenten\General\4020 SCDEMO Team\4020 
TechnicalData\WP3\smartcane_v2\smartcane\python_app"; python 22_harvest_baseline_prediction.py angata Examples: python 01_harvest_baseline_prediction.py angata diff --git a/r_app/system_architecture/PIPELINE_OVERVIEW.md b/r_app/system_architecture/PIPELINE_OVERVIEW.md new file mode 100644 index 0000000..ac7786c --- /dev/null +++ b/r_app/system_architecture/PIPELINE_OVERVIEW.md @@ -0,0 +1,447 @@ +# SmartCane Processing Pipeline - Complete Script Overview + +## Pipeline Execution Order + +## Complete Pipeline Mermaid Diagram + +```mermaid +graph TD + %% ===== INPUTS ===== + API["🔑 Planet API
Credentials"] + GeoJSON["🗺️ pivot.geojson
(Field Boundaries)"] + HarvestIn["📊 harvest.xlsx
(from Stage 23)"] + + %% ===== STAGE 00: DOWNLOAD ===== + Stage00["Stage 00: Python
00_download_8band_pu_optimized.py"] + Out00["📦 merged_tif_8b/
YYYY-MM-DD.tif
(4-band uint16)"] + + %% ===== STAGE 10: OPTIONAL TILING ===== + Stage10["Stage 10: R
10_create_master_grid...
(Optional)"] + Out10["📦 daily_tiles_split/5x5/
YYYY-MM-DD/*.tif
(25 tiles)"] + + %% ===== STAGE 20: CI EXTRACTION ===== + Stage20["Stage 20: R
20_ci_extraction.R"] + Out20a["📦 combined_CI_data.rds
(wide: fields × dates)"] + Out20b["📦 daily RDS files
(per-date stats)"] + + %% ===== STAGE 21: RDS → CSV ===== + Stage21["Stage 21: R
21_convert_ci_rds_to_csv.R"] + Out21["📦 ci_data_for_python.csv
(long format + DOY)"] + + %% ===== STAGE 22: BASELINE HARVEST ===== + Stage22["Stage 22: Python
22_harvest_baseline_prediction.py
(RUN ONCE)"] + Out22["📦 harvest_production_export.xlsx
(baseline predictions)"] + + %% ===== STAGE 23: HARVEST FORMAT ===== + Stage23["Stage 23: Python
23_convert_harvest_format.py"] + Out23["📦 harvest.xlsx
(standard format)
→ Feeds back to Stage 80"] + + %% ===== STAGE 30: GROWTH MODEL ===== + Stage30["Stage 30: R
30_interpolate_growth_model.R"] + Out30["📦 All_pivots_Cumulative_CI...
_quadrant_year_v2.rds
(interpolated daily)"] + + %% ===== STAGE 31: WEEKLY HARVEST ===== + Stage31["Stage 31: Python
31_harvest_imminent_weekly.py
(Weekly)"] + Out31["📦 harvest_imminent_weekly.csv
(probabilities)"] + + %% ===== STAGE 40: MOSAIC ===== + Stage40["Stage 40: R
40_mosaic_creation.R"] + Out40["📦 weekly_mosaic/
week_WW_YYYY.tif
(5-band composite)"] + + %% ===== STAGE 80: KPI ===== + Stage80["Stage 80: R
80_calculate_kpis.R"] + Out80a["📦 field_analysis_week{WW}.xlsx"] + Out80b["📦 kpi_summary_tables_week{WW}.rds"] + + %% ===== STAGE 90: REPORT ===== + Stage90["Stage 90: R/RMarkdown
90_CI_report_with_kpis_simple.Rmd"] + Out90["📦 SmartCane_Report_week{WW}_{YYYY}.docx
(FINAL OUTPUT)"] + + %% ===== CONNECTIONS: INPUTS TO STAGE 00 ===== + API --> Stage00 + GeoJSON --> Stage00 + + %% ===== STAGE 00 → 10 OR 20 ===== + Stage00 --> Out00 + Out00 --> Stage10 + Out00 --> Stage20 + + %% ===== STAGE 10 → 20 ===== + Stage10 --> Out10 + Out10 --> Stage20 + + %% ===== STAGE 20 → 21, 30, 40 ===== + GeoJSON --> Stage20 + Stage20 --> Out20a + Stage20 --> Out20b + Out20a --> Stage21 + Out20a --> Stage30 + Out00 --> Stage40 + + %% ===== STAGE 21 → 22, 31 ===== + Stage21 --> Out21 + Out21 --> Stage22 + Out21 --> Stage31 + + %% ===== STAGE 22 → 23 ===== + Stage22 --> Out22 + Out22 --> Stage23 + + %% ===== STAGE 23 → 80 & FEEDBACK ===== + Stage23 --> Out23 + Out23 -.->|"Feeds back
(Season context)"| Stage80 + + %% ===== STAGE 30 → 80 ===== + Stage30 --> Out30 + Out30 --> Stage80 + + %% ===== STAGE 31 (PARALLEL) ===== + Stage31 --> Out31 + + %% ===== STAGE 40 → 80, 90 ===== + Stage40 --> Out40 + Out40 --> Stage80 + Out40 --> Stage90 + + %% ===== STAGE 80 → 90 ===== + Stage80 --> Out80a + Stage80 --> Out80b + Out80a --> Stage90 + Out80b --> Stage90 + + %% ===== STAGE 90 FINAL ===== + Stage90 --> Out90 + + %% ===== ADDITIONAL INPUTS ===== + HarvestIn --> Stage30 + HarvestIn --> Stage80 + GeoJSON --> Stage30 + GeoJSON --> Stage40 + GeoJSON --> Stage80 + + %% ===== STYLING ===== + classDef input fill:#e3f2fd,stroke:#1976d2,stroke-width:2px + classDef pyStage fill:#fff3e0,stroke:#f57c00,stroke-width:2px + classDef rStage fill:#f3e5f5,stroke:#7b1fa2,stroke-width:2px + classDef output fill:#e8f5e9,stroke:#388e3c,stroke-width:2px + classDef finalOutput fill:#ffebee,stroke:#c62828,stroke-width:3px + + class API,GeoJSON,HarvestIn input + class Stage00,Stage22,Stage23,Stage31 pyStage + class Stage10,Stage20,Stage21,Stage30,Stage40,Stage80,Stage90 rStage + class Out00,Out10,Out20a,Out20b,Out21,Out22,Out30,Out31,Out40,Out80a,Out80b output + class Out23,Out90 finalOutput +``` + +--- + +## Detailed Stage Descriptions + +``` +Stage 00: PYTHON - Download Satellite Data + └─ 00_download_8band_pu_optimized.py + INPUT: Planet API credentials, field boundaries (pivot.geojson), date range + OUTPUT: laravel_app/storage/app/{project}/merged_tif_8b/{YYYY-MM-DD}.tif (4-band uint16) + RUN FREQUENCY: Daily or as-needed + NOTES: 8-band includes UDM cloud masking, optimized for PU cost + +Stage 10: R - (Optional) Create Master Grid & Split TIFFs into Tiles + └─ 10_create_master_grid_and_split_tiffs.R + INPUT: Daily GeoTIFFs from merged_tif_8b/ + OUTPUT: laravel_app/storage/app/{project}/daily_tiles_split/5x5/{YYYY-MM-DD}/*.tif + RUN FREQUENCY: Optional - only if tile-based processing desired + NOTES: Creates 25 tiles per day for memory-efficient processing; 5x5 grid 
hardcoded + +Stage 20: R - Extract Canopy Index (CI) from Daily Imagery + └─ 20_ci_extraction.R + INPUT: Daily GeoTIFFs (merged_tif_8b/ or daily_tiles_split/) + Field boundaries (pivot.geojson) + Data source parameter (merged_tif_8b, merged_tif, merged_final_tif) + OUTPUT: RDS files: + - laravel_app/storage/app/{project}/Data/extracted_ci/daily_vals/extracted_{YYYY-MM-DD}_{suffix}.rds + - laravel_app/storage/app/{project}/Data/extracted_ci/cumulative_vals/combined_CI_data.rds (wide format) + RUN FREQUENCY: Daily or on-demand + COMMAND: Rscript 20_ci_extraction.R [end_date] [offset] [project_dir] [data_source] + EXAMPLE: Rscript 20_ci_extraction.R 2026-01-02 7 angata merged_tif_8b + NOTES: Auto-detects tiles if daily_tiles_split/ exists; outputs cumulative CI (fields × dates) + +Stage 21: R - Convert CI RDS to CSV for Python Harvest Detection + └─ 21_convert_ci_rds_to_csv.R + INPUT: combined_CI_data.rds (from Stage 20) + OUTPUT: laravel_app/storage/app/{project}/Data/extracted_ci/ci_data_for_python/ci_data_for_python.csv + RUN FREQUENCY: After Stage 20 + COMMAND: Rscript 21_convert_ci_rds_to_csv.R [project_dir] + EXAMPLE: Rscript 21_convert_ci_rds_to_csv.R angata + NOTES: Converts wide RDS (fields × dates) to long CSV; interpolates missing dates; adds DOY column + +Stage 22: PYTHON - Baseline Harvest Prediction (LSTM Model 307) + └─ 22_harvest_baseline_prediction.py + INPUT: ci_data_for_python.csv (complete historical CI data) + OUTPUT: laravel_app/storage/app/{project}/Data/HarvestData/harvest_production_export.xlsx + RUN FREQUENCY: ONCE - establishes ground truth baseline for all fields + COMMAND: python 22_harvest_baseline_prediction.py [project_name] + EXAMPLE: python 22_harvest_baseline_prediction.py angata + NOTES: Two-step detection (Phase 1: growing window, Phase 2: ±40 day argmax refinement) + Tuned parameters: threshold=0.3, consecutive_days=2 + Uses LSTM Model 307 dual output heads (imminent + detected) + +Stage 23: PYTHON - Convert Harvest Format to 
Standard Structure + └─ 23_convert_harvest_format.py + INPUT: harvest_production_export.xlsx (from Stage 22) + CI data date range (determines season_start for first season) + OUTPUT: laravel_app/storage/app/{project}/Data/harvest.xlsx (standard format) + RUN FREQUENCY: After Stage 22 + COMMAND: python 23_convert_harvest_format.py [project_name] + EXAMPLE: python 23_convert_harvest_format.py angata + NOTES: Converts to standard harvest.xlsx format with columns: + field, sub_field, year, season, season_start, season_end, age, sub_area, tonnage_ha + Season format: "Data{year} : {field}" + Includes completed seasons (season_end filled) plus one open season (season_end empty) when more than 30 days of CI data follow a field's last harvest + +Stage 30: R - Growth Model Interpolation (Smooth CI Time Series) + └─ 30_interpolate_growth_model.R + INPUT: combined_CI_data.rds (from Stage 20) + harvest.xlsx (optional, for seasonal context) + OUTPUT: laravel_app/storage/app/{project}/Data/extracted_ci/cumulative_vals/ + All_pivots_Cumulative_CI_quadrant_year_v2.rds + RUN FREQUENCY: Weekly or after CI extraction updates + COMMAND: Rscript 30_interpolate_growth_model.R [project_dir] + EXAMPLE: Rscript 30_interpolate_growth_model.R angata + NOTES: Linear interpolation across gaps; calculates daily change and cumulative CI + Outputs long-format data (Date, DOY, field, value, season, etc.) 
+ +Stage 31: PYTHON - Weekly Harvest Monitoring (Real-Time Alerts) + └─ 31_harvest_imminent_weekly.py + INPUT: ci_data_for_python.csv (recent CI data, last ~300 days) + harvest_production_export.xlsx (optional baseline reference) + OUTPUT: laravel_app/storage/app/{project}/Data/HarvestData/harvest_imminent_weekly.csv + RUN FREQUENCY: Weekly or daily for operational alerts + COMMAND: python 31_harvest_imminent_weekly.py [project_name] + EXAMPLE: python 31_harvest_imminent_weekly.py angata + NOTES: Single-run inference on recent data; outputs probabilities (imminent_prob, detected_prob) + Used for real-time decision support; compared against baseline from Stage 22 + +Stage 40: R - Create Weekly 5-Band Mosaics + └─ 40_mosaic_creation.R + INPUT: Daily GeoTIFFs (merged_tif_8b/ or daily_tiles_split/) + Field boundaries (pivot.geojson) + OUTPUT: laravel_app/storage/app/{project}/weekly_mosaic/week_{WW}_{YYYY}.tif + RUN FREQUENCY: Weekly + COMMAND: Rscript 40_mosaic_creation.R [end_date] [offset] [project_dir] + EXAMPLE: Rscript 40_mosaic_creation.R 2026-01-14 7 angata + NOTES: Composites daily images using MAX function; 5 bands (R, G, B, NIR, CI) + Automatically selects images with acceptable cloud coverage + Output uses ISO week numbering (week_WW_YYYY) + +Stage 80: R - Calculate KPIs & Per-Field Analysis + └─ 80_calculate_kpis.R + INPUT: Weekly mosaic (from Stage 40) + Growth model data (from Stage 30) + Field boundaries (pivot.geojson) + Harvest data (harvest.xlsx) + OUTPUT: laravel_app/storage/app/{project}/reports/ + - {project}_field_analysis_week{WW}.xlsx + - {project}_kpi_summary_tables_week{WW}.rds + RUN FREQUENCY: Weekly + COMMAND: Rscript 80_calculate_kpis.R [end_date] [project_dir] [offset_days] + EXAMPLE: Rscript 80_calculate_kpis.R 2026-01-14 angata 7 + NOTES: Parallel processing for 1000+ fields; calculates: + - Per-field uniformity (CV), phase assignment, growth trends + - Status triggers (germination, rapid growth, disease, harvest imminence) + - 
Farm-level KPI metrics (6 high-level indicators) + TEST_MODE=TRUE uses only recent weeks for development + +Stage 90: R (RMarkdown) - Generate Executive Report (Word Document) + └─ 90_CI_report_with_kpis_simple.Rmd + INPUT: Weekly mosaic (from Stage 40) + KPI summary data (from Stage 80) + Field analysis (from Stage 80) + Field boundaries & harvest data (for context) + OUTPUT: laravel_app/storage/app/{project}/reports/ + SmartCane_Report_week{WW}_{YYYY}.docx (PRIMARY OUTPUT) + SmartCane_Report_week{WW}_{YYYY}.html (optional) + RUN FREQUENCY: Weekly + RENDERING: R/RMarkdown with officer + flextable packages + NOTES: Executive summary with KPI overview, phase distribution, status triggers + Field-by-field detail pages with CI metrics and interpretation guides + Automatic unit conversion (hectares ↔ acres) +``` + +--- + +## Data Storage & Persistence + +All data persists to the file system. No database writes occur during analysis—reads only for metadata. + +``` +laravel_app/storage/app/{project}/ +├── Data/ +│ ├── pivot.geojson # Field boundaries (read-only input) +│ ├── harvest.xlsx # Season dates & yield (standard format from Stage 23) +│ ├── vrt/ # Virtual raster files (daily VRTs from Stage 20) +│ │ └── YYYY-MM-DD.vrt +│ ├── extracted_ci/ +│ │ ├── ci_data_for_python/ +│ │ │ └── ci_data_for_python.csv # CSV for Python (from Stage 21) +│ │ ├── daily_vals/ +│ │ │ └── extracted_YYYY-MM-DD_{suffix}.rds # Daily field CI stats (from Stage 20) +│ │ └── cumulative_vals/ +│ │ ├── combined_CI_data.rds # Cumulative CI, wide format (from Stage 20) +│ │ └── All_pivots_Cumulative_CI_quadrant_year_v2.rds # Interpolated daily (from Stage 30) +│ └── HarvestData/ +│ ├── harvest_production_export.xlsx # Baseline harvest predictions (from Stage 22) +│ └── harvest_imminent_weekly.csv # Weekly monitoring output (from Stage 31) +│ +├── merged_tif_8b/ # Raw 4-band satellite imagery (Stage 00 output) +│ └── YYYY-MM-DD.tif # 4 bands: R, G, B, NIR (uint16 with UDM cloud masking) +│ +├── 
daily_tiles_split/ # (Optional) Tile-based processing (Stage 10 output) +│ ├── 5x5/ +│ │ ├── tiling_config.json # Metadata about tiling parameters +│ │ └── YYYY-MM-DD/ # Date-specific folder +│ │ └── YYYY-MM-DD_{00-24}.tif # 25 tiles per day +│ +├── weekly_mosaic/ # Weekly composite mosaics (Stage 40 output) +│ └── week_WW_YYYY.tif # 5 bands: R, G, B, NIR, CI (composite) +│ +└── reports/ # Analysis outputs & reports (Stage 80, 90 outputs) + ├── SmartCane_Report_week{WW}_{YYYY}.docx # FINAL REPORT (Stage 90) + ├── SmartCane_Report_week{WW}_{YYYY}.html # Alternative format + ├── {project}_field_analysis_week{WW}.xlsx # Field-by-field data (Stage 80) + ├── {project}_kpi_summary_tables_week{WW}.rds # Summary RDS (Stage 80) + └── kpis/ + └── week_WW_YYYY/ # Week-specific KPI folder +``` + +--- + +## Key File Formats + +| Format | Stage | Purpose | Example | +|--------|-------|---------|---------| +| `.tif` (GeoTIFF) | 00, 10, 40 | Geospatial raster imagery | `2026-01-14.tif` (4-band), `week_02_2026.tif` (5-band) | +| `.vrt` (Virtual Raster) | 20 | Virtual pointer to TIFFs | `2026-01-14.vrt` | +| `.rds` (R Binary) | 20, 21, 30, 80 | R serialized data objects | `combined_CI_data.rds`, `All_pivots_Cumulative_CI_quadrant_year_v2.rds` | +| `.csv` (Comma-Separated) | 21, 31 | Tabular data for Python | `ci_data_for_python.csv`, `harvest_imminent_weekly.csv` | +| `.xlsx` (Excel) | 22, 23, 80 | Tabular reports & harvest data | `harvest.xlsx`, `harvest_production_export.xlsx`, field analysis | +| `.docx` (Word) | 90 | Executive report (final output) | `SmartCane_Report_week02_2026.docx` | +| `.json` | 10 | Tiling metadata | `tiling_config.json` | +| `.geojson` | Input | Field boundaries (read-only) | `pivot.geojson` | + +--- + +## Script Dependencies & Utility Files + +``` +parameters_project.R + ├─ Loaded by: 20_ci_extraction.R, 30_interpolate_growth_model.R, + │ 40_mosaic_creation.R, 80_calculate_kpis.R, 90_CI_report_with_kpis_simple.Rmd + └─ Purpose: Initializes project config 
(paths, field boundaries, harvest data) + +harvest_date_pred_utils.py + ├─ Used by: 22_harvest_baseline_prediction.py, 23_convert_harvest_format.py, 31_harvest_imminent_weekly.py + └─ Purpose: LSTM model loading, feature extraction, two-step harvest detection + +20_ci_extraction_utils.R + ├─ Used by: 20_ci_extraction.R + └─ Purpose: CI calculation, field masking, RDS I/O, tile detection + +30_growth_model_utils.R + ├─ Used by: 30_interpolate_growth_model.R + └─ Purpose: Linear interpolation, daily metrics, seasonal grouping + +40_mosaic_creation_utils.R, 40_mosaic_creation_tile_utils.R + ├─ Used by: 40_mosaic_creation.R + └─ Purpose: Weekly composite creation, cloud assessment, raster masking + +kpi_utils.R + ├─ Used by: 80_calculate_kpis.R + └─ Purpose: Per-field statistics, phase assignment, trigger detection + +report_utils.R + ├─ Used by: 90_CI_report_with_kpis_simple.Rmd + └─ Purpose: Report building, table formatting, Word document generation +``` + +--- + +## Command-Line Execution Examples + +### Daily/Weekly Workflow + +```bash +# Stage 00: Download today's satellite data +cd python_app +python 00_download_8band_pu_optimized.py angata --cleanup + +# Stage 20: Extract CI from daily imagery (last 7 days) +cd ../r_app +Rscript 20_ci_extraction.R 2026-01-14 7 angata merged_tif_8b + +# Stage 21: Convert CI to CSV for harvest detection +Rscript 21_convert_ci_rds_to_csv.R angata + +# Stage 31: Weekly harvest monitoring (real-time alerts) +cd ../python_app +python 31_harvest_imminent_weekly.py angata + +# Back to R for mosaic and KPIs +cd ../r_app +Rscript 40_mosaic_creation.R 2026-01-14 7 angata +Rscript 80_calculate_kpis.R 2026-01-14 angata 7 + +# Stage 90: Generate report +Rscript -e "rmarkdown::render('90_CI_report_with_kpis_simple.Rmd')" +``` + +### One-Time Setup (Baseline Harvest Detection) + +```bash +# Only run ONCE to establish baseline +cd python_app +python 22_harvest_baseline_prediction.py angata + +# Convert to standard format +python 
23_convert_harvest_format.py angata +``` + +--- + +## Processing Notes + +### CI Extraction (Stage 20) +- Calculates CI = (NIR - Green) / (NIR + Green) +- Supports both 4-band and 8-band imagery with auto-detection +- Handles cloud masking via UDM band (8-band) or manual thresholding (4-band) +- Outputs cumulative RDS in wide format (fields × dates) for fast lookups + +### Growth Model (Stage 30) +- Linear interpolation across missing dates +- Maintains seasonal context for agricultural lifecycle tracking +- Outputs long-format data for trend analysis + +### Harvest Detection (Stages 22 & 31) +- **Model 307**: Unidirectional LSTM with dual output heads + - Imminent Head: Probability field will be harvestable in next 28 days + - Detected Head: Probability of immediate harvest event +- **Stage 22 (Baseline)**: Two-step detection on complete historical data + - Phase 1: Growing window expansion (real-time simulation) + - Phase 2: ±40 day refinement (argmax harvest signal) +- **Stage 31 (Weekly)**: Single-run inference on recent data (~300 days) + - Compares against baseline for anomaly detection + +### KPI Calculation (Stage 80) +- **Per-field metrics**: Uniformity (CV), phase, growth trends, 4-week trends +- **Status triggers**: Germination, rapid growth, slow growth, non-uniform, weed pressure, harvest imminence +- **Farm-level KPIs**: 6 high-level indicators for executive summary +- **Parallel processing**: ~1000+ fields processed in <5 minutes + +--- + +## Future Enhancements + +- **Real-Time Monitoring**: Daily harvest probability updates integrated into web dashboard +- **SAR Integration**: Radar satellite data (Sentinel-1) for all-weather monitoring +- **IoT Sensors**: Ground-based soil moisture and weather integration +- **Advanced Yield Models**: Enhanced harvest forecasting with satellite + ground truth +- **Automated Alerts**: WhatsApp/email dispatch of critical agricultural advice diff --git a/r_app/system_architecture/system_architecture.md 
b/r_app/system_architecture/system_architecture.md index a14e572..fd69f31 100644 --- a/r_app/system_architecture/system_architecture.md +++ b/r_app/system_architecture/system_architecture.md @@ -1,9 +1,9 @@ -# SmartCane System Architecture - R Pipeline & File-Based Processing +# SmartCane System Architecture - Python + R Pipeline & File-Based Processing ## Overview -The SmartCane system is a file-based agricultural intelligence platform that processes satellite imagery through a multi-stage R-script pipeline. Raw satellite imagery flows through sequential processing steps (CI extraction, growth model interpolation, mosaic creation, KPI analysis) with outputs persisted as GeoTIFFs, RDS files, and Excel/Word reports. +The SmartCane system is a file-based agricultural intelligence platform that processes satellite imagery through sequential Python and R scripts. Raw satellite imagery is downloaded via Planet API (Python), then flows through R processing stages (CI extraction, growth model interpolation, mosaic creation, KPI analysis) and Python harvest-detection stages, with outputs persisted as GeoTIFFs, RDS files, Excel sheets, and Word reports. Harvest monitoring is performed via ML-based harvest detection using LSTM models trained on historical CI sequences. ## Processing Pipeline Overview From fabbf3214ddef256802e620dca390c968d85f3ec Mon Sep 17 00:00:00 2001 From: Timon Date: Thu, 15 Jan 2026 14:30:54 +0100 Subject: [PATCH 04/15] Enhance harvest detection logic and testing framework - Updated `detect_mosaic_mode` function to check for grid-size subdirectories in addition to tile-named files. - Added comprehensive tests for DOY reset logic in `test_doy_logic.py`. - Implemented feature extraction tests in `test_feature_extraction.py`. - Created tests for growing window method in `test_growing_window_only.py`. - Developed a complete model inference test in `test_model_inference.py`. - Added a debug script for testing two-step refinement logic in `test_script22_debug.py`. 
--- python_app/22_harvest_baseline_prediction.py | 6 +- python_app/harvest_date_pred_utils.py | 99 ++- .../tests/test_doy_logic.py | 65 ++ .../tests/test_feature_extraction.py | 73 ++ .../tests/test_growing_window_only.py | 141 ++++ .../tests/test_model_inference.py | 123 +++ .../tests/test_script22_debug.py | 178 +++++ r_app/40_mosaic_creation.R | 30 +- r_app/40_mosaic_creation_utils.R | 107 ++- r_app/80_calculate_kpis.R | 755 +++++++++++++++--- r_app/parameters_project.R | 26 +- 11 files changed, 1416 insertions(+), 187 deletions(-) create mode 100644 python_app/harvest_detection_experiments/tests/test_doy_logic.py create mode 100644 python_app/harvest_detection_experiments/tests/test_feature_extraction.py create mode 100644 python_app/harvest_detection_experiments/tests/test_growing_window_only.py create mode 100644 python_app/harvest_detection_experiments/tests/test_model_inference.py create mode 100644 python_app/harvest_detection_experiments/tests/test_script22_debug.py diff --git a/python_app/22_harvest_baseline_prediction.py b/python_app/22_harvest_baseline_prediction.py index ffd8f14..f39dca6 100644 --- a/python_app/22_harvest_baseline_prediction.py +++ b/python_app/22_harvest_baseline_prediction.py @@ -26,7 +26,9 @@ This is your GROUND TRUTH - compare all future predictions against this baseline Usage: python 01_harvest_baseline_prediction.py [project_name] - conda activate pytorch_gpu; cd "C:\Users\timon\Resilience BV\4020 SCane ESA DEMO - Documenten\General\4020 SCDEMO Team\4020 TechnicalData\WP3\smartcane_v2\smartcane\python_app"; python 22_harvest_baseline_prediction.py angata + conda activate pytorch_gpu + cd python_app + python 22_harvest_baseline_prediction.py angata Examples: python 01_harvest_baseline_prediction.py angata @@ -108,7 +110,7 @@ def main(): # [3/4] Run model predictions with two-step detection print("\n[3/4] Running two-step harvest detection...") - print(" (Using threshold=0.3, consecutive_days=2 - tuned for Model 307 output)") + 
print(" (Using threshold=0.3, consecutive_days=2 - tuned baseline with DOY reset)") refined_results = run_two_step_refinement(ci_data, model, config, scalers, device=device, phase1_threshold=0.3, phase1_consecutive=2) diff --git a/python_app/harvest_date_pred_utils.py b/python_app/harvest_date_pred_utils.py index ebf77c8..aa4199c 100644 --- a/python_app/harvest_date_pred_utils.py +++ b/python_app/harvest_date_pred_utils.py @@ -184,18 +184,47 @@ def compute_ci_features(ci_series: pd.Series, doy_series: pd.Series = None) -> p return features.fillna(0) -def extract_features(data_df: pd.DataFrame, feature_names: List[str], ci_column: str = 'FitData') -> np.ndarray: - """Extract and return specified features as numpy array.""" +def extract_features(data_df: pd.DataFrame, feature_names: List[str], ci_column: str = 'FitData', + season_anchor_day: int = None, lookback_start: int = 0) -> np.ndarray: + """ + Extract and return specified features as numpy array. + + Args: + data_df: DataFrame with Date and CI data (may be a window after a harvest) + feature_names: List of feature names to extract + ci_column: Name of CI column + season_anchor_day: Day in FULL sequence where this season started (for DOY reset) + DOY will be recalculated as: 1, 2, 3, ... from this point + lookback_start: Starting index in original full data (for season reset calculation) + + Returns: + NumPy array of shape (len(data_df), len(feature_names)) + """ # Compute all CI features ci_series = data_df[ci_column].astype(float) - doy_series = pd.Series(range(len(data_df)), index=data_df.index) % 365 if 'DOY_normalized' in feature_names else None + + # Compute DOY (age/days since season start) - NOT day-of-year! 
+ # DOY is a continuous counter: 1, 2, 3, ..., 475 (doesn't cycle at 365) + # It only resets to 1 after a harvest is detected (new season) + doy_series = None + if 'DOY_normalized' in feature_names: + if season_anchor_day is not None and lookback_start >= season_anchor_day: + # Season was reset after harvest. Recalculate DOY as simple counter from 1 + # This is a window starting at or after harvest, so DOY should be: 1, 2, 3, ... + doy_series = pd.Series(np.arange(1, len(data_df) + 1), index=data_df.index) + elif 'DOY' in data_df.columns: + # Use DOY directly from CSV - already calculated as continuous age counter + doy_series = pd.Series(data_df['DOY'].astype(float).values, index=data_df.index) + else: + # Fallback: create continuous age counter (1, 2, 3, ...) + doy_series = pd.Series(np.arange(1, len(data_df) + 1), index=data_df.index) all_features = compute_ci_features(ci_series, doy_series) # Select requested features requested = [f for f in feature_names if f in all_features.columns] if not requested: - raise ValueError(f"No valid features found. Requested: {feature_names}") + raise ValueError(f"No valid features found. Requested: {feature_names}, Available: {all_features.columns.tolist()}") return all_features[requested].values @@ -274,42 +303,63 @@ def load_harvest_data(data_file: Path) -> pd.DataFrame: def run_phase1_growing_window(field_data, model, config, scalers, ci_column, device, threshold=0.45, consecutive_days=2): """ - Phase 1: Growing window detection with threshold crossing. - Expand window day-by-day, check last timestep's detected_prob. - When N consecutive days have prob > threshold, harvest detected. + Phase 1: Growing window detection with DOY season reset. + + For each detected harvest, reset DOY counter for the next season. + This allows the model to detect multiple consecutive harvests in multi-year data. + + Algorithm: + 1. Start with season_anchor_day = 0 (DOY 1 at day 0) + 2. Expand window: [0:1], [0:2], [0:3], ... 
until threshold crossed + 3. When harvest detected: record date, set new season_anchor = day after harvest + 4. Continue from next season start Args: - threshold (float): Probability threshold (default 0.45, tuned for Model 307) - consecutive_days (int): Required consecutive days above threshold (default 2, reduced from 3 for robustness) + threshold (float): Probability threshold (default 0.45) + consecutive_days (int): Required consecutive days above threshold (default 2) Returns list of (harvest_date, harvest_idx) tuples. """ harvest_dates = [] + season_anchor_day = 0 # DOY 1 starts at day 0 current_pos = 0 while current_pos < len(field_data): consecutive_above_threshold = 0 + min_window_size = 120 # Need at least 120 days (~4 months) for patterns to establish for window_end in range(current_pos + 1, len(field_data) + 1): window_data = field_data.iloc[current_pos:window_end].copy().reset_index(drop=True) + # Skip if window is too small (model needs long sequences for pattern learning) + if len(window_data) < min_window_size: + continue + try: - features = extract_features(window_data, config['features'], ci_column=ci_column) + # CRITICAL: Pass season_anchor_day so DOY resets after harvest + features = extract_features( + window_data, + config['features'], + ci_column=ci_column, + season_anchor_day=season_anchor_day, + lookback_start=current_pos + ) # Apply scalers + features_scaled = features.copy().astype(float) for fi, scaler in enumerate(scalers): try: - features[:, fi] = scaler.transform(features[:, fi].reshape(-1, 1)).flatten() + features_scaled[:, fi] = scaler.transform(features[:, fi].reshape(-1, 1)).flatten() except Exception: pass - # Run model + # Run model on expanding window with torch.no_grad(): - x_tensor = torch.tensor(features, dtype=torch.float32).unsqueeze(0).to(device) + x_tensor = torch.tensor(features_scaled, dtype=torch.float32).unsqueeze(0).to(device) imminent_probs, detected_probs = model(x_tensor) - # Check LAST timestep - last_prob = 
detected_probs[-1] + # Check LAST timestep only + last_prob = detected_probs[0, -1].item() if last_prob > threshold: consecutive_above_threshold += 1 @@ -318,16 +368,21 @@ def run_phase1_growing_window(field_data, model, config, scalers, ci_column, dev # Harvest detected: N consecutive days above threshold if consecutive_above_threshold >= consecutive_days: - harvest_date = field_data.iloc[current_pos + window_end - consecutive_days]['Date'] - harvest_dates.append((harvest_date, current_pos + window_end - consecutive_days)) + harvest_idx = current_pos + window_end - consecutive_days + harvest_date = field_data.iloc[harvest_idx]['Date'] + harvest_dates.append((harvest_date, harvest_idx)) - # Reset to next day after harvest - current_pos = current_pos + window_end - consecutive_days + 1 + # CRITICAL: Reset season anchor for next season + # DOY 1 starts at day after harvest + season_anchor_day = harvest_idx + 1 + current_pos = harvest_idx + 1 break - except Exception: + except Exception as e: + # Skip window on error continue else: + # No more harvests found break return harvest_dates @@ -413,7 +468,9 @@ def run_two_step_refinement(df: pd.DataFrame, model, config, scalers, device=Non device = torch.device("cuda" if torch.cuda.is_available() else "cpu") results = [] - ci_column = config['data']['ci_column'] + # CI column is 'FitData' (interpolated CI) - NOT 'value' (raw with NAs) + # 'FitData' is already gap-filled by R stage 03, ready for ML + ci_column = 'FitData' # Group by field and count total fields for progress field_groups = list(df.groupby('field')) diff --git a/python_app/harvest_detection_experiments/tests/test_doy_logic.py b/python_app/harvest_detection_experiments/tests/test_doy_logic.py new file mode 100644 index 0000000..dc4edb6 --- /dev/null +++ b/python_app/harvest_detection_experiments/tests/test_doy_logic.py @@ -0,0 +1,65 @@ +""" +Test DOY reset logic for harvest detection. +Verify that DOY resets to 1, 2, 3, ... after harvest is detected. 
+""" + +import sys +from pathlib import Path +sys.path.insert(0, str(Path.cwd())) + +import pandas as pd +import numpy as np +from harvest_date_pred_utils import extract_features, load_model_and_config +import torch + +# Load sample data +ci_data = pd.read_csv('../laravel_app/storage/app/angata/Data/extracted_ci/ci_data_for_python/ci_data_for_python.csv') +ci_data['Date'] = pd.to_datetime(ci_data['Date']) + +# Get field 779 data +field_779 = ci_data[ci_data['field'] == '779'].reset_index(drop=True) +print(f"Field 779: {len(field_779)} days of data") +print(f"Date range: {field_779['Date'].min().date()} to {field_779['Date'].max().date()}\n") + +# Load model config +model, config, scalers = load_model_and_config(Path.cwd()) + +# Test 1: First season (season_anchor_day = 0) +print("=" * 80) +print("TEST 1: First season (season_anchor_day=0, lookback_start=0)") +print("=" * 80) +window = field_779.iloc[0:20].copy().reset_index(drop=True) +features = extract_features(window, config['features'], ci_column='FitData', + season_anchor_day=0, lookback_start=0) + +# Extract DOY_normalized column (index 13 or find it) +feature_names = config['features'] +doy_idx = feature_names.index('DOY_normalized') if 'DOY_normalized' in feature_names else -1 +if doy_idx >= 0: + doy_values = (features[:, doy_idx] * 450).astype(int) # Denormalize + print(f"Window size: {len(window)} days") + print(f"DOY values: {doy_values[:10]}") + print(f"Expected: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]") + assert list(doy_values[:10]) == [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], "DOY not incrementing correctly!" 
+ print("✓ DOY incrementing correctly for first season\n") + +# Test 2: After harvest detected at day 100, next season starts +print("=" * 80) +print("TEST 2: After harvest at day 100, new season starts (season_anchor_day=101, lookback_start=101)") +print("=" * 80) +harvest_day = 100 +window = field_779.iloc[harvest_day:harvest_day+20].copy().reset_index(drop=True) +features = extract_features(window, config['features'], ci_column='FitData', + season_anchor_day=harvest_day+1, lookback_start=harvest_day+1) + +if doy_idx >= 0: + doy_values = (features[:, doy_idx] * 450).astype(int) # Denormalize + print(f"Window size: {len(window)} days (starting at day {harvest_day})") + print(f"DOY values: {doy_values[:10]}") + print(f"Expected: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10] (fresh season)") + assert list(doy_values[:10]) == [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], "DOY not reset after harvest!" + print("✓ DOY reset correctly for new season\n") + +print("=" * 80) +print("ALL TESTS PASSED! DOY logic is correct.") +print("=" * 80) diff --git a/python_app/harvest_detection_experiments/tests/test_feature_extraction.py b/python_app/harvest_detection_experiments/tests/test_feature_extraction.py new file mode 100644 index 0000000..bd75ffc --- /dev/null +++ b/python_app/harvest_detection_experiments/tests/test_feature_extraction.py @@ -0,0 +1,73 @@ +#!/usr/bin/env python3 +""" +Quick test: Verify feature extraction works +""" + +import sys +import pandas as pd +import numpy as np +from pathlib import Path + +sys.path.insert(0, str(Path(__file__).parent)) + +from harvest_date_pred_utils import extract_features, load_model_and_config + +project_name = "angata" +base_storage = Path("../laravel_app/storage/app") / project_name / "Data" +CI_DATA_FILE = base_storage / "extracted_ci" / "ci_data_for_python" / "ci_data_for_python.csv" + +print("="*80) +print("DEBUG: Feature Extraction Test") +print("="*80) + +# Load model config +print("\n[1] Loading model config...") +model, config, scalers = 
load_model_and_config(Path(".")) +print(f" Config features: {config['features']}") +print(f" Number of features: {len(config['features'])}") + +# Load CI data +print("\n[2] Loading CI data...") +ci_data = pd.read_csv(CI_DATA_FILE, dtype={'field': str}) +ci_data['Date'] = pd.to_datetime(ci_data['Date']) +print(f" Columns: {ci_data.columns.tolist()}") +print(f" Total rows: {len(ci_data)}") + +# Test on a single field +test_field = "1" +field_data = ci_data[ci_data['field'] == test_field].sort_values('Date').reset_index(drop=True) +print(f"\n[3] Testing on field {test_field}...") +print(f" Data points: {len(field_data)}") +print(f" Date range: {field_data['Date'].min().date()} to {field_data['Date'].max().date()}") +print(f" Columns in field data: {field_data.columns.tolist()}") +print(f" Sample values:") +print(field_data[['Date', 'value']].head()) + +# Test feature extraction on first 50 days +print(f"\n[4] Extracting features for first 50 days...") +try: + subset = field_data.iloc[:50].copy() + features = extract_features(subset, config['features'], ci_column='value') + print(f" ✓ Success!") + print(f" Feature shape: {features.shape}") + print(f" Expected shape: (50, {len(config['features'])})") + print(f" Feature values sample (first 5 days):") + for i in range(min(5, features.shape[0])): + print(f" Day {i}: {features[i]}") +except Exception as e: + print(f" ✗ Error: {e}") + import traceback + traceback.print_exc() + +print("\n[5] Testing on growing windows...") +try: + for window_size in [10, 20, 30, 50]: + window_data = field_data.iloc[:window_size].copy() + features = extract_features(window_data, config['features'], ci_column='value') + print(f" Window size {window_size}: shape={features.shape}, min={features.min():.4f}, max={features.max():.4f}") +except Exception as e: + print(f" ✗ Error: {e}") + import traceback + traceback.print_exc() + +print("\n✓ Feature extraction test complete") diff --git 
a/python_app/harvest_detection_experiments/tests/test_growing_window_only.py b/python_app/harvest_detection_experiments/tests/test_growing_window_only.py new file mode 100644 index 0000000..feec06e --- /dev/null +++ b/python_app/harvest_detection_experiments/tests/test_growing_window_only.py @@ -0,0 +1,141 @@ +#!/usr/bin/env python3 +""" +Test ONLY the growing window method (what production actually uses) +Never run model on full sequence - only on expanding windows + +This matches real deployment where data arrives daily +""" + +import sys +import pandas as pd +import numpy as np +import torch +from pathlib import Path + +sys.path.insert(0, str(Path(__file__).parent.parent.parent)) + +from harvest_date_pred_utils import ( + load_model_and_config, + extract_features, +) + +project_name = "angata" + +# Find root by walking up until we find laravel_app +script_dir = Path(__file__).parent +root = script_dir +while root != root.parent: # Stop at filesystem root + if (root / "laravel_app").exists(): + break + root = root.parent + +base_storage = root / "laravel_app" / "storage" / "app" / project_name / "Data" +CI_DATA_FILE = base_storage / "extracted_ci" / "ci_data_for_python" / "ci_data_for_python.csv" +MODEL_DIR = root / "python_app" + +print("="*80) +print("GROWING WINDOW METHOD ONLY (Real Production Simulation)") +print("="*80) + +# Load model +print("\n[1] Loading model...") +model, config, scalers = load_model_and_config(MODEL_DIR) +device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + +# Load CI data +print("\n[2] Loading CI data...") +ci_data = pd.read_csv(CI_DATA_FILE, dtype={'field': str}) +ci_data['Date'] = pd.to_datetime(ci_data['Date']) + +# Test on field 779 +test_field = "779" +field_data = ci_data[ci_data['field'] == test_field].sort_values('Date').reset_index(drop=True) + +print(f"\n[3] Field {test_field}: {len(field_data)} data points") +print(f" Date range: {field_data['Date'].min().date()} to {field_data['Date'].max().date()}") + +# 
Simulate growing window (real production) +print(f"\n[4] Simulating growing window (expanding daily)...") + +harvest_dates = [] +current_pos = 0 +consecutive_above = 0 +threshold = 0.3 +consecutive_days = 2 +model_runs = 0 + +while current_pos < len(field_data): + consecutive_above = 0 + found_harvest = False + + for window_end in range(current_pos + 1, len(field_data) + 1): + # Expand window: current_pos to window_end + window_data = field_data.iloc[current_pos:window_end].copy().reset_index(drop=True) + + try: + # Extract features for THIS window only + features = extract_features(window_data, config['features'], ci_column='value') + + # Normalize + features_scaled = features.copy().astype(float) + for fi, scaler in enumerate(scalers): + features_scaled[:, fi] = scaler.transform(features[:, fi].reshape(-1, 1)).flatten() + + # Run model on expanding window + with torch.no_grad(): + x_tensor = torch.tensor(features_scaled, dtype=torch.float32).unsqueeze(0).to(device) + imminent_probs, detected_probs = model(x_tensor) + + model_runs += 1 + + # Check LAST timestep only + last_prob = detected_probs[0, -1].item() + last_date = window_data.iloc[-1]['Date'].date() + + # Print every 50th window to track progress + if window_end % 50 == 0 or window_end < 10: + print(f" Window [{current_pos:3d}:{window_end:3d}] ({last_date}): prob={last_prob:.4f}", end="") + if last_prob > threshold: + print(" ✓ ABOVE", end="") + print() + + # Check threshold + if last_prob > threshold: + consecutive_above += 1 + else: + consecutive_above = 0 + + # Harvest detected + if consecutive_above >= consecutive_days: + harvest_idx = current_pos + window_end - consecutive_days + harvest_date = field_data.iloc[harvest_idx]['Date'] + harvest_dates.append((harvest_date, harvest_idx, last_prob)) + print(f"\n ✓ HARVEST DETECTED at {harvest_date.date()} (index {harvest_idx})") + print(f" {consecutive_days} consecutive days above {threshold}") + + # Jump past this harvest + current_pos = harvest_idx + 1 + 
found_harvest = True + break + + except Exception as e: + print(f" ERROR at window [{current_pos}:{window_end}]: {e}") + continue + + if not found_harvest: + break + +print(f"\n[5] Results:") +print(f" Total model runs: {model_runs}") +print(f" Harvests found: {len(harvest_dates)}") + +if harvest_dates: + print(f"\n Harvest dates:") + for date, idx, prob in harvest_dates: + print(f" {date.date()}: index {idx}, last_prob={prob:.4f}") +else: + print(f"\n No harvests detected") + +print(f"\n[6] Analysis:") +print(f" Model runs per day: {model_runs / len(field_data):.2f}x") +print(f" Expected: ~{len(field_data):.0f} runs (one per day)") diff --git a/python_app/harvest_detection_experiments/tests/test_model_inference.py b/python_app/harvest_detection_experiments/tests/test_model_inference.py new file mode 100644 index 0000000..e98d99c --- /dev/null +++ b/python_app/harvest_detection_experiments/tests/test_model_inference.py @@ -0,0 +1,123 @@ +#!/usr/bin/env python3 +""" +Complete test: Feature extraction + Model inference + Phase 1 detection +""" + +import sys +import pandas as pd +import numpy as np +import torch +from pathlib import Path + +sys.path.insert(0, str(Path(__file__).parent)) + +from harvest_date_pred_utils import ( + load_model_and_config, + extract_features, + run_phase1_growing_window +) + +project_name = "angata" +base_storage = Path("../laravel_app/storage/app") / project_name / "Data" +CI_DATA_FILE = base_storage / "extracted_ci" / "ci_data_for_python" / "ci_data_for_python.csv" + +print("="*80) +print("DEBUG: Model Inference + Phase 1 Detection") +print("="*80) + +# Load model +print("\n[1] Loading model...") +model, config, scalers = load_model_and_config(Path(".")) +device = torch.device("cuda" if torch.cuda.is_available() else "cpu") +print(f" Device: {device}") +print(f" Scalers type: {type(scalers)}") +print(f" Number of scalers: {len(scalers) if isinstance(scalers, list) else 'N/A (dict/object)'}") + +# Load CI data +print("\n[2] Loading CI 
data...") +ci_data = pd.read_csv(CI_DATA_FILE, dtype={'field': str}) +ci_data['Date'] = pd.to_datetime(ci_data['Date']) + +# Test on a known field (field 1) +test_field = "1" +field_data = ci_data[ci_data['field'] == test_field].sort_values('Date').reset_index(drop=True) +print(f"\n[3] Testing on field {test_field}...") +print(f" Data points: {len(field_data)}") + +# Test with first 100 days +subset_100 = field_data.iloc[:100].copy().reset_index(drop=True) +print(f"\n[4] Testing model inference on first 100 days...") + +try: + features = extract_features(subset_100, config['features'], ci_column='value') + print(f" Features shape: {features.shape}") + print(f" Features dtype: {features.dtype}") + + # Apply scalers + features_scaled = features.copy().astype(float) + print(f" Applying {len(scalers)} scalers...") + + for fi, scaler in enumerate(scalers): + try: + col_data = features[:, fi].reshape(-1, 1) + scaled_col = scaler.transform(col_data) + features_scaled[:, fi] = scaled_col.flatten() + if fi < 3: # Show first 3 scalers + print(f" Scaler {fi}: transformed {features[0, fi]:.4f} → {features_scaled[0, fi]:.4f}") + except Exception as e: + print(f" ERROR in scaler {fi}: {e}") + raise + + # Run model + print(f"\n Running model inference...") + x_tensor = torch.tensor(features_scaled, dtype=torch.float32).unsqueeze(0).to(device) + print(f" Tensor shape: {x_tensor.shape}") + + with torch.no_grad(): + imminent_probs, detected_probs = model(x_tensor) + + print(f" Imminent probs shape: {imminent_probs.shape}") + print(f" Detected probs shape: {detected_probs.shape}") + print(f" Detected probs dtype: {detected_probs.dtype}") + + # Analyze detected probs + detected_np = detected_probs[0].cpu().numpy() # Get first (only) batch + print(f"\n Detected head analysis:") + print(f" Min: {detected_np.min():.4f}") + print(f" Max: {detected_np.max():.4f}") + print(f" Mean: {detected_np.mean():.4f}") + print(f" Median: {np.median(detected_np):.4f}") + print(f" > 0.1: {(detected_np > 
0.1).sum()} days") + print(f" > 0.3: {(detected_np > 0.3).sum()} days") + print(f" > 0.5: {(detected_np > 0.5).sum()} days") + + # Show top 5 peaks + top_indices = np.argsort(detected_np)[-5:][::-1] + print(f"\n Top 5 detected peaks:") + for idx in top_indices: + date = subset_100.iloc[idx]['Date'].date() + prob = detected_np[idx] + print(f" Day {idx} ({date}): {prob:.4f}") + +except Exception as e: + print(f" ERROR: {e}") + import traceback + traceback.print_exc() + sys.exit(1) + +# Test Phase 1 growing window +print(f"\n[5] Testing Phase 1 growing window (threshold=0.3, consecutive=2)...") +try: + phase1_results = run_phase1_growing_window( + subset_100, model, config, scalers, 'value', device, + threshold=0.3, consecutive_days=2 + ) + print(f" ✓ Phase 1 found {len(phase1_results)} harvest(s):") + for harvest_date, harvest_idx in phase1_results: + print(f" {harvest_date.date()}: index {harvest_idx}") +except Exception as e: + print(f" ERROR: {e}") + import traceback + traceback.print_exc() + +print("\n✓ Model inference test complete") diff --git a/python_app/harvest_detection_experiments/tests/test_script22_debug.py b/python_app/harvest_detection_experiments/tests/test_script22_debug.py new file mode 100644 index 0000000..85186b1 --- /dev/null +++ b/python_app/harvest_detection_experiments/tests/test_script22_debug.py @@ -0,0 +1,178 @@ +#!/usr/bin/env python3 +""" +Debug script: Test if script 22 logic is working +Tests the two-step refinement on a single known field +""" + +import sys +import time +import pandas as pd +import numpy as np +import torch +from pathlib import Path + +sys.path.insert(0, str(Path(__file__).parent.parent.parent)) + +from harvest_date_pred_utils import ( + load_model_and_config, + extract_features, + run_phase1_growing_window, +) + +project_name = "angata" + +# Find the workspace root by looking for laravel_app folder +script_dir = Path(__file__).parent +root = script_dir +while root != root.parent: + if (root / "laravel_app").exists(): 
+ break + root = root.parent + +base_storage = root / "laravel_app" / "storage" / "app" / project_name / "Data" +CI_DATA_FILE = base_storage / "extracted_ci" / "ci_data_for_python" / "ci_data_for_python.csv" +MODEL_DIR = root / "python_app" + +print("="*80) +print("DEBUG: Script 22 Two-Step Refinement Logic") +print("="*80) + +# Load model +print("\n[1] Loading model...") +model, config, scalers = load_model_and_config(MODEL_DIR) +device = torch.device("cuda" if torch.cuda.is_available() else "cpu") +print(f" Device: {device}") +print(f" Model features: {config['features']}") + +# Load CI data +print("\n[2] Loading CI data...") +ci_data = pd.read_csv(CI_DATA_FILE, dtype={'field': str}) +ci_data['Date'] = pd.to_datetime(ci_data['Date']) +print(f" Total rows: {len(ci_data)}") +print(f" Fields: {ci_data['field'].nunique()}") +print(f" Date range: {ci_data['Date'].min().date()} to {ci_data['Date'].max().date()}") + +# Test on a known field (field 779 from our previous tests) +test_field = "779" +field_data = ci_data[ci_data['field'] == test_field].sort_values('Date').reset_index(drop=True) + +print(f"\n[3] Testing on field {test_field}...") +print(f" Data points: {len(field_data)}") +print(f" Date range: {field_data['Date'].min().date()} to {field_data['Date'].max().date()}") + +if len(field_data) == 0: + print(f" ERROR: No data for field {test_field}") + sys.exit(1) + +# Extract features +print(f"\n[4] Extracting features for field {test_field}...") +try: + features = extract_features(field_data.reset_index(drop=True), config['features'], ci_column='value') + print(f" Features shape: {features.shape}") + print(f" Features dtype: {features.dtype}") +except Exception as e: + print(f" ERROR: Could not extract features: {e}") + sys.exit(1) + +# Normalize and run model +print(f"\n[5] Running Phase 1 GROWING WINDOW method (threshold=0.5, consecutive=3)...") +print(f" This simulates real production: expanding windows, checking each day") +print(f" Expected: ~477 model runs 
for 477 days (SLOW)") + +import time +start_time = time.time() + +# Add instrumentation to see how many model runs are happening +original_run = run_phase1_growing_window + +def instrumented_run(*args, **kwargs): + import sys + from harvest_date_pred_utils import extract_features + + field_data = args[0] + model = args[1] + config = args[2] + scalers = args[3] + ci_column = args[4] + device = args[5] + threshold = kwargs.get('threshold', 0.3) + consecutive_days = kwargs.get('consecutive_days', 2) + + harvest_dates = [] + current_pos = 0 + model_runs = 0 + + print(f" Starting growing window loop...") + + while current_pos < len(field_data): + consecutive_above_threshold = 0 + loop_start = current_pos + + for window_end in range(current_pos + 1, len(field_data) + 1): + window_data = field_data.iloc[current_pos:window_end].copy().reset_index(drop=True) + + try: + features = extract_features(window_data, config['features'], ci_column=ci_column) + + features_scaled = features.copy().astype(float) + for fi, scaler in enumerate(scalers): + try: + features_scaled[:, fi] = scaler.transform(features[:, fi].reshape(-1, 1)).flatten() + except Exception as e: + raise ValueError(f"Scaler {fi} failed: {e}") + + import torch + with torch.no_grad(): + x_tensor = torch.tensor(features_scaled, dtype=torch.float32).unsqueeze(0).to(device) + imminent_probs, detected_probs = model(x_tensor) + + model_runs += 1 + last_prob = detected_probs[0, -1].item() + + if last_prob > threshold: + consecutive_above_threshold += 1 + else: + consecutive_above_threshold = 0 + + if consecutive_above_threshold >= consecutive_days: + harvest_date = field_data.iloc[current_pos + window_end - consecutive_days]['Date'] + harvest_dates.append((harvest_date, current_pos + window_end - consecutive_days)) + current_pos = current_pos + window_end - consecutive_days + 1 + break + + except Exception as e: + pass + else: + break + + print(f" Model runs performed: {model_runs}") + return harvest_dates + 
+phase1_results = instrumented_run( + field_data.reset_index(drop=True), + model, config, scalers, 'value', device, + threshold=0.5, + consecutive_days=3 +) + +elapsed = time.time() - start_time +print(f"\n Time elapsed: {elapsed:.2f}s") + +if phase1_results: + print(f" ✓ Phase 1 detected {len(phase1_results)} harvest(s):") + + # Get probabilities for display by running model once on full field + with torch.no_grad(): + X = features.reshape(1, -1, len(config['features'])) + X_normalized = np.zeros_like(X) + for fi, scaler in enumerate(scalers): + X_normalized[0, :, fi] = scaler.transform(X[0, :, fi].reshape(-1, 1)).flatten() + X_tensor = torch.from_numpy(X_normalized).float().to(device) + _, detected_probs = model(X_tensor) + detected_np = detected_probs[0].cpu().numpy() + + for harvest_date, harvest_idx in phase1_results: + prob = detected_np[harvest_idx] if harvest_idx < len(detected_np) else 0.0 + print(f" {harvest_date.date()}: index {harvest_idx}, probability={prob:.4f}") +else: + print(f" ✗ Phase 1: No harvest detected") diff --git a/r_app/40_mosaic_creation.R b/r_app/40_mosaic_creation.R index 7b7ab23..f1d6b91 100644 --- a/r_app/40_mosaic_creation.R +++ b/r_app/40_mosaic_creation.R @@ -14,8 +14,8 @@ # - tile_size: Tile size in km (default: 5, only used if use_tiles=TRUE) # # Examples: -# Rscript 04_mosaic_creation.R 2025-12-21 7 angata -# Rscript 04_mosaic_creation.R 2025-12-21 7 angata week_51.tif TRUE 5 [tile-based] + +# & "C:\Program Files\R\R-4.4.3\bin\x64\Rscript.exe" r_app/40_mosaic_creation.R 2026-01-12 7 angata # # 1. 
Load required packages @@ -142,15 +142,35 @@ main <- function() { if (use_tile_mosaic) { # TILE-BASED APPROACH: Create per-tile weekly MAX mosaics # This is used for projects like Angata with large ROIs requiring spatial partitioning + # Input data comes from merged_final_tif/{grid_size}/{DATE}/{DATE}_XX.tif (5-band tiles from script 20) tryCatch({ safe_log("Starting per-tile mosaic creation (tile-based approach)...") - # Set output directory for per-tile mosaics - tile_output_base <- file.path(laravel_storage, "weekly_tile_max") + # Detect grid size from merged_final_tif folder structure + # Expected: merged_final_tif/5x5/ or merged_final_tif/10x10/ etc. + merged_final_base <- file.path(laravel_storage, "merged_final_tif") + grid_subfolders <- list.dirs(merged_final_base, full.names = FALSE, recursive = FALSE) + # Look for grid size patterns like "5x5", "10x10", "20x20" + grid_patterns <- grep("^\\d+x\\d+$", grid_subfolders, value = TRUE) + + if (length(grid_patterns) == 0) { + stop("No grid size subfolder found in merged_final_tif/ (expected: 5x5, 10x10, etc.)") + } + + grid_size <- grid_patterns[1] # Use first grid size found + safe_log(paste("Detected grid size:", grid_size)) + + # Point to the grid-specific merged_final_tif directory + merged_final_with_grid <- file.path(merged_final_base, grid_size) + + # Set output directory for per-tile mosaics, organized by grid size + # Output: weekly_tile_max/{grid_size}/week_WW_YYYY_TT.tif + tile_output_base <- file.path(laravel_storage, "weekly_tile_max", grid_size) + dir.create(tile_output_base, recursive = TRUE, showWarnings = FALSE) created_tile_files <- create_weekly_mosaic_from_tiles( dates = dates, - merged_final_dir = merged_final, + merged_final_dir = merged_final_with_grid, tile_output_dir = tile_output_base, field_boundaries = field_boundaries ) diff --git a/r_app/40_mosaic_creation_utils.R b/r_app/40_mosaic_creation_utils.R index 60e2f26..dc7b778 100644 --- a/r_app/40_mosaic_creation_utils.R +++ 
b/r_app/40_mosaic_creation_utils.R @@ -354,18 +354,17 @@ create_mosaic <- function(tif_files, cloud_coverage_stats, field_boundaries = NU } # Get filenames of best-coverage images - # Match by extracting tile ID from both cloud stats and TIF filenames + # Match by full filename from cloud stats to TIF files rasters_to_use <- character() for (idx in best_coverage) { - # Extract tile ID from cloud_coverage_stats filename (e.g., "tile_18" → 18) + # Get the full filename from cloud coverage stats cc_filename <- cloud_coverage_stats$filename[idx] - cc_tile_id <- gsub(".*_([0-9]+).*", "\\1", cc_filename) - # Find matching TIF file by matching tile ID + # Find matching TIF file by full filename matching_tif <- NULL for (tif_file in tif_files) { - tif_tile_id <- gsub(".*_([0-9]+)\\.tif", "\\1", basename(tif_file)) - if (tif_tile_id == cc_tile_id) { + tif_basename <- basename(tif_file) + if (tif_basename == cc_filename) { matching_tif <- tif_file break } @@ -373,6 +372,8 @@ create_mosaic <- function(tif_files, cloud_coverage_stats, field_boundaries = NU if (!is.null(matching_tif)) { rasters_to_use <- c(rasters_to_use, matching_tif) + } else { + safe_log(paste("Warning: Could not find TIF file matching cloud stats entry:", cc_filename), "WARNING") } } @@ -420,42 +421,60 @@ create_mosaic <- function(tif_files, cloud_coverage_stats, field_boundaries = NU mosaic <- tryCatch({ safe_log(paste("Creating max composite from", length(all_rasters), "images to fill clouds")) - # Get extent from field boundaries if available, otherwise use raster intersection - if (!is.null(field_boundaries)) { - crop_extent <- terra::ext(field_boundaries) - safe_log("Using field boundaries extent for consistent area across all dates") + # Check if all rasters have identical grids (extent and resolution) + # This is likely for per-tile mosaics from the same tiling scheme + reference_raster <- all_rasters[[1]] + ref_ext <- terra::ext(reference_raster) + ref_res <- terra::res(reference_raster) + + 
grids_match <- all(sapply(all_rasters[-1], function(r) { + isTRUE(all.equal(terra::ext(r), ref_ext, tolerance = 1e-6)) && + isTRUE(all.equal(terra::res(r), ref_res, tolerance = 1e-6)) + })) + + if (grids_match) { + # All rasters have matching grids - no cropping/resampling needed! + safe_log("All rasters have identical grids - stacking directly for max composite") + raster_collection <- terra::sprc(all_rasters) + max_mosaic <- terra::mosaic(raster_collection, fun = "max") } else { - # Fallback: use intersection of all raster extents - crop_extent <- terra::ext(all_rasters[[1]]) - for (i in 2:length(all_rasters)) { - crop_extent <- terra::intersect(crop_extent, terra::ext(all_rasters[[i]])) + # Grids don't match - need to crop and resample + safe_log("Rasters have different grids - cropping and resampling to common extent") + + # Get extent from field boundaries if available, otherwise use raster union + if (!is.null(field_boundaries)) { + crop_extent <- terra::ext(field_boundaries) + safe_log("Using field boundaries extent for consistent area across all dates") + } else { + # Use union of all extents (covers all data) + crop_extent <- terra::ext(all_rasters[[1]]) + for (i in 2:length(all_rasters)) { + crop_extent <- terra::union(crop_extent, terra::ext(all_rasters[[i]])) + } + safe_log("Using raster union extent") } - safe_log("Using raster intersection extent") + + # Crop all rasters to common extent + cropped_rasters <- lapply(all_rasters, function(r) { + terra::crop(r, crop_extent) + }) + + # Resample all cropped rasters to match the first one's grid + reference_grid <- cropped_rasters[[1]] + + aligned_rasters <- lapply(cropped_rasters, function(r) { + if (isTRUE(all.equal(terra::ext(r), terra::ext(reference_grid), tolerance = 1e-6)) && + isTRUE(all.equal(terra::res(r), terra::res(reference_grid), tolerance = 1e-6))) { + return(r) # Already aligned + } + terra::resample(r, reference_grid, method = "near") + }) + + # Create max composite using mosaic on aligned 
rasters + raster_collection <- terra::sprc(aligned_rasters) + max_mosaic <- terra::mosaic(raster_collection, fun = "max") } - # Crop all rasters to common extent - cropped_rasters <- lapply(all_rasters, function(r) { - terra::crop(r, crop_extent) - }) - - # Resample all cropped rasters to match the first one's grid - # This handles pixel grid misalignment from Python's dynamic extent adjustment - reference_grid <- cropped_rasters[[1]] - safe_log("Resampling rasters to common grid for stacking") - - aligned_rasters <- lapply(cropped_rasters, function(r) { - if (identical(terra::ext(r), terra::ext(reference_grid)) && - identical(terra::res(r), terra::res(reference_grid))) { - return(r) # Already aligned - } - terra::resample(r, reference_grid, method = "near") - }) - - # Create max composite using mosaic on aligned rasters - # Resample ensures all rasters have matching grids (no resolution mismatch) - raster_collection <- terra::sprc(aligned_rasters) - max_mosaic <- terra::mosaic(raster_collection, fun = "max") - max_mosaic }, error = function(e) { safe_log(paste("Max composite creation failed:", e$message), "WARNING") @@ -686,6 +705,16 @@ create_weekly_mosaic_from_tiles <- function(dates, merged_final_dir, tile_output next } + # DEBUG: Check mosaic content before saving + safe_log(paste(" DEBUG: Mosaic tile", tile_id, "dimensions:", nrow(tile_mosaic), "x", ncol(tile_mosaic))) + safe_log(paste(" DEBUG: Mosaic tile", tile_id, "bands:", terra::nlyr(tile_mosaic))) + + # Check first band values + band1 <- tile_mosaic[[1]] + band1_min <- terra::global(band1, fun = "min", na.rm = TRUE)$min + band1_max <- terra::global(band1, fun = "max", na.rm = TRUE)$max + safe_log(paste(" DEBUG: Band 1 MIN=", round(band1_min, 2), "MAX=", round(band1_max, 2))) + # Step 2c: Save this tile's weekly MAX mosaic # Filename format: week_WW_YYYY_TT.tif (e.g., week_02_2026_01.tif for week 2, 2026, tile 1) tile_filename <- paste0("week_", sprintf("%02d", dates$week), "_", dates$year, "_", @@ 
-763,7 +792,7 @@ count_cloud_coverage_for_tile <- function(tile_files, field_boundaries = NULL) { missing_pct <- round(100 - ((total_notna / total_pixels) * 100)) aggregated_results[[idx]] <- data.frame( - filename = paste0("tile_", sprintf("%02d", as.integer(gsub(".*_([0-9]+)\\.tif", "\\1", basename(tile_file))))), + filename = basename(tile_file), # Keep full filename: 2026-01-07_03.tif notNA = total_notna, total_pixels = total_pixels, missing_pixels_percentage = missing_pct, diff --git a/r_app/80_calculate_kpis.R b/r_app/80_calculate_kpis.R index 6977491..59a9195 100644 --- a/r_app/80_calculate_kpis.R +++ b/r_app/80_calculate_kpis.R @@ -20,6 +20,8 @@ # Option 2: Rscript 80_calculate_kpis.R 2026-01-14 angata 7 # Arguments: [end_date] [project_dir] [offset_days] # +# & "C:\Program Files\R\R-4.4.3\bin\x64\Rscript.exe" r_app/80_calculate_kpis.R 2026-01-12 angata 7 +# # Usage in run_full_pipeline.R: # source("r_app/80_calculate_kpis.R") # main() @@ -122,7 +124,7 @@ STATUS_TRIGGERS <- data.frame( # TILE-AWARE HELPER FUNCTIONS # ============================================================================ -get_tile_ids_for_field <- function(field_geom, tile_grid) { +get_tile_ids_for_field <- function(field_geom, tile_grid, field_id = NULL) { if (inherits(field_geom, "sf")) { field_bbox <- sf::st_bbox(field_geom) field_xmin <- field_bbox["xmin"] @@ -139,6 +141,17 @@ get_tile_ids_for_field <- function(field_geom, tile_grid) { stop("field_geom must be sf or terra::vect object") } + # DEBUG: Print bbox info for first field + if (!is.null(field_id) && field_id == "1391") { + message(paste("[DEBUG get_tile_ids] Field bbox - xmin:", field_xmin, "xmax:", field_xmax, + "ymin:", field_ymin, "ymax:", field_ymax)) + message(paste("[DEBUG get_tile_ids] tile_grid sample: id=", tile_grid$id[1], + "xmin=", tile_grid$xmin[1], "xmax=", tile_grid$xmax[1], + "ymin=", tile_grid$ymin[1], "ymax=", tile_grid$ymax[1])) + message(paste("[DEBUG get_tile_ids] tile_grid CRS:", 
sf::st_crs(tile_grid))) + message(paste("[DEBUG get_tile_ids] field CRS:", sf::st_crs(field_geom))) + } + intersecting_tiles <- tile_grid$id[ !(tile_grid$xmax < field_xmin | tile_grid$xmin > field_xmax | @@ -189,6 +202,21 @@ load_tiles_for_field <- function(field_geom, tile_ids, week_num, year, mosaic_di } build_tile_grid <- function(mosaic_dir, week_num, year) { + # Handle grid-size subdirectories (e.g., weekly_tile_max/5x5/) + # First check if mosaic_dir contains grid-size subdirectories + detected_grid_size <- NA + if (dir.exists(mosaic_dir)) { + subfolders <- list.dirs(mosaic_dir, full.names = FALSE, recursive = FALSE) + grid_patterns <- grep("^\\d+x\\d+$", subfolders, value = TRUE) + + if (length(grid_patterns) > 0) { + # Use the first grid-size subdirectory found + detected_grid_size <- grid_patterns[1] + mosaic_dir <- file.path(mosaic_dir, detected_grid_size) + message(paste(" Using grid-size subdirectory:", detected_grid_size)) + } + } + tile_pattern <- sprintf("week_%02d_%d_([0-9]{2})\\.tif", week_num, year) tile_files <- list.files(mosaic_dir, pattern = tile_pattern, full.names = TRUE) @@ -230,7 +258,12 @@ build_tile_grid <- function(mosaic_dir, week_num, year) { stop("Could not extract extents from any tile files") } - return(tile_grid) + # RETURN BOTH the grid AND the corrected mosaic directory path + return(list( + tile_grid = tile_grid, + mosaic_dir = mosaic_dir, + grid_size = detected_grid_size + )) } # ============================================================================ @@ -399,14 +432,20 @@ load_historical_field_data <- function(project_dir, current_week, reports_dir, n USE_UNIFORM_AGE <- TRUE UNIFORM_PLANTING_DATE <- as.Date("2025-01-01") -extract_planting_dates <- function(harvesting_data) { +extract_planting_dates <- function(harvesting_data, field_boundaries_sf = NULL) { if (USE_UNIFORM_AGE) { message(paste("Using uniform planting date for all fields:", UNIFORM_PLANTING_DATE)) - return(data.frame( - field_id = character(), - 
planting_date = as.Date(character()), - stringsAsFactors = FALSE - )) + # Return a data frame with all field IDs mapped to uniform planting date + if (!is.null(field_boundaries_sf)) { + return(data.frame( + field_id = field_boundaries_sf$field, + date = rep(UNIFORM_PLANTING_DATE, nrow(field_boundaries_sf)), + stringsAsFactors = FALSE + )) + } else { + # Fallback if field_boundaries_sf not provided + return(NULL) + } } if (is.null(harvesting_data) || nrow(harvesting_data) == 0) { @@ -449,6 +488,11 @@ analyze_single_field <- function(field_idx, field_boundaries_sf, tile_grid, week } field_name <- field_id + # DEBUG: Print for first few fields + if (field_idx <= 3) { + message(paste("[DEBUG] Field", field_idx, ":", field_id)) + } + field_sf <- field_boundaries_sf[field_idx, ] if (sf::st_is_empty(field_sf) || any(is.na(sf::st_geometry(field_sf)))) { return(data.frame( @@ -460,7 +504,14 @@ analyze_single_field <- function(field_idx, field_boundaries_sf, tile_grid, week field_area_ha <- as.numeric(sf::st_area(field_sf)) / 10000 field_area_acres <- field_area_ha / 0.404686 - tile_ids <- get_tile_ids_for_field(field_sf, tile_grid) + tile_ids <- get_tile_ids_for_field(field_sf, tile_grid, field_id = field_id) + + # DEBUG: Print tile IDs for first field + if (field_idx == 1) { + message(paste("[DEBUG] First field tile_ids:", paste(tile_ids, collapse=","))) + message(paste("[DEBUG] tile_grid nrows:", nrow(tile_grid), "ncols:", ncol(tile_grid))) + message(paste("[DEBUG] mosaic_dir:", mosaic_dir)) + } current_ci <- load_tiles_for_field(field_sf, tile_ids, week_num, year, mosaic_dir) @@ -471,10 +522,26 @@ analyze_single_field <- function(field_idx, field_boundaries_sf, tile_grid, week )) } - field_vect <- terra::vect(sf::as_Spatial(field_sf)) - terra::crs(field_vect) <- terra::crs(current_ci) + # Extract CI values: EXACTLY LIKE SCRIPT 20 + # Crop to field bounding box first, then extract with sf directly (not terra::vect conversion) + field_bbox <- sf::st_bbox(field_sf) + 
ci_cropped <- terra::crop(current_ci, terra::ext(field_bbox), snap = "out") + extracted_vals <- terra::extract(ci_cropped, field_sf, fun = "mean", na.rm = TRUE) - all_extracted <- terra::extract(current_ci, field_vect)[, 2] + # extracted_vals is a data.frame with ID column (field index) + mean value + mean_ci_current <- as.numeric(extracted_vals[1, 2]) + + if (is.na(mean_ci_current)) { + return(data.frame( + Field_id = field_id, + error = "No CI values extracted from tiles" + )) + } + + # For per-tile extraction, we only have mean from the aggregation function + # To get variance/CV, we need to extract all pixels without the fun parameter + # But for farm-level purposes, the mean CI is sufficient + all_extracted <- terra::extract(ci_cropped, field_sf)[, 2] current_ci_vals <- all_extracted[!is.na(all_extracted)] num_total <- length(all_extracted) @@ -509,7 +576,9 @@ analyze_single_field <- function(field_idx, field_boundaries_sf, tile_grid, week tryCatch({ previous_ci <- load_tiles_for_field(field_sf, tile_ids, week_num - 1, year, mosaic_dir) if (!is.null(previous_ci)) { - prev_extracted <- terra::extract(previous_ci, field_vect)[, 2] + prev_bbox <- sf::st_bbox(field_sf) + prev_ci_cropped <- terra::crop(previous_ci, terra::ext(prev_bbox), snap = "out") + prev_extracted <- terra::extract(prev_ci_cropped, field_sf)[, 2] previous_ci_vals <- prev_extracted[!is.na(prev_extracted)] if (length(previous_ci_vals) > 0) { mean_ci_previous <- mean(previous_ci_vals, na.rm = TRUE) @@ -763,6 +832,13 @@ generate_field_analysis_summary <- function(field_df) { export_field_analysis_excel <- function(field_df, summary_df, project_dir, current_week, reports_dir) { message("Exporting per-field analysis to Excel, CSV, and RDS...") + # Round all numeric columns to 2 decimals + field_df_rounded <- field_df %>% + mutate(across(where(is.numeric), ~ round(., 2))) + + summary_df_rounded <- summary_df %>% + mutate(across(where(is.numeric), ~ round(., 2))) + output_subdir <- 
file.path(reports_dir, "kpis", "field_analysis") if (!dir.exists(output_subdir)) { dir.create(output_subdir, recursive = TRUE) @@ -773,16 +849,16 @@ export_field_analysis_excel <- function(field_df, summary_df, project_dir, curre excel_path <- normalizePath(excel_path, winslash = "\\", mustWork = FALSE) sheets <- list( - "Field Data" = field_df, - "Summary" = summary_df + "Field Data" = field_df_rounded, + "Summary" = summary_df_rounded ) write_xlsx(sheets, excel_path) message(paste("✓ Field analysis Excel exported to:", excel_path)) kpi_data <- list( - field_analysis = field_df, - field_analysis_summary = summary_df, + field_analysis = field_df_rounded, + field_analysis_summary = summary_df_rounded, metadata = list( current_week = current_week, project = project_dir, @@ -798,84 +874,254 @@ export_field_analysis_excel <- function(field_df, summary_df, project_dir, curre csv_filename <- paste0(project_dir, "_field_analysis_week", sprintf("%02d", current_week), ".csv") csv_path <- file.path(output_subdir, csv_filename) - write_csv(field_df, csv_path) + write_csv(field_df_rounded, csv_path) message(paste("✓ Field analysis CSV exported to:", csv_path)) return(list(excel = excel_path, rds = rds_path, csv = csv_path)) } # ============================================================================ -# FARM-LEVEL KPI CALCULATION (FROM OLD 09_CALCULATE_KPIS.R) +# TILE-BASED KPI EXTRACTION FUNCTION # ============================================================================ +calculate_field_kpis_from_tiles <- function(tile_dir, week_num, year, field_boundaries_sf, tile_grid) { + # Loop through tiles, extract KPI statistics per field per tile + # Follows the same pattern as extract_ci_from_tiles in CI extraction + + message("Calculating field-level KPI statistics from tiles...") + + # Get all tile files for this week + tile_pattern <- sprintf("week_%02d_%d_([0-9]{2})\\.tif", week_num, year) + tile_files <- list.files(tile_dir, pattern = tile_pattern, full.names = TRUE) + + 
if (length(tile_files) == 0) { + message("No tiles found for week", week_num, year) + return(NULL) + } + + # Process tiles in parallel using furrr (same as CI extraction) + message(paste("Processing", length(tile_files), "tiles in parallel...")) + + field_kpi_list <- furrr::future_map( + tile_files, + ~ process_single_kpi_tile( + tile_file = ., + field_boundaries_sf = field_boundaries_sf, + tile_grid = tile_grid + ), + .progress = TRUE, + .options = furrr::furrr_options(seed = TRUE) + ) + + # Combine results from all tiles + field_kpi_stats <- dplyr::bind_rows(field_kpi_list) + + if (nrow(field_kpi_stats) == 0) { + message(" No KPI data extracted from tiles") + return(NULL) + } + + message(paste(" Extracted KPI stats for", length(unique(field_kpi_stats$field)), "unique fields")) + return(field_kpi_stats) +} + +# Helper function to process a single tile (like process_single_tile in CI extraction) +process_single_kpi_tile <- function(tile_file, field_boundaries_sf, tile_grid) { + tryCatch({ + tile_basename <- basename(tile_file) + # Load tile raster + tile_raster <- terra::rast(tile_file) + + # Get first band (CI band for weekly mosaics) + ci_band <- tile_raster[[1]] + + # EXACTLY LIKE SCRIPT 20: Crop to field bounding box first, then extract with sf directly + field_bbox <- sf::st_bbox(field_boundaries_sf) + ci_cropped <- terra::crop(ci_band, terra::ext(field_bbox), snap = "out") + + # Extract CI values for ALL fields at once using sf object directly (NOT terra::vect) + # terra::extract() works with sf objects and handles geometries properly + extracted_vals <- terra::extract(ci_cropped, field_boundaries_sf, fun = "mean", na.rm = TRUE) + + # Initialize results for this tile + tile_results <- data.frame() + + # Get tile ID from filename + tile_id_match <- as.numeric(sub(".*_(\\d{2})\\.tif$", "\\1", tile_basename)) + + # Process each field: extracted_vals is a data.frame with ID column (field indices) + extracted values + for (field_idx in 
seq_len(nrow(field_boundaries_sf))) { + field_id <- field_boundaries_sf$field[field_idx] + + # extracted_vals columns: 1=ID, 2=mean_CI (since we used fun="mean") + mean_ci <- extracted_vals[field_idx, 2] + + # Skip if no data for this field in this tile + if (is.na(mean_ci)) { + next + } + + # For tile-level stats, we only have mean from extraction (no variance without all pixels) + # Add to results + tile_results <- rbind(tile_results, data.frame( + field = field_id, + tile_id = tile_id_match, + tile_file = tile_basename, + mean_ci = round(mean_ci, 4), + stringsAsFactors = FALSE + )) + } + + return(tile_results) + + }, error = function(e) { + message(paste(" Warning: Error processing tile", basename(tile_file), ":", e$message)) + return(data.frame()) + }) +} + calculate_and_export_farm_kpis <- function(report_date, project_dir, field_boundaries_sf, harvesting_data, cumulative_CI_vals_dir, - weekly_CI_mosaic, reports_dir) { + weekly_CI_mosaic, reports_dir, current_week, year, + tile_grid, use_tile_mosaic = FALSE, tile_grid_size = "5x5") { message("\n=== CALCULATING FARM-LEVEL KPIs ===") - message("(6 high-level KPI metrics)") - - tryCatch({ - source(here("r_app", "kpi_utils.R")) - }, error = function(e) { - message(paste("Warning: Could not load kpi_utils.R:", e$message)) - message("Farm-level KPIs will be skipped") - return(NULL) - }) - - if (!exists("calculate_all_kpis")) { - message("Warning: calculate_all_kpis() function not found in kpi_utils.R") - return(NULL) - } + message("(6 high-level KPI metrics with tile-based extraction)") output_dir <- file.path(reports_dir, "kpis") if (!dir.exists(output_dir)) { dir.create(output_dir, recursive = TRUE) } - tryCatch({ - kpi_results <- calculate_all_kpis( - report_date = report_date, - output_dir = output_dir, - field_boundaries_sf = field_boundaries_sf, - harvesting_data = harvesting_data, - cumulative_CI_vals_dir = cumulative_CI_vals_dir, - weekly_CI_mosaic = weekly_CI_mosaic, - reports_dir = reports_dir, - 
project_dir = project_dir - ) - - # Print KPI summary - cat("\n=== FARM-LEVEL KPI SUMMARY ===\n") - cat("Report Date:", as.character(kpi_results$metadata$report_date), "\n") - cat("Current Week:", kpi_results$metadata$current_week, "\n") - cat("Previous Week:", kpi_results$metadata$previous_week, "\n") - cat("Total Fields Analyzed:", kpi_results$metadata$total_fields, "\n") - cat("Calculation Time:", as.character(kpi_results$metadata$calculation_time), "\n") - - cat("\n--- KPI Metrics ---\n") - cat("Field Uniformity Summary:\n") - print(kpi_results$field_uniformity_summary) - - cat("\nArea Change Summary:\n") - print(kpi_results$area_change) - - cat("\nTCH Forecasted:\n") - print(kpi_results$tch_forecasted) - - cat("\nGrowth Decline Index:\n") - print(kpi_results$growth_decline) - - cat("\nWeed Presence Score:\n") - print(kpi_results$weed_presence) - - cat("\nGap Filling Score:\n") - print(kpi_results$gap_filling) - - return(kpi_results) - }, error = function(e) { - message(paste("Error calculating farm-level KPIs:", e$message)) + # Get mosaic directory with grid size if using tiles + mosaic_dir <- if (use_tile_mosaic && !is.null(tile_grid_size)) { + file.path(weekly_CI_mosaic, tile_grid_size) + } else { + weekly_CI_mosaic + } + + # Extract field-level KPI statistics from tiles + field_kpi_stats <- calculate_field_kpis_from_tiles( + tile_dir = mosaic_dir, + week_num = current_week, + year = year, + field_boundaries_sf = field_boundaries_sf, + tile_grid = tile_grid + ) + + if (is.null(field_kpi_stats) || nrow(field_kpi_stats) == 0) { + message("Warning: No field KPI statistics extracted from tiles") return(NULL) - }) + } + + # Aggregate tile-based statistics by field (average across tiles for each field) + field_summary_stats <- field_kpi_stats %>% + dplyr::group_by(field) %>% + dplyr::summarise( + mean_ci = mean(mean_ci, na.rm = TRUE), + cv_ci = mean(cv_ci, na.rm = TRUE), + min_ci = min(min_ci, na.rm = TRUE), + max_ci = max(max_ci, na.rm = TRUE), + total_pixels = 
sum(n_pixels, na.rm = TRUE), + num_tiles = n_distinct(tile_id), + .groups = 'drop' + ) + + # Create results list + kpi_results <- list( + field_kpi_stats = field_kpi_stats, + field_summary_stats = field_summary_stats, + metadata = list( + report_date = report_date, + current_week = current_week, + year = year, + calculation_method = "tile_based_extraction", + num_fields_processed = length(unique(field_kpi_stats$field)), + num_tiles_processed = length(unique(field_kpi_stats$tile_id)) + ) + ) + + # Save results + rds_filename <- paste0(project_dir, "_farm_kpi_stats_week", sprintf("%02d", current_week), ".rds") + rds_path <- file.path(output_dir, rds_filename) + saveRDS(kpi_results, rds_path) + message(paste("✓ Farm-level KPI stats exported to:", rds_path)) + + # Print summary + cat("\n=== FARM-LEVEL KPI SUMMARY ===\n") + cat("Report Date:", as.character(report_date), "\n") + cat("Week:", current_week, "Year:", year, "\n") + cat("Fields Processed:", length(unique(field_kpi_stats$field)), "\n") + cat("Tiles Processed:", length(unique(field_kpi_stats$tile_id)), "\n") + cat("\n--- Field Summary Statistics (Mean across tiles) ---\n") + print(head(field_summary_stats, 20)) + + return(kpi_results) +} + +# ============================================================================ +# HELPER: Extract field-level statistics from CI raster (all pixels, single call) +# ============================================================================ + +extract_field_statistics_from_ci <- function(ci_band, field_boundaries_sf) { + #' Extract CI statistics for all fields from a single CI raster band + #' + #' This function extracts all pixel values for each field in one terra::extract call, + #' then calculates mean, CV, and percentiles from those pixels. 
+ #' + #' @param ci_band Single CI band from terra raster + #' @param field_boundaries_sf SF object with field geometries + #' @return Data frame with columns: field_idx, mean_ci, cv, p10, p90, pixel_count + + # Extract all pixels for all fields at once (more efficient than individual calls) + all_pixels <- terra::extract(ci_band, field_boundaries_sf) + + # Calculate statistics for each field + stats_list <- list() + + for (field_idx in seq_len(nrow(field_boundaries_sf))) { + # Extract pixel values for this field (skip ID column 1) + pixels <- all_pixels[field_idx, -1, drop = TRUE] + pixels <- as.numeric(pixels) + pixels <- pixels[!is.na(pixels)] + + # Only calculate stats if we have pixels + if (length(pixels) > 0) { + mean_val <- mean(pixels, na.rm = TRUE) + + # Only calculate CV if mean > 0 (avoid division by zero) + if (mean_val > 0) { + cv_val <- sd(pixels, na.rm = TRUE) / mean_val + } else { + cv_val <- NA + } + + p10_val <- quantile(pixels, probs = CI_PERCENTILE_LOW, na.rm = TRUE)[[1]] + p90_val <- quantile(pixels, probs = CI_PERCENTILE_HIGH, na.rm = TRUE)[[1]] + + stats_list[[field_idx]] <- data.frame( + field_idx = field_idx, + mean_ci = mean_val, + cv = cv_val, + p10 = p10_val, + p90 = p90_val, + pixel_count = length(pixels), + stringsAsFactors = FALSE + ) + } else { + # No pixels for this field (doesn't intersect tile) + stats_list[[field_idx]] <- data.frame( + field_idx = field_idx, + mean_ci = NA_real_, + cv = NA_real_, + p10 = NA_real_, + p90 = NA_real_, + pixel_count = 0, + stringsAsFactors = FALSE + ) + } + } + + return(dplyr::bind_rows(stats_list)) } # ============================================================================ @@ -923,7 +1169,7 @@ main <- function() { message("") # Load configuration and utilities - source(here("r_app", "crop_messaging_utils.R")) + # source(here("r_app", "crop_messaging_utils.R")) tryCatch({ source(here("r_app", "parameters_project.R")) @@ -950,10 +1196,29 @@ main <- function() { message(paste("Week:", 
current_week, "/ Year:", year)) - message("Building tile grid from available weekly tiles...") - tile_grid <- build_tile_grid(weekly_tile_max, current_week, year) - message(paste(" Found", nrow(tile_grid), "tiles")) + # Find tile files - approach from Script 20 + message("Finding tile files...") + tile_pattern <- sprintf("week_%02d_%d_([0-9]{2})\\.tif", current_week, year) + # Detect grid size subdirectory + detected_grid_size <- NA + if (dir.exists(weekly_tile_max)) { + subfolders <- list.dirs(weekly_tile_max, full.names = FALSE, recursive = FALSE) + grid_patterns <- grep("^\\d+x\\d+$", subfolders, value = TRUE) + if (length(grid_patterns) > 0) { + detected_grid_size <- grid_patterns[1] + mosaic_dir <- file.path(weekly_tile_max, detected_grid_size) + message(paste(" Using grid-size subdirectory:", detected_grid_size)) + } + } + + tile_files <- list.files(mosaic_dir, pattern = tile_pattern, full.names = TRUE) + if (length(tile_files) == 0) { + stop(paste("No tile files found for week", current_week, year, "in", mosaic_dir)) + } + message(paste(" Found", length(tile_files), "tiles")) + + # Load field boundaries tryCatch({ boundaries_result <- load_field_boundaries(data_dir) @@ -979,38 +1244,217 @@ main <- function() { } historical_data <- load_historical_field_data(project_dir, current_week, reports_dir, num_weeks = num_weeks_to_load) - planting_dates <- extract_planting_dates(harvesting_data) + planting_dates <- extract_planting_dates(harvesting_data, field_boundaries_sf) - message("Setting up parallel processing...") - current_plan <- class(future::plan())[1] - if (current_plan == "sequential") { - num_workers <- parallel::detectCores() - 1 - message(paste(" Using", num_workers, "workers")) - future::plan(future::multisession, workers = num_workers) + # Validate planting_dates + if (is.null(planting_dates) || nrow(planting_dates) == 0) { + message("WARNING: No planting dates available. 
Using NA for all fields.") + planting_dates <- data.frame( + field_id = field_boundaries_sf$field, + date = rep(as.Date(NA), nrow(field_boundaries_sf)), + stringsAsFactors = FALSE + ) } - message("Analyzing", nrow(field_boundaries_sf), "fields in parallel...") + # SCRIPT 20 APPROACH: Loop through tiles, extract all fields from each tile + message("\nProcessing tiles and extracting field statistics...") + all_tile_results <- list() - field_analysis_list <- furrr::future_map( - seq_len(nrow(field_boundaries_sf)), - ~ analyze_single_field( - field_idx = ., - field_boundaries_sf = field_boundaries_sf, - tile_grid = tile_grid, - week_num = current_week, - year = year, - mosaic_dir = weekly_tile_max, - historical_data = historical_data, - planting_dates = planting_dates, - report_date = end_date, - harvest_imminence_data = NULL, - harvesting_data = harvesting_data - ), - .progress = TRUE, - .options = furrr::furrr_options(seed = TRUE) - ) + for (i in seq_along(tile_files)) { + tile_file <- tile_files[i] + message(paste(" Processing tile", i, "of", length(tile_files), ":", basename(tile_file))) + + tryCatch({ + # Load current tile and previous week tile + current_rast <- terra::rast(tile_file) + + # DEBUG: Check tile structure on first tile + if (i == 1) { + message(paste(" [DEBUG] Tile CRS:", terra::crs(current_rast))) + message(paste(" [DEBUG] Tile extent:", paste(terra::ext(current_rast)))) + message(paste(" [DEBUG] Field boundaries CRS:", sf::st_crs(field_boundaries_sf))) + field_bbox <- sf::st_bbox(field_boundaries_sf) + message(paste(" [DEBUG] Field bbox:", paste(round(field_bbox, 2)))) + message(paste(" [DEBUG] Band names:", paste(names(current_rast), collapse=", "))) + } + + # Extract CI band by name + ci_band <- current_rast[["CI"]] + + # Check if CI band exists - use proper logical checks + if (is.null(ci_band) || !inherits(ci_band, "SpatRaster")) { + message(paste(" ERROR: CI band not found. 
Available bands:", paste(names(current_rast), collapse=", "))) + next + } + + # Check if CI band has any valid data + if (tryCatch(all(is.na(values(ci_band))), error = function(e) TRUE)) { + message(paste(" ERROR: CI band has no valid data")) + next + } + + # Load previous week tile if available + previous_tile_file <- sub(sprintf("week_%02d", current_week), + sprintf("week_%02d", previous_week), + tile_file) + previous_ci <- NULL + if (file.exists(previous_tile_file)) { + previous_rast <- terra::rast(previous_tile_file) + previous_ci <- previous_rast[["CI"]] + } + + # OPTION 1 + 2: Extract all CI statistics from one pixel extraction (single call) + current_stats <- extract_field_statistics_from_ci(ci_band, field_boundaries_sf) + + # DEBUG: Check extraction result on first tile + if (i == 1) { + num_with_data <- sum(!is.na(current_stats$mean_ci)) + message(paste(" [DEBUG] Extracted", nrow(current_stats), "fields, ", num_with_data, "with non-NA data")) + if (num_with_data > 0) { + message(paste(" [DEBUG] Sample mean CIs:", paste(head(current_stats$mean_ci[!is.na(current_stats$mean_ci)], 3), collapse=", "))) + } + } + + # Extract previous week CI statistics if available + previous_stats <- NULL + if (!is.null(previous_ci)) { + previous_stats <- extract_field_statistics_from_ci(previous_ci, field_boundaries_sf) + } + + # Process each field that was extracted + field_results_this_tile <- list() + fields_added <- 0 + + for (field_idx in seq_len(nrow(field_boundaries_sf))) { + tryCatch({ + field_id <- field_boundaries_sf$field[field_idx] + field_sf <- field_boundaries_sf[field_idx, ] + + # Get statistics from helper function results + # current_stats should have same number of rows as field_boundaries_sf + if (field_idx > nrow(current_stats)) { + message(paste(" [ERROR] field_idx", field_idx, "> nrow(current_stats)", nrow(current_stats))) + next + } + + mean_ci_current <- current_stats$mean_ci[field_idx] + pixel_count <- current_stats$pixel_count[field_idx] + + # SKIP 
fields with no data in this tile (they don't intersect this tile) + if (is.na(pixel_count) || pixel_count == 0) { + next + } + ci_cv_current <- current_stats$cv[field_idx] + ci_percentile_low <- current_stats$p10[field_idx] + ci_percentile_high <- current_stats$p90[field_idx] + + # If field doesn't intersect this tile, mean_ci_current will be NA + if (is.na(mean_ci_current)) { + next # Skip this field - doesn't intersect this tile + } + + field_area_ha <- as.numeric(sf::st_area(field_sf)) / 10000 + field_area_acres <- field_area_ha / 0.404686 + + # Extract previous week CI if available + mean_ci_previous <- NA + ci_change <- NA + if (!is.null(previous_stats)) { + mean_ci_previous <- previous_stats$mean_ci[field_idx] + if (!is.na(mean_ci_previous)) { + ci_change <- mean_ci_current - mean_ci_previous + } + } + + # Reconstruct pixel values for status trigger (we need the actual pixel array) + # Use the percentiles and mean to create a synthetic distribution for status_trigger + # For now, use mean CI repeated by pixel count for testing + # TODO: Consider extracting pixels directly if needed for more complex triggers + pixel_count <- current_stats$pixel_count[field_idx] + ci_vals_current <- if (pixel_count > 0) { + rep(mean_ci_current, pixel_count) # Simplified: use mean value repeated + } else { + numeric(0) + } + + # Calculate age + age_weeks <- if (!is.null(planting_dates) && nrow(planting_dates) > 0 && field_idx <= nrow(planting_dates)) { + planting_date <- planting_dates$date[field_idx] + if (!is.na(planting_date)) { + as.numeric(difftime(end_date, planting_date, units = "weeks")) + } else { + 0 + } + } else { + 0 + } + + # Get phase and status + phase <- get_phase_by_age(age_weeks) + status_trigger <- get_status_trigger(ci_vals_current, ci_change, age_weeks) + + # Cloud coverage categorization based on CI value + # No data = No image available + # CI 0.01 to 95 = Partial coverage + # CI >= 95 = Clear view + if (is.na(mean_ci_current) || mean_ci_current == 0) { + 
cloud_category <- "No image available" + # Set all CI metrics to NA since no valid data + ci_change <- NA + ci_cv_current <- NA + ci_percentile_low <- NA + ci_percentile_high <- NA + } else if (mean_ci_current >= 95) { + cloud_category <- "Clear view" + } else { + cloud_category <- "Partial coverage" + } + + # Build result row + result_row <- data.frame( + Field_id = field_id, + Acreage = field_area_acres, + Mean_CI = mean_ci_current, + Mean_CI_prev = mean_ci_previous, + CI_change = ci_change, + CI_CV = ci_cv_current, + CI_percentile_low = ci_percentile_low, + CI_percentile_high = ci_percentile_high, + Age_weeks = age_weeks, + Phase = phase, + Status_trigger = status_trigger, + Cloud_category = cloud_category, + stringsAsFactors = FALSE + ) + + field_results_this_tile[[as.character(field_id)]] <- result_row + fields_added <- fields_added + 1 + + }, error = function(e) { + # Show error for debugging + message(paste(" [FIELD ERROR] Field", field_idx, ":", e$message)) + }) + } + + if (length(field_results_this_tile) > 0) { + all_tile_results[[basename(tile_file)]] <- dplyr::bind_rows(field_results_this_tile) + message(paste(" Extracted", length(field_results_this_tile), "fields from tile (processed", fields_added, "fields total)")) + } else { + message(paste(" WARNING: No fields extracted from this tile (processed", fields_added, "fields, all either NA or errored)")) + } + + }, error = function(e) { + message(paste(" Error processing tile", basename(tile_file), ":", e$message)) + }) + } - field_analysis_df <- dplyr::bind_rows(field_analysis_list) + # Combine all tile results, keeping unique fields (may appear in multiple tiles) + if (length(all_tile_results) == 0) { + stop("No fields extracted from any tiles!") + } + + field_analysis_df <- dplyr::bind_rows(all_tile_results) %>% + distinct(Field_id, .keep_all = TRUE) if (nrow(field_analysis_df) == 0) { stop("No fields analyzed successfully!") @@ -1038,17 +1482,90 @@ main <- function() { cat("\n--- Summary Statistics 
---\n") print(summary_statistics_df) - # ========== FARM-LEVEL KPI CALCULATION ========== + # ========== FARM-LEVEL KPI AGGREGATION ========== + # Aggregate the per-field analysis into farm-level summary statistics - farm_kpi_results <- calculate_and_export_farm_kpis( - report_date = end_date, - project_dir = project_dir, - field_boundaries_sf = field_boundaries_sf, - harvesting_data = harvesting_data, - cumulative_CI_vals_dir = cumulative_CI_vals_dir, - weekly_CI_mosaic = weekly_CI_mosaic, - reports_dir = reports_dir - ) + cat("\n=== CALCULATING FARM-LEVEL KPI SUMMARY ===\n") + + # Filter to only fields that have actual data (non-NA CI and valid acreage) + field_data <- field_analysis_df %>% + filter(!is.na(Mean_CI) & !is.na(Acreage)) %>% + filter(Acreage > 0) + + if (nrow(field_data) > 0) { + + if (nrow(field_data) > 0) { + # Create summary statistics + farm_summary <- list() + + # 1. PHASE DISTRIBUTION + phase_dist <- field_data %>% + group_by(Phase) %>% + summarise( + num_fields = n(), + acreage = sum(Acreage, na.rm = TRUE), + .groups = 'drop' + ) %>% + rename(Category = Phase) + + farm_summary$phase_distribution <- phase_dist + + # 2. STATUS TRIGGER DISTRIBUTION + status_dist <- field_data %>% + group_by(Status_trigger) %>% + summarise( + num_fields = n(), + acreage = sum(Acreage, na.rm = TRUE), + .groups = 'drop' + ) %>% + rename(Category = Status_trigger) + + farm_summary$status_distribution <- status_dist + + # 3. CLOUD COVERAGE DISTRIBUTION + cloud_dist <- field_data %>% + group_by(Cloud_category) %>% + summarise( + num_fields = n(), + acreage = sum(Acreage, na.rm = TRUE), + .groups = 'drop' + ) %>% + rename(Category = Cloud_category) + + farm_summary$cloud_distribution <- cloud_dist + + # 4. 
OVERALL STATISTICS + farm_summary$overall_stats <- data.frame( + total_fields = nrow(field_data), + total_acreage = sum(field_data$Acreage, na.rm = TRUE), + mean_ci = round(mean(field_data$Mean_CI, na.rm = TRUE), 2), + median_ci = round(median(field_data$Mean_CI, na.rm = TRUE), 2), + mean_cv = round(mean(field_data$CI_CV, na.rm = TRUE), 4), + week = current_week, + year = year, + date = as.character(end_date) + ) + + # Print summaries + cat("\n--- PHASE DISTRIBUTION ---\n") + print(phase_dist) + + cat("\n--- STATUS TRIGGER DISTRIBUTION ---\n") + print(status_dist) + + cat("\n--- CLOUD COVERAGE DISTRIBUTION ---\n") + print(cloud_dist) + + cat("\n--- OVERALL FARM STATISTICS ---\n") + print(farm_summary$overall_stats) + + farm_kpi_results <- farm_summary + } else { + farm_kpi_results <- NULL + } + } else { + farm_kpi_results <- NULL + } # ========== FINAL SUMMARY ========== @@ -1063,7 +1580,7 @@ main <- function() { if (!is.null(farm_kpi_results)) { cat("\nFarm-level KPIs: CALCULATED\n") } else { - cat("\nFarm-level KPIs: SKIPPED (kpi_utils.R not available)\n") + cat("\nFarm-level KPIs: SKIPPED (no valid tile data extracted)\n") } cat("\n✓ Consolidated KPI calculation complete!\n") diff --git a/r_app/parameters_project.R b/r_app/parameters_project.R index a00a5f7..5890a94 100644 --- a/r_app/parameters_project.R +++ b/r_app/parameters_project.R @@ -50,7 +50,7 @@ detect_mosaic_mode <- function(merged_final_tif_dir, daily_tiles_split_dir = NUL } # PRIORITY 2: File-based detection (fallback if metadata not found) - # Check if merged_final_tif/ contains tile-named files + # Check if merged_final_tif/ contains tile-named files OR grid-size subdirectories if (!dir.exists(merged_final_tif_dir)) { return(list( @@ -61,6 +61,30 @@ detect_mosaic_mode <- function(merged_final_tif_dir, daily_tiles_split_dir = NUL )) } + # First check if there are grid-size subdirectories (5x5, 10x10, etc.) 
+ # This indicates the tiles are organized: merged_final_tif/{grid_size}/{DATE}/{DATE}_XX.tif + grid_subfolders <- list.dirs(merged_final_tif_dir, full.names = FALSE, recursive = FALSE) + grid_patterns <- grep("^\\d+x\\d+$", grid_subfolders, value = TRUE) + + if (length(grid_patterns) > 0) { + # Found grid-size subdirectories - tiles exist! + grid_size <- grid_patterns[1] + grid_dir <- file.path(merged_final_tif_dir, grid_size) + + # List sample tile files from the grid directory + sample_tiles <- list.files(grid_dir, pattern = "\\.tif$", recursive = TRUE)[1:3] + + return(list( + has_tiles = TRUE, + detected_tiles = sample_tiles, + total_files = length(sample_tiles), + source = "grid_subdirectory_detection", + grid_size = grid_size, + grid_path = grid_dir + )) + } + + # Fall back to checking for tile-named files directly in merged_final_tif # List all .tif files in merged_final_tif tif_files <- list.files(merged_final_tif_dir, pattern = "\\.tif$", full.names = FALSE) From 4e94a9a78bba1f136a07f2a99b34c9fcabbdcd60 Mon Sep 17 00:00:00 2001 From: Timon Date: Thu, 15 Jan 2026 15:35:16 +0100 Subject: [PATCH 05/15] expanding csv table --- r_app/80_calculate_kpis.R | 289 +++++++++++++++++++++++++------------- 1 file changed, 193 insertions(+), 96 deletions(-) diff --git a/r_app/80_calculate_kpis.R b/r_app/80_calculate_kpis.R index 59a9195..816ad83 100644 --- a/r_app/80_calculate_kpis.R +++ b/r_app/80_calculate_kpis.R @@ -10,7 +10,7 @@ # - Per-field analysis with SC-64 enhancements (4-week trends, CI percentiles, etc.) 
# - Farm-level KPI calculation (6 metrics for executive overview) # - Parallel processing (tile-aware, 1000+ fields supported) -# - Comprehensive Excel + RDS + CSV exports +# - Comprehensive Excel + RDS + CSV exports (21 columns per spec) # - Test mode for development # # COMMAND-LINE USAGE: @@ -26,6 +26,71 @@ # source("r_app/80_calculate_kpis.R") # main() +# ============================================================================ +# EXCEL OUTPUT SPECIFICATION (21 COLUMNS) +# ============================================================================ +# This script exports 21 columns per the field analysis specification: +# +# COMPLETED/IN PROGRESS: +# 1. Field_id ✓ - Unique field identifier +# 2. Farm_Section - Management zone (to be filled by user) +# 3. Field_name ✓ - Client-facing field name (from GeoJSON) +# 4. Acreage ✓ - Field size in acres +# 5. Mean_CI ✓ - Average Chlorophyll Index +# 6. Weekly_ci_change ✓ - Week-over-week CI change +# 7. Four_week_trend - [FUTURE] Trend over 4 weeks (requires historical mosaics) +# 8. Last_harvest_or_planting_date - [DUMMY for now] Will be from harvest Excel + LSTM (script 31) +# 9. Age_week ✓ - Weeks since planting +# 10. Phase (age based) ✓ - Growth phase (Germination, Tillering, Grand Growth, Maturation) +# 11. nmr_weeks_in_this_phase - [TODO] Weeks spent in current phase (track phase transitions) +# 12. Germination_progress - [TODO] % pixels with CI >= threshold (default 2, for age < 4 months) +# 13. Imminent_prob - [DUMMY for now] Harvest probability (will be from script 31 output) +# 14. Status_trigger ✓ - Alerts (harvest_ready, stress, etc.) +# 15. CI_range (min-max) - [TODO] Min and max CI values in field +# 16. CI_Percentiles ✓ - 10th-90th percentile of CI (p10-p90 format) +# 17. CV ✓ - Coefficient of variation (field uniformity) +# 18. CV_Trend_Short_Term - [TODO] 2-week CV trend (current week CV - last week CV) +# 19. 
CV_Trend_Long_Term - [FUTURE] 8-week CV slope (requires linear regression, historical mosaics) +# 20. Cloud_pct_clear ✓ - % field visible (pixel coverage) +# 21. Cloud_category ✓ - Cloud classification (Clear view / Partial coverage / No image available) +# +# IMPLEMENTATION PLAN (ordered by difficulty): +# ============================================================================ +# PHASE 1 - EASY (Current data only): +# [✓] Remove Mean_CI_prev column +# [✓] Add Field_name column (from field_boundaries_sf$field) +# [✓] Add Farm_Section column (empty, user will fill) +# [✓] Add Last_harvest_or_planting_date (use UNIFORM_PLANTING_DATE as dummy) +# [✓] Add CI_range (min/max from pixel extraction) +# [✓] Add Cloud_pct_clear (% from pixel coverage) +# [✓] Column order: Reorder to match spec (1-21) +# +# PHASE 1 LIMITATION (Known Issue - To Fix in PHASE 2): +# - Fields spanning multiple tiles currently show statistics from first intersecting tile only +# - This results in p10 ≈ p90 (few pixels per tile) instead of field-wide percentiles +# - FIX: After extracting all tiles, group by field_id and aggregate pixel values across all tiles +# before calculating percentiles. This will give true field-wide statistics. 
+# +# PHASE 2 - MEDIUM (Requires computation): +# [ ] Add nmr_weeks_in_this_phase (track phase transitions with previous week CSV) +# [ ] Add Germination_progress (% pixels CI >= GERMINATION_CI_THRESHOLD, configurable) +# [ ] Add Imminent_prob column (dummy NA, will merge from script 31 harvest_imminent_weekly.csv) +# [ ] Add CV_Trend_Short_Term (requires loading last week's CV values) +# +# PHASE 3 - COMPLEX (Requires historical data): +# [ ] Add Four_week_trend (CI value difference week vs 4 weeks ago, requires loading prev mosaics) +# [ ] Add CV_Trend_Long_Term (8-week slope: linear regression on 8 weeks of CV, e.g. via lm()) +# [ ] Load previous week's CSV to cross-check phase transitions and trends +# +# NOTES: +# - Script 31 (harvest_imminent_weekly.py) outputs: field, imminent_prob, detected_prob, week, year +# - Will need to LEFT JOIN on (field, week, year) to populate Imminent_prob +# - Phase transition logic: Compare this week's Phase vs last week's Phase from CSV +# - For 8-week CV slope: quick endpoint approximation = (CV_week8 - CV_week1) / 7 weeks, +# or use lm(CV ~ week) on 8-week sequence for proper slope calculation +# - Germination_progress only calculated if Age_week < 17 (before end of Tillering phase) +# - Cloud_pct_clear calculated as: (valid pixel count / total extracted pixel count) * 100 + # ============================================================================ # *** CONFIGURATION SECTION - MANUALLY DEFINED THRESHOLDS *** # ============================================================================ @@ -34,6 +99,10 @@ TEST_MODE <- TRUE TEST_MODE_NUM_WEEKS <- 2 +# GERMINATION PROGRESS THRESHOLD +# CI value a pixel must reach to count as "germinated" (Germination_progress = % of such pixels) +GERMINATION_CI_THRESHOLD <- 2.0 # Pixels with CI >= 2 count as germinated + # FOUR-WEEK TREND THRESHOLDS FOUR_WEEK_TREND_STRONG_GROWTH_MIN <- 0.5 FOUR_WEEK_TREND_GROWTH_MIN <- 0.1 @@ -522,54 +591,41 @@ analyze_single_field <- function(field_idx, field_boundaries_sf,
tile_grid, week )) } - # Extract CI values: EXACTLY LIKE SCRIPT 20 - # Crop to field bounding box first, then extract with sf directly (not terra::vect conversion) + # SINGLE EXTRACTION: Get all pixel values for this field, then calculate all stats from it field_bbox <- sf::st_bbox(field_sf) ci_cropped <- terra::crop(current_ci, terra::ext(field_bbox), snap = "out") - extracted_vals <- terra::extract(ci_cropped, field_sf, fun = "mean", na.rm = TRUE) - # extracted_vals is a data.frame with ID column (field index) + mean value - mean_ci_current <- as.numeric(extracted_vals[1, 2]) + # Extract all pixels in one call (no fun= parameter means we get raw pixel values) + all_extracted <- terra::extract(ci_cropped, field_sf)[, 2] + current_ci_vals <- all_extracted[!is.na(all_extracted)] - if (is.na(mean_ci_current)) { + if (length(current_ci_vals) == 0) { return(data.frame( Field_id = field_id, error = "No CI values extracted from tiles" )) } - # For per-tile extraction, we only have mean from the aggregation function - # To get variance/CV, we need to extract all pixels without the fun parameter - # But for farm-level purposes, the mean CI is sufficient - all_extracted <- terra::extract(ci_cropped, field_sf)[, 2] - current_ci_vals <- all_extracted[!is.na(all_extracted)] + # Calculate all statistics from the single extraction + mean_ci_current <- mean(current_ci_vals, na.rm = TRUE) + ci_std <- sd(current_ci_vals, na.rm = TRUE) + cv_current <- if (mean_ci_current > 0) ci_std / mean_ci_current else NA_real_ + range_min <- min(current_ci_vals, na.rm = TRUE) + range_max <- max(current_ci_vals, na.rm = TRUE) + range_str <- sprintf("%.1f-%.1f", range_min, range_max) + ci_percentiles_str <- get_ci_percentiles(current_ci_vals) + # Cloud coverage from extraction metadata num_total <- length(all_extracted) - num_data <- sum(!is.na(all_extracted)) + num_data <- length(current_ci_vals) pct_clear <- if (num_total > 0) round((num_data / num_total) * 100, 1) else 0 - cloud_cat <- if 
(num_data == 0) "No image available" else if (pct_clear >= 99.5) "Clear view" else "Partial coverage" cloud_pct <- 100 - pct_clear cloud_interval <- round_cloud_to_intervals(pct_clear) - if (length(current_ci_vals) == 0) { - return(data.frame( - Field_id = field_id, - error = "No CI values extracted" - )) - } - - mean_ci_current <- mean(current_ci_vals, na.rm = TRUE) - ci_std <- sd(current_ci_vals, na.rm = TRUE) - cv_current <- ci_std / mean_ci_current - range_min <- min(current_ci_vals, na.rm = TRUE) - range_max <- max(current_ci_vals, na.rm = TRUE) - range_str <- sprintf("%.1f-%.1f", range_min, range_max) - - ci_percentiles_str <- get_ci_percentiles(current_ci_vals) - + # Weekly change (extract previous week same way - single extraction) weekly_ci_change <- NA previous_ci_vals <- NULL @@ -578,8 +634,8 @@ analyze_single_field <- function(field_idx, field_boundaries_sf, tile_grid, week if (!is.null(previous_ci)) { prev_bbox <- sf::st_bbox(field_sf) prev_ci_cropped <- terra::crop(previous_ci, terra::ext(prev_bbox), snap = "out") - prev_extracted <- terra::extract(prev_ci_cropped, field_sf)[, 2] - previous_ci_vals <- prev_extracted[!is.na(prev_extracted)] + prev_extracted_all <- terra::extract(prev_ci_cropped, field_sf)[, 2] + previous_ci_vals <- prev_extracted_all[!is.na(prev_extracted_all)] if (length(previous_ci_vals) > 0) { mean_ci_previous <- mean(previous_ci_vals, na.rm = TRUE) weekly_ci_change <- mean_ci_current - mean_ci_previous @@ -743,11 +799,11 @@ generate_field_analysis_summary <- function(field_df) { total_acreage <- sum(field_df$Acreage, na.rm = TRUE) - germination_acreage <- sum(field_df$Acreage[field_df$`Phase (age based)` == "Germination"], na.rm = TRUE) - tillering_acreage <- sum(field_df$Acreage[field_df$`Phase (age based)` == "Tillering"], na.rm = TRUE) - grand_growth_acreage <- sum(field_df$Acreage[field_df$`Phase (age based)` == "Grand Growth"], na.rm = TRUE) - maturation_acreage <- sum(field_df$Acreage[field_df$`Phase (age based)` == 
"Maturation"], na.rm = TRUE) - unknown_phase_acreage <- sum(field_df$Acreage[field_df$`Phase (age based)` == "Unknown"], na.rm = TRUE) + germination_acreage <- sum(field_df$Acreage[field_df$Phase == "Germination"], na.rm = TRUE) + tillering_acreage <- sum(field_df$Acreage[field_df$Phase == "Tillering"], na.rm = TRUE) + grand_growth_acreage <- sum(field_df$Acreage[field_df$Phase == "Grand Growth"], na.rm = TRUE) + maturation_acreage <- sum(field_df$Acreage[field_df$Phase == "Maturation"], na.rm = TRUE) + unknown_phase_acreage <- sum(field_df$Acreage[field_df$Phase == "Unknown"], na.rm = TRUE) harvest_ready_acreage <- sum(field_df$Acreage[field_df$Status_trigger == "harvest_ready"], na.rm = TRUE) stress_acreage <- sum(field_df$Acreage[field_df$Status_trigger == "stress_detected_whole_field"], na.rm = TRUE) @@ -1070,55 +1126,58 @@ extract_field_statistics_from_ci <- function(ci_band, field_boundaries_sf) { #' #' @param ci_band Single CI band from terra raster #' @param field_boundaries_sf SF object with field geometries - #' @return Data frame with columns: field_idx, mean_ci, cv, p10, p90, pixel_count + #' @return Data frame with columns: field_idx, mean_ci, cv, p10, p90, min_ci, max_ci, pixel_count_valid, pixel_count_total - # Extract all pixels for all fields at once (more efficient than individual calls) - all_pixels <- terra::extract(ci_band, field_boundaries_sf) + # SINGLE EXTRACTION: Get all pixels for all fields at once (no aggregation function) + # Result: data.frame with ID column (field indices) and value column (pixel values) + extract_result <- terra::extract(ci_band, field_boundaries_sf) # Calculate statistics for each field stats_list <- list() for (field_idx in seq_len(nrow(field_boundaries_sf))) { - # Extract pixel values for this field (skip ID column 1) - pixels <- all_pixels[field_idx, -1, drop = TRUE] - pixels <- as.numeric(pixels) - pixels <- pixels[!is.na(pixels)] + # Get all pixels for this field from the single extraction + # extract_result 
has columns [ID, value] where ID is field index (1-based) + field_pixels <- extract_result[extract_result$ID == field_idx, 2] + pixels <- as.numeric(field_pixels[!is.na(field_pixels)]) # Remove NAs - # Only calculate stats if we have pixels - if (length(pixels) > 0) { - mean_val <- mean(pixels, na.rm = TRUE) - - # Only calculate CV if mean > 0 (avoid division by zero) - if (mean_val > 0) { - cv_val <- sd(pixels, na.rm = TRUE) / mean_val - } else { - cv_val <- NA - } - - p10_val <- quantile(pixels, probs = CI_PERCENTILE_LOW, na.rm = TRUE)[[1]] - p90_val <- quantile(pixels, probs = CI_PERCENTILE_HIGH, na.rm = TRUE)[[1]] - - stats_list[[field_idx]] <- data.frame( - field_idx = field_idx, - mean_ci = mean_val, - cv = cv_val, - p10 = p10_val, - p90 = p90_val, - pixel_count = length(pixels), - stringsAsFactors = FALSE - ) - } else { - # No pixels for this field (doesn't intersect tile) + if (length(pixels) == 0) { + # No data for this field stats_list[[field_idx]] <- data.frame( field_idx = field_idx, mean_ci = NA_real_, cv = NA_real_, p10 = NA_real_, p90 = NA_real_, - pixel_count = 0, + min_ci = NA_real_, + max_ci = NA_real_, + pixel_count_valid = 0, + pixel_count_total = 0, stringsAsFactors = FALSE ) + next } + + # Calculate all statistics from pixels array + mean_val <- mean(pixels, na.rm = TRUE) + cv_val <- if (mean_val > 0) sd(pixels, na.rm = TRUE) / mean_val else NA_real_ + p10_val <- quantile(pixels, probs = CI_PERCENTILE_LOW, na.rm = TRUE)[[1]] + p90_val <- quantile(pixels, probs = CI_PERCENTILE_HIGH, na.rm = TRUE)[[1]] + min_val <- min(pixels, na.rm = TRUE) + max_val <- max(pixels, na.rm = TRUE) + + stats_list[[field_idx]] <- data.frame( + field_idx = field_idx, + mean_ci = mean_val, + cv = cv_val, + p10 = p10_val, + p90 = p90_val, + min_ci = min_val, + max_ci = max_val, + pixel_count_valid = length(pixels), + pixel_count_total = nrow(extract_result[extract_result$ID == field_idx, ]), + stringsAsFactors = FALSE + ) } return(dplyr::bind_rows(stats_list)) @@ 
-1312,6 +1371,13 @@ main <- function() { message(paste(" [DEBUG] Extracted", nrow(current_stats), "fields, ", num_with_data, "with non-NA data")) if (num_with_data > 0) { message(paste(" [DEBUG] Sample mean CIs:", paste(head(current_stats$mean_ci[!is.na(current_stats$mean_ci)], 3), collapse=", "))) + # Check percentiles + sample_field <- which(!is.na(current_stats$mean_ci))[2] + message(paste(" [DEBUG] Field", sample_field, "- p10:", current_stats$p10[sample_field], + "p90:", current_stats$p90[sample_field], + "min:", current_stats$min_ci[sample_field], + "max:", current_stats$max_ci[sample_field], + "valid_pixels:", current_stats$pixel_count_valid[sample_field])) } } @@ -1338,10 +1404,9 @@ main <- function() { } mean_ci_current <- current_stats$mean_ci[field_idx] - pixel_count <- current_stats$pixel_count[field_idx] # SKIP fields with no data in this tile (they don't intersect this tile) - if (is.na(pixel_count) || pixel_count == 0) { + if (is.na(current_stats$pixel_count_valid[field_idx]) || current_stats$pixel_count_valid[field_idx] == 0) { next } ci_cv_current <- current_stats$cv[field_idx] @@ -1370,9 +1435,9 @@ main <- function() { # Use the percentiles and mean to create a synthetic distribution for status_trigger # For now, use mean CI repeated by pixel count for testing # TODO: Consider extracting pixels directly if needed for more complex triggers - pixel_count <- current_stats$pixel_count[field_idx] - ci_vals_current <- if (pixel_count > 0) { - rep(mean_ci_current, pixel_count) # Simplified: use mean value repeated + pixel_count_valid <- current_stats$pixel_count_valid[field_idx] + ci_vals_current <- if (!is.na(pixel_count_valid) && pixel_count_valid > 0) { + rep(mean_ci_current, pixel_count_valid) # Simplified: use mean value repeated } else { numeric(0) } @@ -1393,36 +1458,68 @@ main <- function() { phase <- get_phase_by_age(age_weeks) status_trigger <- get_status_trigger(ci_vals_current, ci_change, age_weeks) - # Cloud coverage categorization based on 
CI value - # No data = No image available - # CI 0.01 to 95 = Partial coverage - # CI >= 95 = Clear view - if (is.na(mean_ci_current) || mean_ci_current == 0) { - cloud_category <- "No image available" - # Set all CI metrics to NA since no valid data - ci_change <- NA - ci_cv_current <- NA - ci_percentile_low <- NA - ci_percentile_high <- NA - } else if (mean_ci_current >= 95) { - cloud_category <- "Clear view" - } else { - cloud_category <- "Partial coverage" + # Calculate Cloud_pct_clear: percentage of field with valid data + # Binned to 10% intervals (0, 10, 20, ..., 90, 100) + cloud_pct_clear <- { + valid_count <- current_stats$pixel_count_valid[field_idx] + total_count <- current_stats$pixel_count_total[field_idx] + if (!is.na(valid_count) && !is.na(total_count) && total_count > 0) { + pct <- (valid_count / total_count) * 100 + round(pct / 10) * 10 + } else { + NA_real_ + } } - # Build result row + # Cloud categorization based on pixel coverage (Cloud_pct_clear) + cloud_category <- if (is.na(cloud_pct_clear)) { + "No image available" + } else if (cloud_pct_clear >= 90) { + "Clear view" + } else if (cloud_pct_clear > 0) { + "Partial coverage" + } else { + "No image available" + } + + # Get min/max CI values + ci_min <- current_stats$min_ci[field_idx] + ci_max <- current_stats$max_ci[field_idx] + ci_range <- if (!is.na(ci_min) && !is.na(ci_max)) { + sprintf("%.1f-%.1f", ci_min, ci_max) + } else { + NA_character_ + } + + # Get field_name from field_boundaries_sf + field_name <- field_sf$field + + # Build result row (21 columns per specification, in order) result_row <- data.frame( Field_id = field_id, + Farm_Section = NA_character_, + Field_name = field_name, Acreage = field_area_acres, Mean_CI = mean_ci_current, - Mean_CI_prev = mean_ci_previous, - CI_change = ci_change, - CI_CV = ci_cv_current, - CI_percentile_low = ci_percentile_low, - CI_percentile_high = ci_percentile_high, + Weekly_ci_change = ci_change, + Four_week_trend = NA_character_, + 
Last_harvest_or_planting_date = UNIFORM_PLANTING_DATE, Age_weeks = age_weeks, Phase = phase, + nmr_weeks_in_this_phase = NA_real_, + Germination_progress = NA_real_, + Imminent_prob = NA_real_, Status_trigger = status_trigger, + CI_range = ci_range, + CI_Percentiles = if (!is.na(ci_percentile_low) && !is.na(ci_percentile_high)) { + sprintf("%.1f-%.1f", ci_percentile_low, ci_percentile_high) + } else { + NA_character_ + }, + CV = ci_cv_current, + CV_Trend_Short_Term = NA_real_, + CV_Trend_Long_Term = NA_real_, + Cloud_pct_clear = cloud_pct_clear, Cloud_category = cloud_category, stringsAsFactors = FALSE ) From 6e88acef25b6e6f01388f39f4a5969426675a576 Mon Sep 17 00:00:00 2001 From: Timon Date: Fri, 16 Jan 2026 08:04:45 +0100 Subject: [PATCH 06/15] added some more stats and functions calculate and export csv and rds file for future ref --- r_app/80_calculate_kpis.R | 588 ++++++++++++++++++++++---------------- 1 file changed, 345 insertions(+), 243 deletions(-) diff --git a/r_app/80_calculate_kpis.R b/r_app/80_calculate_kpis.R index 816ad83..3786f0e 100644 --- a/r_app/80_calculate_kpis.R +++ b/r_app/80_calculate_kpis.R @@ -65,11 +65,6 @@ # [✓] Add Cloud_pct_clear (% from pixel coverage) # [✓] Column order: Reorder to match spec (1-21) # -# PHASE 1 LIMITATION (Known Issue - To Fix in PHASE 2): -# - Fields spanning multiple tiles currently show statistics from first intersecting tile only -# - This results in p10 ≈ p90 (few pixels per tile) instead of field-wide percentiles -# - FIX: After extracting all tiles, group by field_id and aggregate pixel values across all tiles -# before calculating percentiles. This will give true field-wide statistics. 
# # PHASE 2 - MEDIUM (Requires computation): # [ ] Add nmr_weeks_in_this_phase (track phase transitions with previous week CSV) @@ -539,6 +534,245 @@ extract_planting_dates <- function(harvesting_data, field_boundaries_sf = NULL) }) }
+# ============================================================================
+# MODULAR STATISTICS CALCULATION (Reusable for any week)
+# ============================================================================
+
+# Calculate per-field CI statistics for one week from that week's mosaic tiles.
+#
+# Arguments:
+#   field_boundaries_sf  sf object of field polygons; the `field` column is
+#                        used as Field_id.
+#   week_num, year       Week/year used to build the tile file-name pattern.
+#   mosaic_dir           Directory that is searched for the tile files.
+#   report_date          Date used to derive field age when USE_UNIFORM_AGE
+#                        is set (defaults to today).
+#
+# Returns a data.frame with one row per field: Field_id, Mean_CI, CV,
+# CI_range, CI_Percentiles, Cloud_pct_clear, Cloud_category, Age_week, Phase,
+# Germination_progress.
+#
+# Stops with an error when no tile files match or no field yields data.
+#
+# KNOWN LIMITATION: a field that intersects several tiles is reported from
+# the first tile that yields valid pixels only; pixels from later tiles are
+# ignored, so its statistics are per-tile rather than field-wide.
+calculate_field_statistics <- function(field_boundaries_sf, week_num, year,
+                                       mosaic_dir, report_date = Sys.Date()) {
+
+  message(paste("Calculating statistics for all fields - Week", week_num, year))
+
+  # Build tile file list
+  tile_pattern <- sprintf("week_%02d_%d_([0-9]{2})\\.tif", week_num, year)
+  tile_files <- list.files(mosaic_dir, pattern = tile_pattern, full.names = TRUE)
+
+  if (length(tile_files) == 0) {
+    stop(paste("No tile files found for week", week_num, year, "in", mosaic_dir))
+  }
+
+  message(paste(" Found", length(tile_files), "tiles for week", week_num))
+
+  results_list <- list()
+  # Field ids already stored; replaces a full rescan of results_list per field
+  seen_field_ids <- character(0)
+
+  # SCRIPT 20 APPROACH: Loop through tiles, extract all fields from each tile
+  for (tile_idx in seq_along(tile_files)) {
+    tile_file <- tile_files[tile_idx]
+
+    tryCatch({
+      # Load tile
+      current_rast <- terra::rast(tile_file)
+
+      # NOTE: do not use return() here. The tryCatch expression is evaluated
+      # in this function's frame, so return(NULL) would abandon every tile
+      # (and all results collected so far), not just skip this one.
+      if (!("CI" %in% names(current_rast))) {
+        message(paste(" [SKIP] Tile", basename(tile_file), "- CI band not found"))
+      } else {
+        ci_band <- current_rast[["CI"]]
+
+        # Extract all fields from this tile in ONE call
+        # terra::extract returns a dataframe with columns: ID, CI
+        # where each row is one pixel, and ID indicates which polygon it came from
+        extracted <- terra::extract(ci_band, field_boundaries_sf, na.rm = FALSE)
+
+        # Group by field ID and calculate statistics for each field
+        # extracted$ID contains the field polygon index (1 to nrow(field_boundaries_sf))
+        unique_field_ids <- unique(extracted$ID[!is.na(extracted$ID)])
+
+        for (field_poly_idx in unique_field_ids) {
+          field_id <- field_boundaries_sf$field[field_poly_idx]
+          field_key <- as.character(field_id)
+
+          # Field already stored from an earlier tile - keep the first occurrence
+          if (!is.na(field_key) && field_key %in% seen_field_ids) {
+            next
+          }
+
+          # All pixels (including NA) of this field in this tile
+          field_rows <- extracted[extracted$ID == field_poly_idx, ]
+          ci_vals <- field_rows$CI[!is.na(field_rows$CI)]
+
+          # Skip if no data for this field in this tile
+          if (length(ci_vals) == 0) {
+            next
+          }
+
+          # Calculate statistics
+          mean_ci <- mean(ci_vals, na.rm = TRUE)
+          ci_std <- sd(ci_vals, na.rm = TRUE)
+          cv <- if (mean_ci > 0) ci_std / mean_ci else NA_real_
+          range_min <- min(ci_vals, na.rm = TRUE)
+          range_max <- max(ci_vals, na.rm = TRUE)
+          range_str <- sprintf("%.1f-%.1f", range_min, range_max)
+          ci_percentiles_str <- get_ci_percentiles(ci_vals)
+
+          # Cloud coverage: count total pixels vs non-NA pixels for this field
+          num_total <- nrow(field_rows)
+          num_data <- length(ci_vals)
+          pct_clear <- if (num_total > 0) round((num_data / num_total) * 100, 1) else 0
+          cloud_cat <- if (num_data == 0) {
+            "No image available"
+          } else if (pct_clear >= 99.5) {
+            "Clear view"
+          } else {
+            "Partial coverage"
+          }
+
+          # Age and Phase
+          age_weeks <- if (USE_UNIFORM_AGE) {
+            as.numeric(difftime(report_date, UNIFORM_PLANTING_DATE, units = "weeks"))
+          } else {
+            NA_real_
+          }
+          phase <- get_phase_by_age(age_weeks)
+
+          # Germination progress (only for young plants, age < 17 weeks)
+          germination_progress <- NA_character_
+          if (!is.na(age_weeks) && age_weeks >= 0 && age_weeks < 17) {
+            pct_ci_ge_threshold <- sum(ci_vals >= GERMINATION_CI_THRESHOLD) / length(ci_vals) * 100
+            germination_progress <- sprintf("%.1f%%", pct_ci_ge_threshold)
+          }
+
+          # Store new field result
+          results_list[[length(results_list) + 1]] <- data.frame(
+            Field_id = field_id,
+            Mean_CI = round(mean_ci, 2),
+            CV = round(cv, 4),
+            CI_range = range_str,
+            CI_Percentiles = ci_percentiles_str,
+            Cloud_pct_clear = pct_clear,
+            Cloud_category = cloud_cat,
+            Age_week = round(age_weeks, 1),
+            Phase = phase,
+            Germination_progress = germination_progress,
+            stringsAsFactors = FALSE
+          )
+
+          if (!is.na(field_key)) {
+            seen_field_ids <- c(seen_field_ids, field_key)
+          }
+        }
+
+        message(paste(" Tile", tile_idx, "of", length(tile_files), "processed"))
+      }
+
+    }, error = function(e) {
+      message(paste(" [ERROR] Tile", basename(tile_file), ":", e$message))
+    })
+  }
+
+  if (length(results_list) == 0) {
+    stop(paste("No fields processed successfully for week", week_num))
+  }
+
+  stats_df <- dplyr::bind_rows(results_list)
+  message(paste(" ✓ Successfully calculated statistics for", nrow(stats_df), "fields"))
+
+  return(stats_df)
+}
+
+# ============================================================================
+# CALCULATE KPI TRENDS (Requires previous week RDS)
+# ============================================================================
+
+# Add week-over-week trend columns to the current week's field statistics.
+#
+# Arguments:
+#   current_stats  data.frame with Field_id, Mean_CI, CV and Phase columns.
+#   prev_stats     Same structure for the previous week, or NULL / zero rows
+#                  when no previous week is available. May or may not carry
+#                  an nmr_weeks_in_this_phase column.
+#
+# Returns current_stats with three added columns:
+#   Weekly_ci_change         current Mean_CI - previous Mean_CI (NA if unknown)
+#   CV_Trend_Short_Term      current CV - previous CV (NA if unknown)
+#   nmr_weeks_in_this_phase  previous counter + 1 when the phase is unchanged,
+#                            otherwise 1
+calculate_kpi_trends <- function(current_stats, prev_stats = NULL) {
+
+  message("Calculating KPI trends from current and previous week data")
+
+  # Initialize new columns with defaults
+  current_stats$Weekly_ci_change <- NA_real_
+  current_stats$CV_Trend_Short_Term <- NA_real_
+  current_stats$nmr_weeks_in_this_phase <- 1L
+
+  # If no previous week data, return with defaults
+  if (is.null(prev_stats) || nrow(prev_stats) == 0) {
+    message(" No previous week data available - using defaults")
+    return(current_stats)
+  }
+
+  # Row of each current field in prev_stats (NA when absent). Matching on
+  # character keys keeps numeric Field_id values from being used as
+  # positional indices.
+  prev_rows <- match(as.character(current_stats$Field_id),
+                     as.character(prev_stats$Field_id))
+  prev_rows[is.na(current_stats$Field_id)] <- NA_integer_
+
+  # The cached weekly statistics are saved before the trend columns are
+  # added, so the previous week's phase counter is usually not present.
+  has_prev_phase_weeks <- "nmr_weeks_in_this_phase" %in% names(prev_stats)
+
+  # For each field in current week, lookup previous values
+  for (i in seq_len(nrow(current_stats))) {
+    prev_idx <- prev_rows[i]
+
+    if (is.na(prev_idx)) {
+      next
+    }
+
+    # Field exists in previous week
+    prev_row <- prev_stats[prev_idx, ]
+
+    # Weekly CI change (current Mean_CI - previous Mean_CI)
+    if (!is.na(prev_row$Mean_CI) && !is.na(current_stats$Mean_CI[i])) {
+      current_stats$Weekly_ci_change[i] <-
+        round(current_stats$Mean_CI[i] - prev_row$Mean_CI, 2)
+    }
+
+    # CV short-term trend (current CV - previous CV)
+    if (!is.na(prev_row$CV) && !is.na(current_stats$CV[i])) {
+      current_stats$CV_Trend_Short_Term[i] <-
+        round(current_stats$CV[i] - prev_row$CV, 4)
+    }
+
+    # Weeks in current phase (track phase transitions)
+    if (!is.na(current_stats$Phase[i]) && !is.na(prev_row$Phase)) {
+      if (current_stats$Phase[i] == prev_row$Phase) {
+        # Same phase - increment counter (assume 1 week when the previous
+        # week carries no counter)
+        prev_weeks <- 1L
+        if (has_prev_phase_weeks && !is.na(prev_row$nmr_weeks_in_this_phase)) {
+          prev_weeks <- as.integer(prev_row$nmr_weeks_in_this_phase)
+        }
+        current_stats$nmr_weeks_in_this_phase[i] <- prev_weeks + 1L
+      } else {
+        # Phase changed - reset to 1
+        current_stats$nmr_weeks_in_this_phase[i] <- 1L
+      }
+    }
+  }
+
+  message(" Calculated trends for all fields")
+  return(current_stats)
+}
+
+# ============================================================================ +# LOAD OR CALCULATE WEEKLY STATISTICS (RDS Caching) +# ============================================================================ + +load_or_calculate_weekly_stats <- function(week_num, year, project_dir, field_boundaries_sf, + mosaic_dir, reports_dir, report_date = Sys.Date()) { + + # Build RDS file path + rds_filename <- sprintf("%s_field_stats_week%02d.rds", project_dir, week_num) + rds_path <- file.path(reports_dir, "kpis", "field_stats", rds_filename) + + # Try to load existing RDS (fast cache) + if (file.exists(rds_path)) { + message(paste("Loading cached statistics from:", basename(rds_path))) + return(readRDS(rds_path)) + } + + # RDS not found - calculate from tiles + message(paste("Cached RDS not found, calculating statistics from tiles for week", week_num)) + stats_df <-
calculate_field_statistics(field_boundaries_sf, week_num, year, + mosaic_dir, report_date) + + # Create output directory if needed + output_dir <- file.path(reports_dir, "kpis", "field_stats") + if (!dir.exists(output_dir)) { + dir.create(output_dir, recursive = TRUE, showWarnings = FALSE) + } + + # Export RDS (for fast lookup next week) + saveRDS(stats_df, rds_path) + message(paste("Saved weekly statistics RDS:", basename(rds_path))) + + # Export CSV (for user review) + csv_filename <- sprintf("%s_field_stats_week%02d.csv", project_dir, week_num) + csv_path <- file.path(output_dir, csv_filename) + readr::write_csv(stats_df, csv_path) + message(paste("Saved weekly statistics CSV:", basename(csv_path))) + + return(stats_df) +} + # ============================================================================ # PARALLEL FIELD ANALYSIS FUNCTION # ============================================================================ @@ -1316,248 +1550,116 @@ main <- function() { } # SCRIPT 20 APPROACH: Loop through tiles, extract all fields from each tile - message("\nProcessing tiles and extracting field statistics...") - all_tile_results <- list() + # ============================================================================ + # NEW MODULAR APPROACH: Load/Calculate weekly stats, apply trends + # ============================================================================ - for (i in seq_along(tile_files)) { - tile_file <- tile_files[i] - message(paste(" Processing tile", i, "of", length(tile_files), ":", basename(tile_file))) - - tryCatch({ - # Load current tile and previous week tile - current_rast <- terra::rast(tile_file) - - # DEBUG: Check tile structure on first tile - if (i == 1) { - message(paste(" [DEBUG] Tile CRS:", terra::crs(current_rast))) - message(paste(" [DEBUG] Tile extent:", paste(terra::ext(current_rast)))) - message(paste(" [DEBUG] Field boundaries CRS:", sf::st_crs(field_boundaries_sf))) - field_bbox <- sf::st_bbox(field_boundaries_sf) - message(paste(" 
[DEBUG] Field bbox:", paste(round(field_bbox, 2)))) - message(paste(" [DEBUG] Band names:", paste(names(current_rast), collapse=", "))) - } - - # Extract CI band by name - ci_band <- current_rast[["CI"]] - - # Check if CI band exists - use proper logical checks - if (is.null(ci_band) || !inherits(ci_band, "SpatRaster")) { - message(paste(" ERROR: CI band not found. Available bands:", paste(names(current_rast), collapse=", "))) - next - } - - # Check if CI band has any valid data - if (tryCatch(all(is.na(values(ci_band))), error = function(e) TRUE)) { - message(paste(" ERROR: CI band has no valid data")) - next - } - - # Load previous week tile if available - previous_tile_file <- sub(sprintf("week_%02d", current_week), - sprintf("week_%02d", previous_week), - tile_file) - previous_ci <- NULL - if (file.exists(previous_tile_file)) { - previous_rast <- terra::rast(previous_tile_file) - previous_ci <- previous_rast[["CI"]] - } - - # OPTION 1 + 2: Extract all CI statistics from one pixel extraction (single call) - current_stats <- extract_field_statistics_from_ci(ci_band, field_boundaries_sf) - - # DEBUG: Check extraction result on first tile - if (i == 1) { - num_with_data <- sum(!is.na(current_stats$mean_ci)) - message(paste(" [DEBUG] Extracted", nrow(current_stats), "fields, ", num_with_data, "with non-NA data")) - if (num_with_data > 0) { - message(paste(" [DEBUG] Sample mean CIs:", paste(head(current_stats$mean_ci[!is.na(current_stats$mean_ci)], 3), collapse=", "))) - # Check percentiles - sample_field <- which(!is.na(current_stats$mean_ci))[2] - message(paste(" [DEBUG] Field", sample_field, "- p10:", current_stats$p10[sample_field], - "p90:", current_stats$p90[sample_field], - "min:", current_stats$min_ci[sample_field], - "max:", current_stats$max_ci[sample_field], - "valid_pixels:", current_stats$pixel_count_valid[sample_field])) - } - } - - # Extract previous week CI statistics if available - previous_stats <- NULL - if (!is.null(previous_ci)) { - 
previous_stats <- extract_field_statistics_from_ci(previous_ci, field_boundaries_sf) - } - - # Process each field that was extracted - field_results_this_tile <- list() - fields_added <- 0 - - for (field_idx in seq_len(nrow(field_boundaries_sf))) { - tryCatch({ - field_id <- field_boundaries_sf$field[field_idx] - field_sf <- field_boundaries_sf[field_idx, ] - - # Get statistics from helper function results - # current_stats should have same number of rows as field_boundaries_sf - if (field_idx > nrow(current_stats)) { - message(paste(" [ERROR] field_idx", field_idx, "> nrow(current_stats)", nrow(current_stats))) - next - } - - mean_ci_current <- current_stats$mean_ci[field_idx] - - # SKIP fields with no data in this tile (they don't intersect this tile) - if (is.na(current_stats$pixel_count_valid[field_idx]) || current_stats$pixel_count_valid[field_idx] == 0) { - next - } - ci_cv_current <- current_stats$cv[field_idx] - ci_percentile_low <- current_stats$p10[field_idx] - ci_percentile_high <- current_stats$p90[field_idx] - - # If field doesn't intersect this tile, mean_ci_current will be NA - if (is.na(mean_ci_current)) { - next # Skip this field - doesn't intersect this tile - } - + # Build tile grid (needed by calculate_field_statistics) + message("\nBuilding tile grid for current week...") + tile_grid <- build_tile_grid(mosaic_dir, current_week, year) + + message("\nUsing modular RDS-based approach for weekly statistics...") + + # Load/calculate CURRENT week stats (from tiles or RDS cache) + message("\n1. 
Loading/calculating CURRENT week statistics (week", current_week, ")...") + current_stats <- load_or_calculate_weekly_stats( + week_num = current_week, + year = year, + project_dir = project_dir, + field_boundaries_sf = field_boundaries_sf, + mosaic_dir = tile_grid$mosaic_dir, + reports_dir = reports_dir, + report_date = end_date + ) + + message(paste(" ✓ Loaded/calculated stats for", nrow(current_stats), "fields in current week")) + + # Load/calculate PREVIOUS week stats (from RDS cache or tiles) + message("\n2. Loading/calculating PREVIOUS week statistics (week", previous_week, ")...") + prev_stats <- load_or_calculate_weekly_stats( + week_num = previous_week, + year = year, + project_dir = project_dir, + field_boundaries_sf = field_boundaries_sf, + mosaic_dir = tile_grid$mosaic_dir, + reports_dir = reports_dir, + report_date = end_date - 7 # Approximately 1 week before + ) + + message(paste(" ✓ Loaded/calculated stats for", nrow(prev_stats), "fields in previous week")) + + # Apply trend calculations (requires both weeks) + message("\n3. 
Calculating trend columns...") + current_stats <- calculate_kpi_trends(current_stats, prev_stats) + + message(paste(" ✓ Added Weekly_ci_change, CV_Trend_Short_Term, nmr_weeks_in_this_phase")) + + # ============================================================================ + # Build final output dataframe with all 21 columns + # ============================================================================ + + message("\nBuilding final field analysis output...") + + field_analysis_df <- current_stats %>% + mutate( + # Column 2: Farm_Section (user fills) + Farm_Section = NA_character_, + # Column 3: Field_name (from GeoJSON - already have Field_id, can look up) + Field_name = Field_id, + # Column 4: Acreage (calculate from geometry) + Acreage = { + acreages <- sapply(seq_len(nrow(field_boundaries_sf)), function(idx) { + field_sf <- field_boundaries_sf[idx, ] field_area_ha <- as.numeric(sf::st_area(field_sf)) / 10000 - field_area_acres <- field_area_ha / 0.404686 - - # Extract previous week CI if available - mean_ci_previous <- NA - ci_change <- NA - if (!is.null(previous_stats)) { - mean_ci_previous <- previous_stats$mean_ci[field_idx] - if (!is.na(mean_ci_previous)) { - ci_change <- mean_ci_current - mean_ci_previous - } - } - - # Reconstruct pixel values for status trigger (we need the actual pixel array) - # Use the percentiles and mean to create a synthetic distribution for status_trigger - # For now, use mean CI repeated by pixel count for testing - # TODO: Consider extracting pixels directly if needed for more complex triggers - pixel_count_valid <- current_stats$pixel_count_valid[field_idx] - ci_vals_current <- if (!is.na(pixel_count_valid) && pixel_count_valid > 0) { - rep(mean_ci_current, pixel_count_valid) # Simplified: use mean value repeated - } else { - numeric(0) - } - - # Calculate age - age_weeks <- if (!is.null(planting_dates) && nrow(planting_dates) > 0 && field_idx <= nrow(planting_dates)) { - planting_date <- planting_dates$date[field_idx] - if 
(!is.na(planting_date)) { - as.numeric(difftime(end_date, planting_date, units = "weeks")) - } else { - 0 - } - } else { - 0 - } - - # Get phase and status - phase <- get_phase_by_age(age_weeks) - status_trigger <- get_status_trigger(ci_vals_current, ci_change, age_weeks) - - # Calculate Cloud_pct_clear: percentage of field with valid data - # Binned to 10% intervals (0, 10, 20, ..., 90, 100) - cloud_pct_clear <- { - valid_count <- current_stats$pixel_count_valid[field_idx] - total_count <- current_stats$pixel_count_total[field_idx] - if (!is.na(valid_count) && !is.na(total_count) && total_count > 0) { - pct <- (valid_count / total_count) * 100 - round(pct / 10) * 10 - } else { - NA_real_ - } - } - - # Cloud categorization based on pixel coverage (Cloud_pct_clear) - cloud_category <- if (is.na(cloud_pct_clear)) { - "No image available" - } else if (cloud_pct_clear >= 90) { - "Clear view" - } else if (cloud_pct_clear > 0) { - "Partial coverage" - } else { - "No image available" - } - - # Get min/max CI values - ci_min <- current_stats$min_ci[field_idx] - ci_max <- current_stats$max_ci[field_idx] - ci_range <- if (!is.na(ci_min) && !is.na(ci_max)) { - sprintf("%.1f-%.1f", ci_min, ci_max) - } else { - NA_character_ - } - - # Get field_name from field_boundaries_sf - field_name <- field_sf$field - - # Build result row (21 columns per specification, in order) - result_row <- data.frame( - Field_id = field_id, - Farm_Section = NA_character_, - Field_name = field_name, - Acreage = field_area_acres, - Mean_CI = mean_ci_current, - Weekly_ci_change = ci_change, - Four_week_trend = NA_character_, - Last_harvest_or_planting_date = UNIFORM_PLANTING_DATE, - Age_weeks = age_weeks, - Phase = phase, - nmr_weeks_in_this_phase = NA_real_, - Germination_progress = NA_real_, - Imminent_prob = NA_real_, - Status_trigger = status_trigger, - CI_range = ci_range, - CI_Percentiles = if (!is.na(ci_percentile_low) && !is.na(ci_percentile_high)) { - sprintf("%.1f-%.1f", ci_percentile_low, 
ci_percentile_high) - } else { - NA_character_ - }, - CV = ci_cv_current, - CV_Trend_Short_Term = NA_real_, - CV_Trend_Long_Term = NA_real_, - Cloud_pct_clear = cloud_pct_clear, - Cloud_category = cloud_category, - stringsAsFactors = FALSE - ) - - field_results_this_tile[[as.character(field_id)]] <- result_row - fields_added <- fields_added + 1 - - }, error = function(e) { - # Show error for debugging - message(paste(" [FIELD ERROR] Field", field_idx, ":", e$message)) + field_area_ha / 0.404686 }) - } - - if (length(field_results_this_tile) > 0) { - all_tile_results[[basename(tile_file)]] <- dplyr::bind_rows(field_results_this_tile) - message(paste(" Extracted", length(field_results_this_tile), "fields from tile (processed", fields_added, "fields total)")) - } else { - message(paste(" WARNING: No fields extracted from this tile (processed", fields_added, "fields, all either NA or errored)")) - } - - }, error = function(e) { - message(paste(" Error processing tile", basename(tile_file), ":", e$message)) - }) - } + acreages[match(Field_id, field_boundaries_sf$field)] + }, + # Columns 5-6: Already in current_stats (Mean_CI, Weekly_ci_change) + # Column 7: Four_week_trend (Phase 3 future) + Four_week_trend = NA_character_, + # Column 8: Last_harvest_or_planting_date (dummy for now) + Last_harvest_or_planting_date = UNIFORM_PLANTING_DATE, + # Columns 9-10: Already in current_stats (Age_week, Phase) + # Column 11: nmr_weeks_in_this_phase (already calculated) + # Column 12: Germination_progress (already calculated) + # Column 13: Imminent_prob (placeholder) + Imminent_prob = "placeholder data", + # Column 14: Status_trigger (need to add) + Status_trigger = { + triggers <- sapply(seq_len(nrow(current_stats)), function(idx) { + field_id <- current_stats$Field_id[idx] + field_idx <- which(field_boundaries_sf$field == field_id)[1] + if (is.na(field_idx)) return(NA_character_) + + # Reconstruct CI values from Mean_CI for status trigger logic + # For now, use simplified 
approach + age_w <- current_stats$Age_week[idx] + ci_change <- current_stats$Weekly_ci_change[idx] + + # Using mean CI as proxy (could be improved with pixel distribution) + ci_vals <- rep(current_stats$Mean_CI[idx], 100) + get_status_trigger(ci_vals, ci_change, age_w) + }) + triggers + }, + # Columns 15-16: Already in current_stats (CI_range, CI_Percentiles) + # Column 17: Already in current_stats (CV) + # Column 18: Already in current_stats (CV_Trend_Short_Term) + # Column 19: CV_Trend_Long_Term (Phase 3 future) + CV_Trend_Long_Term = NA_real_, + # Columns 20-21: Already in current_stats (Cloud_pct_clear, Cloud_category) + .keep = "all" # Keep all existing columns + ) %>% + select( + Field_id, Farm_Section, Field_name, Acreage, Mean_CI, Weekly_ci_change, + Four_week_trend, Last_harvest_or_planting_date, Age_week, Phase, + nmr_weeks_in_this_phase, Germination_progress, Imminent_prob, Status_trigger, + CI_range, CI_Percentiles, CV, CV_Trend_Short_Term, CV_Trend_Long_Term, + Cloud_pct_clear, Cloud_category + ) - # Combine all tile results, keeping unique fields (may appear in multiple tiles) - if (length(all_tile_results) == 0) { - stop("No fields extracted from any tiles!") - } - - field_analysis_df <- dplyr::bind_rows(all_tile_results) %>% - distinct(Field_id, .keep_all = TRUE) - - if (nrow(field_analysis_df) == 0) { - stop("No fields analyzed successfully!") - } - - message(paste("✓ Analyzed", nrow(field_analysis_df), "fields")) + message(paste("✓ Built final output with", nrow(field_analysis_df), "fields and 21 columns")) summary_statistics_df <- generate_field_analysis_summary(field_analysis_df) From 8d84c8cab550dbd59f5a4aec87935a6d9ae9afa5 Mon Sep 17 00:00:00 2001 From: Timon Date: Fri, 16 Jan 2026 08:30:47 +0100 Subject: [PATCH 07/15] phase 2 cv trend implemented --- r_app/80_calculate_kpis.R | 202 ++++++++++++++++++++++++++++++++------ 1 file changed, 172 insertions(+), 30 deletions(-) diff --git a/r_app/80_calculate_kpis.R b/r_app/80_calculate_kpis.R index 
3786f0e..9d02fa4 100644 --- a/r_app/80_calculate_kpis.R +++ b/r_app/80_calculate_kpis.R @@ -98,6 +98,10 @@ TEST_MODE_NUM_WEEKS <- 2 # Percentage of pixels that must reach this CI value to count as "germinated" GERMINATION_CI_THRESHOLD <- 2.0 # Pixels with CI >= 2 count as germinated +# FOR TESTING: Set these fields as "recently planted" to demonstrate germination progress +YOUNG_FIELDS_FOR_TESTING <- c("1", "2", "3", "4", "5", "6", "7", "8", "9", "10") # First 10 field IDs +YOUNG_FIELD_PLANTING_DATE <- as.Date("2026-01-01") # Recently planted for demo + # FOUR-WEEK TREND THRESHOLDS FOUR_WEEK_TREND_STRONG_GROWTH_MIN <- 0.5 FOUR_WEEK_TREND_GROWTH_MIN <- 0.1 @@ -543,6 +547,10 @@ calculate_field_statistics <- function(field_boundaries_sf, week_num, year, message(paste("Calculating statistics for all fields - Week", week_num, year)) + # Debug: Check if constants are available + message(paste(" DEBUG: YOUNG_FIELDS_FOR_TESTING =", paste(YOUNG_FIELDS_FOR_TESTING, collapse=", "))) + message(paste(" DEBUG: YOUNG_FIELD_PLANTING_DATE =", YOUNG_FIELD_PLANTING_DATE)) + # Build tile file list tile_pattern <- sprintf("week_%02d_%d_([0-9]{2})\\.tif", week_num, year) tile_files <- list.files(mosaic_dir, pattern = tile_pattern, full.names = TRUE) @@ -609,10 +617,19 @@ calculate_field_statistics <- function(field_boundaries_sf, week_num, year, else "Partial coverage" # Age and Phase - age_weeks <- if (USE_UNIFORM_AGE) { - as.numeric(difftime(report_date, UNIFORM_PLANTING_DATE, units = "weeks")) - } else { - NA_real_ + age_weeks <- NA_real_ + if (USE_UNIFORM_AGE) { + # Check if this field is in the "young fields" list (for testing germination progress) + is_young_field <- field_id %in% YOUNG_FIELDS_FOR_TESTING + if (is_young_field) { + age_weeks <- as.numeric(difftime(report_date, YOUNG_FIELD_PLANTING_DATE, units = "weeks")) + # Debug for first 2 matches + if (field_id %in% c("1", "2")) { + message(paste(" DEBUG: Field", field_id, "is young field, age =", round(age_weeks, 2), "weeks")) 
+ } + } else { + age_weeks <- as.numeric(difftime(report_date, UNIFORM_PLANTING_DATE, units = "weeks")) + } } phase <- get_phase_by_age(age_weeks) @@ -686,49 +703,138 @@ calculate_kpi_trends <- function(current_stats, prev_stats = NULL) { return(current_stats) } + message(paste(" prev_stats has", nrow(prev_stats), "rows and", ncol(prev_stats), "columns")) + message(paste(" prev_stats columns:", paste(names(prev_stats), collapse = ", "))) + # Build lookup indices for previous week (by Field_id) prev_lookup <- setNames(seq_len(nrow(prev_stats)), prev_stats$Field_id) + # Try to load previous week's field_analysis to get nmr_weeks_in_this_phase history + prev_field_analysis <- NULL + prev_analysis_csv <- file.path( + reports_dir, "kpis", "field_analysis", + sprintf("%s_field_analysis_week%02d.csv", + paste(strsplit(current_stats$Field_id[1], "")[[1]][1], collapse=""), # Get project from field + as.numeric(format(Sys.Date() - 7, "%V"))) # Approximate previous week + ) + + # Better way: construct the previous week number properly + current_week_num <- as.numeric(format(Sys.Date(), "%V")) + prev_week_num <- current_week_num - 1 + if (prev_week_num < 1) prev_week_num <- 52 + + # This is a bit tricky - we need the project_dir from the main scope + # For now, assume we can infer it or pass it through + # Let's use a simpler approach: look for any field_analysis_week* file that's recent + + tryCatch({ + analysis_dir <- file.path(reports_dir, "kpis", "field_analysis") + if (dir.exists(analysis_dir)) { + # Find the most recent field_analysis CSV (should be previous week) + analysis_files <- list.files(analysis_dir, pattern = "_field_analysis_week.*\\.csv$", full.names = TRUE) + if (length(analysis_files) > 0) { + # Sort by modification time and get the most recent + recent_file <- analysis_files[which.max(file.info(analysis_files)$mtime)] + prev_field_analysis <- readr::read_csv(recent_file, show_col_types = FALSE, + col_select = c(Field_id, nmr_weeks_in_this_phase, Phase)) + } 
+ } + }, error = function(e) { + message(paste(" Note: Could not load previous field_analysis for nmr_weeks tracking:", e$message)) + }) + + if (!is.null(prev_field_analysis) && nrow(prev_field_analysis) > 0) { + message(paste(" Using previous field_analysis to track nmr_weeks_in_this_phase")) + } + # For each field in current week, lookup previous values + cv_trends_calculated <- 0 for (i in seq_len(nrow(current_stats))) { field_id <- current_stats$Field_id[i] prev_idx <- prev_lookup[field_id] if (!is.na(prev_idx) && prev_idx > 0 && prev_idx <= nrow(prev_stats)) { - # Field exists in previous week - prev_row <- prev_stats[prev_idx, ] + # Field exists in previous week - extract row carefully + prev_row <- prev_stats[prev_idx, , drop = FALSE] # Keep as dataframe + + if (nrow(prev_row) == 0) { + # Field not found - skip + next + } + + # Access values from single-row dataframe + prev_mean_ci <- prev_row$Mean_CI[1] + prev_cv <- prev_row$CV[1] + prev_phase <- prev_row$Phase[1] # Weekly CI change (current Mean_CI - previous Mean_CI) - if (!is.na(prev_row$Mean_CI) && !is.na(current_stats$Mean_CI[i])) { + if (!is.na(prev_mean_ci) && !is.na(current_stats$Mean_CI[i])) { current_stats$Weekly_ci_change[i] <- - round(current_stats$Mean_CI[i] - prev_row$Mean_CI, 2) + round(current_stats$Mean_CI[i] - prev_mean_ci, 2) } # CV short-term trend (current CV - previous CV) - if (!is.na(prev_row$CV) && !is.na(current_stats$CV[i])) { - current_stats$CV_Trend_Short_Term[i] <- - round(current_stats$CV[i] - prev_row$CV, 4) + # DEBUG: Check first few fields + if (i <= 3) { + message(paste(" Field", field_id, "- CV_prev:", prev_cv, "CV_curr:", current_stats$CV[i])) + } + + if (!is.na(prev_cv) && !is.na(current_stats$CV[i])) { + trend_val <- round(current_stats$CV[i] - prev_cv, 4) + current_stats$CV_Trend_Short_Term[i] <- trend_val + cv_trends_calculated <- cv_trends_calculated + 1 + + if (i <= 3) { + message(paste(" -> CV_Trend =", trend_val)) + } } # Weeks in current phase (track phase 
transitions) - if (!is.na(current_stats$Phase[i]) && !is.na(prev_row$Phase)) { - if (current_stats$Phase[i] == prev_row$Phase) { - # Same phase - increment counter - prev_weeks <- if (!is.na(prev_row$nmr_weeks_in_this_phase)) { - prev_row$nmr_weeks_in_this_phase - } else { - 1 + # Use previous field_analysis if available for proper counter progression + if (!is.null(prev_field_analysis) && nrow(prev_field_analysis) > 0) { + # Look up this field in previous analysis + prev_analysis_row <- prev_field_analysis %>% + dplyr::filter(Field_id == field_id) + + if (nrow(prev_analysis_row) > 0) { + prev_phase_analysis <- prev_analysis_row$Phase[1] + prev_nmr_weeks_analysis <- prev_analysis_row$nmr_weeks_in_this_phase[1] + + if (!is.na(current_stats$Phase[i]) && !is.na(prev_phase_analysis)) { + if (current_stats$Phase[i] == prev_phase_analysis) { + # Same phase - increment the counter + current_stats$nmr_weeks_in_this_phase[i] <- + if (!is.na(prev_nmr_weeks_analysis)) prev_nmr_weeks_analysis + 1L else 2L + } else { + # Phase changed - reset to 1 + current_stats$nmr_weeks_in_this_phase[i] <- 1L + } + } + } else if (!is.na(current_stats$Phase[i]) && !is.na(prev_phase)) { + # Field not in previous analysis, fall back to prev_stats phase comparison + if (current_stats$Phase[i] == prev_phase) { + current_stats$nmr_weeks_in_this_phase[i] <- 2L + } else { + current_stats$nmr_weeks_in_this_phase[i] <- 1L + } + } + } else { + # No previous field_analysis available - use phase from prev_stats + if (!is.na(current_stats$Phase[i]) && !is.na(prev_phase)) { + if (current_stats$Phase[i] == prev_phase) { + # Same phase - increment counter (start with 2) + current_stats$nmr_weeks_in_this_phase[i] <- 2L + } else { + # Phase changed - reset to 1 + current_stats$nmr_weeks_in_this_phase[i] <- 1L } - current_stats$nmr_weeks_in_this_phase[i] <- prev_weeks + 1L - } else { - # Phase changed - reset to 1 - current_stats$nmr_weeks_in_this_phase[i] <- 1L } } } } - message(" Calculated trends for all 
fields") + message(paste(" ✓ Calculated", cv_trends_calculated, "CV_Trend_Short_Term values out of", nrow(current_stats), "fields")) + message(paste(" CV_Trend_Short_Term non-NA values:", sum(!is.na(current_stats$CV_Trend_Short_Term)))) return(current_stats) } @@ -1576,6 +1682,10 @@ main <- function() { # Load/calculate PREVIOUS week stats (from RDS cache or tiles) message("\n2. Loading/calculating PREVIOUS week statistics (week", previous_week, ")...") + + # Calculate report date for previous week (7 days before current) + prev_report_date <- end_date - 7 + prev_stats <- load_or_calculate_weekly_stats( week_num = previous_week, year = year, @@ -1583,10 +1693,12 @@ main <- function() { field_boundaries_sf = field_boundaries_sf, mosaic_dir = tile_grid$mosaic_dir, reports_dir = reports_dir, - report_date = end_date - 7 # Approximately 1 week before + report_date = prev_report_date ) message(paste(" ✓ Loaded/calculated stats for", nrow(prev_stats), "fields in previous week")) + message(paste(" Columns in prev_stats:", paste(names(prev_stats), collapse = ", "))) + message(paste(" CV column non-NA values in prev_stats:", sum(!is.na(prev_stats$CV)))) # Apply trend calculations (requires both weeks) message("\n3. 
Calculating trend columns...") @@ -1600,6 +1712,40 @@ main <- function() { message("\nBuilding final field analysis output...") + # Pre-calculate acreages with geometry validation + # This avoids geometry errors during field_analysis construction + acreage_lookup <- tryCatch({ + lookup_df <- field_boundaries_sf %>% + sf::st_drop_geometry() %>% + as.data.frame() %>% + mutate( + geometry_valid = sapply(seq_len(nrow(field_boundaries_sf)), function(idx) { + tryCatch({ + sf::st_is_valid(field_boundaries_sf[idx, ]) + }, error = function(e) FALSE) + }), + area_ha = 0 + ) + + # Calculate area for valid geometries + for (idx in which(lookup_df$geometry_valid)) { + tryCatch({ + area_m2 <- as.numeric(sf::st_area(field_boundaries_sf[idx, ])) + lookup_df$area_ha[idx] <- area_m2 / 10000 + }, error = function(e) { + lookup_df$area_ha[idx] <<- NA_real_ + }) + } + + # Convert hectares to acres + lookup_df %>% + mutate(acreage = area_ha / 0.404686) %>% + select(field, acreage) + }, error = function(e) { + message(paste("Warning: Could not calculate acreages from geometries -", e$message)) + data.frame(field = character(0), acreage = numeric(0)) + }) + field_analysis_df <- current_stats %>% mutate( # Column 2: Farm_Section (user fills) @@ -1608,12 +1754,8 @@ main <- function() { Field_name = Field_id, # Column 4: Acreage (calculate from geometry) Acreage = { - acreages <- sapply(seq_len(nrow(field_boundaries_sf)), function(idx) { - field_sf <- field_boundaries_sf[idx, ] - field_area_ha <- as.numeric(sf::st_area(field_sf)) / 10000 - field_area_ha / 0.404686 - }) - acreages[match(Field_id, field_boundaries_sf$field)] + acreages_vec <- acreage_lookup$acreage[match(Field_id, acreage_lookup$field)] + if_else(is.na(acreages_vec), 0, acreages_vec) }, # Columns 5-6: Already in current_stats (Mean_CI, Weekly_ci_change) # Column 7: Four_week_trend (Phase 3 future) From 3e4430b3be330f3ed0c20b5a8e2ced3a45abd601 Mon Sep 17 00:00:00 2001 From: Timon Date: Fri, 16 Jan 2026 09:10:39 +0100 Subject: 
[PATCH 08/15] added last columns of trends, only harvest imminent and age left to do --- r_app/80_calculate_kpis.R | 362 +++++++++++++++++++++++++++++++------- 1 file changed, 301 insertions(+), 61 deletions(-) diff --git a/r_app/80_calculate_kpis.R b/r_app/80_calculate_kpis.R index 9d02fa4..4c657ca 100644 --- a/r_app/80_calculate_kpis.R +++ b/r_app/80_calculate_kpis.R @@ -26,6 +26,55 @@ # source("r_app/80_calculate_kpis.R") # main() +# ============================================================================ +# PENDING WORK - PHASE 4 (Next Sprint) +# ============================================================================ +# +# CRITICAL INTEGRATIONS: +# +# 1. IMMINENT_PROB FROM HARVEST MODEL (MODEL_307) +# [ ] Load script 31 output: {project}_imminent_harvest_week{WW}.csv +# Columns: field, imminent_prob, detected_prob, week, year +# [ ] LEFT JOIN to field_analysis_df by (field, week, year) +# [ ] Replace hardcoded "placeholder data" in Status_trigger calculation +# [ ] Update column to show actual harvest probability (0-1 or 0-100%) +# +# 2. AGE FROM HARVEST.XLSX (SCRIPTS 22 & 31) +# [ ] Scripts 22 & 31 populate harvest.xlsx with planting_date per field +# [ ] Load harvest.xlsx instead of using UNIFORM_PLANTING_DATE +# [ ] Calculate Age_week = difftime(report_date, planting_date, units="weeks") +# [ ] Removes TEST MODE hardcoding and enables field-specific aging +# +# UTILITY FILE REFACTORING (Improve code maintainability): +# +# 3. CREATE r_app/80_weekly_stats_utils.R +# [ ] Extract functions from lines 250-795 (calculation layer): +# - calculate_field_statistics() +# - calculate_kpi_trends() +# - load_or_calculate_weekly_stats() +# - Helper: load_tiles_for_field(), get_tile_ids_for_field() +# - Helper: extract_field_statistics_from_ci() +# [ ] Clean separation: DATA CALCULATION ONLY (no Excel export) +# [ ] Reusable by run_full_pipeline.R and other scripts +# +# 4. 
CREATE r_app/80_report_building_utils.R +# [ ] Extract functions from lines 1350-2100+ (output/reporting layer): +# - generate_field_analysis_summary() +# - export_field_analysis_excel() +# - calculate_and_export_farm_kpis() +# - Helper: categorize_*, get_*, round_* functions +# - Helper: get_phase_by_age(), get_status_trigger() +# [ ] Clean separation: OUTPUT/FORMATTING ONLY (consumes calculated stats) +# [ ] Reusable for alternative export formats (PDF, HTML, dashboard) +# +# TESTING PLAN: +# [ ] Verify 8-week historical data loads (currently TEST_MODE = 2 weeks only) +# [ ] Confirm Four_week_trend calculates from 1-4 weeks (graceful degradation) +# [ ] Confirm CV_Trend_Long_Term uses full 8-week regression (when available) +# [ ] Load script 31 output and validate imminent_prob population +# +# ============================================================================ + # ============================================================================ # EXCEL OUTPUT SPECIFICATION (21 COLUMNS) # ============================================================================ @@ -114,6 +163,20 @@ FOUR_WEEK_TREND_STRONG_DECLINE_MAX <- -0.5 # CV TREND THRESHOLDS CV_TREND_THRESHOLD_SIGNIFICANT <- 0.05 +# CV_TREND_LONG_TERM (8-WEEK SLOPE) INTERPRETATION THRESHOLDS +# Interpretation: Slope of CV over 8 weeks indicates field uniformity trend +# Negative slope = CV decreasing = field becoming MORE uniform = GOOD +# Positive slope = CV increasing = field becoming MORE patchy = BAD +# Near zero = Homogenous growth (all crops progressing equally) +CV_SLOPE_STRONG_IMPROVEMENT_MIN <- -0.05 # CV decreasing rapidly +CV_SLOPE_IMPROVEMENT_MIN <- -0.02 # Gradual synchronization +CV_SLOPE_IMPROVEMENT_MAX <- -0.005 # Becoming uniform +CV_SLOPE_HOMOGENOUS_MIN <- -0.005 # Stable, uniform growth +CV_SLOPE_HOMOGENOUS_MAX <- 0.005 # No change in uniformity +CV_SLOPE_PATCHINESS_MIN <- 0.005 # Minor divergence +CV_SLOPE_PATCHINESS_MAX <- 0.02 # Growing patchiness +CV_SLOPE_SEVERE_MIN <- 0.02 # 
Field fragmentation beginning + # CLOUD COVER ROUNDING INTERVALS CLOUD_INTERVALS <- c(0, 50, 60, 70, 80, 90, 100) @@ -404,6 +467,100 @@ calculate_cv_trend <- function(cv_current, cv_previous) { return(round(cv_current - cv_previous, 4)) } +calculate_four_week_trend <- function(mean_ci_values) { + #' Calculate four-week CI trend from available weeks + #' + #' Uses whatever weeks are available (1-4 weeks) to estimate trend + #' Returns difference between current (most recent) and oldest available week + #' + #' @param mean_ci_values vector of Mean_CI values in chronological order (oldest to newest) + #' @return numeric: CI difference (current - oldest), rounded to 2 decimals + + if (is.null(mean_ci_values) || length(mean_ci_values) == 0) { + return(NA_real_) + } + + # Remove NAs + ci_clean <- mean_ci_values[!is.na(mean_ci_values)] + + if (length(ci_clean) < 2) { + # Need at least 2 weeks to calculate trend + return(NA_real_) + } + + # Calculate difference: current - oldest + trend <- ci_clean[length(ci_clean)] - ci_clean[1] + return(round(trend, 2)) +} + +categorize_cv_slope <- function(slope) { + #' Categorize CV slope (8-week regression) into field uniformity interpretation + #' + #' Slope interpretation: + #' Negative slope = CV decreasing = field becoming MORE uniform = GOOD + #' Positive slope = CV increasing = field becoming MORE patchy = BAD + #' Near zero = Homogenous growth (all crops progressing equally) + #' + #' Categories: + #' - "Excellent uniformity": Slope <= -0.02 (CV decreasing, field synchronizing) + #' - "Homogenous growth": -0.02 < slope < 0.005 (stable, uniform growth) + #' - "Minor patchiness": 0.005 <= slope <= 0.02 (CV slowly increasing) + #' - "Severe fragmentation": slope > 0.02 (rapid CV increase, parts diverging) + #' + #' @param slope numeric: CV trend slope per week + #' @return character: Category string + + if (is.na(slope)) { + return(NA_character_) + } + + if (slope <= CV_SLOPE_IMPROVEMENT_MIN) { + return("Excellent uniformity") + } 
else if (slope < CV_SLOPE_HOMOGENOUS_MIN) { + return("Homogenous growth") + } else if (slope <= CV_SLOPE_HOMOGENOUS_MAX) { + return("Homogenous growth") + } else if (slope <= CV_SLOPE_PATCHINESS_MAX) { + return("Minor patchiness") + } else { + return("Severe fragmentation") + } +} + +calculate_cv_trend_long_term <- function(cv_values) { + #' Calculate 8-week CV trend via linear regression slope + #' + #' Fits linear regression to CV over available weeks (1-8) + #' Returns slope = rate of change per week + #' + #' @param cv_values vector of CV values in chronological order (oldest to newest) + #' @return numeric: Regression slope (CV change per week), rounded to 4 decimals + + if (is.null(cv_values) || length(cv_values) == 0) { + return(NA_real_) + } + + # Remove NAs + cv_clean <- cv_values[!is.na(cv_values)] + + if (length(cv_clean) < 2) { + # Need at least 2 weeks to fit a line + return(NA_real_) + } + + # Create week sequence matching data length + weeks <- seq_along(cv_clean) + + # Fit linear model + tryCatch({ + lm_fit <- lm(cv_clean ~ weeks) + slope <- coef(lm_fit)["weeks"] + return(round(as.numeric(slope), 4)) + }, error = function(e) { + return(NA_real_) + }) +} + # ============================================================================ # HELPER FUNCTIONS # ============================================================================ @@ -498,7 +655,7 @@ load_historical_field_data <- function(project_dir, current_week, reports_dir, n } USE_UNIFORM_AGE <- TRUE -UNIFORM_PLANTING_DATE <- as.Date("2025-01-01") +UNIFORM_PLANTING_DATE <- as.Date("2026-01-01") extract_planting_dates <- function(harvesting_data, field_boundaries_sf = NULL) { if (USE_UNIFORM_AGE) { @@ -688,13 +845,17 @@ calculate_field_statistics <- function(field_boundaries_sf, week_num, year, # CALCULATE KPI TRENDS (Requires previous week RDS) # ============================================================================ -calculate_kpi_trends <- function(current_stats, prev_stats = NULL) { 
+calculate_kpi_trends <- function(current_stats, prev_stats = NULL, + project_dir = NULL, reports_dir = NULL, + current_week = NULL, year = NULL) { message("Calculating KPI trends from current and previous week data") # Initialize new columns with defaults current_stats$Weekly_ci_change <- NA_real_ current_stats$CV_Trend_Short_Term <- NA_real_ + current_stats$Four_week_trend <- NA_real_ + current_stats$CV_Trend_Long_Term <- NA_real_ current_stats$nmr_weeks_in_this_phase <- 1L # If no previous week data, return with defaults @@ -704,36 +865,18 @@ calculate_kpi_trends <- function(current_stats, prev_stats = NULL) { } message(paste(" prev_stats has", nrow(prev_stats), "rows and", ncol(prev_stats), "columns")) - message(paste(" prev_stats columns:", paste(names(prev_stats), collapse = ", "))) # Build lookup indices for previous week (by Field_id) prev_lookup <- setNames(seq_len(nrow(prev_stats)), prev_stats$Field_id) # Try to load previous week's field_analysis to get nmr_weeks_in_this_phase history prev_field_analysis <- NULL - prev_analysis_csv <- file.path( - reports_dir, "kpis", "field_analysis", - sprintf("%s_field_analysis_week%02d.csv", - paste(strsplit(current_stats$Field_id[1], "")[[1]][1], collapse=""), # Get project from field - as.numeric(format(Sys.Date() - 7, "%V"))) # Approximate previous week - ) - - # Better way: construct the previous week number properly - current_week_num <- as.numeric(format(Sys.Date(), "%V")) - prev_week_num <- current_week_num - 1 - if (prev_week_num < 1) prev_week_num <- 52 - - # This is a bit tricky - we need the project_dir from the main scope - # For now, assume we can infer it or pass it through - # Let's use a simpler approach: look for any field_analysis_week* file that's recent tryCatch({ analysis_dir <- file.path(reports_dir, "kpis", "field_analysis") if (dir.exists(analysis_dir)) { - # Find the most recent field_analysis CSV (should be previous week) analysis_files <- list.files(analysis_dir, pattern = 
"_field_analysis_week.*\\.csv$", full.names = TRUE) if (length(analysis_files) > 0) { - # Sort by modification time and get the most recent recent_file <- analysis_files[which.max(file.info(analysis_files)$mtime)] prev_field_analysis <- readr::read_csv(recent_file, show_col_types = FALSE, col_select = c(Field_id, nmr_weeks_in_this_phase, Phase)) @@ -747,49 +890,139 @@ calculate_kpi_trends <- function(current_stats, prev_stats = NULL) { message(paste(" Using previous field_analysis to track nmr_weeks_in_this_phase")) } - # For each field in current week, lookup previous values + # ============================================================ + # PHASE 3: Load 4-8 weeks of historical data for trend calculations + # ============================================================ + + historical_4weeks <- list() + historical_8weeks <- list() + + if (!is.null(project_dir) && !is.null(reports_dir) && !is.null(current_week)) { + message(" Loading historical field_stats for 4-week and 8-week trends...") + + # Load up to 4 weeks back for four_week_trend + for (lookback in 1:4) { + target_week <- current_week - lookback + if (target_week < 1) target_week <- target_week + 52 + + rds_filename <- sprintf("%s_field_stats_week%02d.rds", project_dir, target_week) + rds_path <- file.path(reports_dir, "kpis", "field_stats", rds_filename) + + if (file.exists(rds_path)) { + tryCatch({ + stats_data <- readRDS(rds_path) + historical_4weeks[[length(historical_4weeks) + 1]] <- list( + week = target_week, + stats = stats_data + ) + }, error = function(e) { + message(paste(" Warning: Could not load week", target_week, ":", e$message)) + }) + } + } + + # Load up to 8 weeks back for cv_trend_long_term + for (lookback in 1:8) { + target_week <- current_week - lookback + if (target_week < 1) target_week <- target_week + 52 + + rds_filename <- sprintf("%s_field_stats_week%02d.rds", project_dir, target_week) + rds_path <- file.path(reports_dir, "kpis", "field_stats", rds_filename) + + if 
(file.exists(rds_path)) { + tryCatch({ + stats_data <- readRDS(rds_path) + historical_8weeks[[length(historical_8weeks) + 1]] <- list( + week = target_week, + stats = stats_data + ) + }, error = function(e) { + # Silently skip - we'll work with whatever weeks exist + }) + } + } + + if (length(historical_4weeks) > 0) { + message(paste(" Loaded", length(historical_4weeks), "weeks for 4-week trend")) + } + if (length(historical_8weeks) > 0) { + message(paste(" Loaded", length(historical_8weeks), "weeks for 8-week CV trend")) + } + } + + # For each field in current week, lookup previous values and calculate trends cv_trends_calculated <- 0 + four_week_trends_calculated <- 0 + cv_long_term_calculated <- 0 + for (i in seq_len(nrow(current_stats))) { field_id <- current_stats$Field_id[i] prev_idx <- prev_lookup[field_id] if (!is.na(prev_idx) && prev_idx > 0 && prev_idx <= nrow(prev_stats)) { - # Field exists in previous week - extract row carefully - prev_row <- prev_stats[prev_idx, , drop = FALSE] # Keep as dataframe + prev_row <- prev_stats[prev_idx, , drop = FALSE] - if (nrow(prev_row) == 0) { - # Field not found - skip - next - } - - # Access values from single-row dataframe - prev_mean_ci <- prev_row$Mean_CI[1] - prev_cv <- prev_row$CV[1] - prev_phase <- prev_row$Phase[1] - - # Weekly CI change (current Mean_CI - previous Mean_CI) - if (!is.na(prev_mean_ci) && !is.na(current_stats$Mean_CI[i])) { + # WEEKLY CI CHANGE + prev_ci <- prev_row$Mean_CI[1] + if (!is.na(prev_ci) && !is.na(current_stats$Mean_CI[i])) { current_stats$Weekly_ci_change[i] <- - round(current_stats$Mean_CI[i] - prev_mean_ci, 2) - } - - # CV short-term trend (current CV - previous CV) - # DEBUG: Check first few fields - if (i <= 3) { - message(paste(" Field", field_id, "- CV_prev:", prev_cv, "CV_curr:", current_stats$CV[i])) + round(current_stats$Mean_CI[i] - prev_ci, 2) } + # CV TREND SHORT TERM (2-week comparison) + prev_cv <- prev_row$CV[1] if (!is.na(prev_cv) && !is.na(current_stats$CV[i])) { - 
trend_val <- round(current_stats$CV[i] - prev_cv, 4) - current_stats$CV_Trend_Short_Term[i] <- trend_val + current_stats$CV_Trend_Short_Term[i] <- + calculate_cv_trend(current_stats$CV[i], prev_cv) cv_trends_calculated <- cv_trends_calculated + 1 + } + + # FOUR-WEEK TREND (if available) + if (length(historical_4weeks) > 0) { + ci_values_4week <- numeric() - if (i <= 3) { - message(paste(" -> CV_Trend =", trend_val)) + # Add oldest available weeks (reverse order to get oldest first) + for (hist_idx in rev(seq_along(historical_4weeks))) { + hist_data <- historical_4weeks[[hist_idx]]$stats + hist_field <- which(hist_data$Field_id == field_id) + if (length(hist_field) > 0 && !is.na(hist_data$Mean_CI[hist_field[1]])) { + ci_values_4week <- c(ci_values_4week, hist_data$Mean_CI[hist_field[1]]) + } + } + + # Add current week CI + ci_values_4week <- c(ci_values_4week, current_stats$Mean_CI[i]) + + if (length(ci_values_4week) >= 2) { + current_stats$Four_week_trend[i] <- calculate_four_week_trend(ci_values_4week) + four_week_trends_calculated <- four_week_trends_calculated + 1 } } - # Weeks in current phase (track phase transitions) + # CV TREND LONG TERM (8-week slope) + if (length(historical_8weeks) > 0) { + cv_values_8week <- numeric() + + # Add oldest available weeks (reverse order to get oldest first) + for (hist_idx in rev(seq_along(historical_8weeks))) { + hist_data <- historical_8weeks[[hist_idx]]$stats + hist_field <- which(hist_data$Field_id == field_id) + if (length(hist_field) > 0 && !is.na(hist_data$CV[hist_field[1]])) { + cv_values_8week <- c(cv_values_8week, hist_data$CV[hist_field[1]]) + } + } + + # Add current week CV + cv_values_8week <- c(cv_values_8week, current_stats$CV[i]) + + if (length(cv_values_8week) >= 2) { + slope <- calculate_cv_trend_long_term(cv_values_8week) + current_stats$CV_Trend_Long_Term[i] <- slope + cv_long_term_calculated <- cv_long_term_calculated + 1 + } + } + + # WEEKS IN CURRENT PHASE (track phase transitions) # Use previous 
field_analysis if available for proper counter progression if (!is.null(prev_field_analysis) && nrow(prev_field_analysis) > 0) { # Look up this field in previous analysis @@ -810,9 +1043,9 @@ calculate_kpi_trends <- function(current_stats, prev_stats = NULL) { current_stats$nmr_weeks_in_this_phase[i] <- 1L } } - } else if (!is.na(current_stats$Phase[i]) && !is.na(prev_phase)) { + } else if (!is.na(current_stats$Phase[i]) && !is.na(prev_row$Phase[1])) { # Field not in previous analysis, fall back to prev_stats phase comparison - if (current_stats$Phase[i] == prev_phase) { + if (current_stats$Phase[i] == prev_row$Phase[1]) { current_stats$nmr_weeks_in_this_phase[i] <- 2L } else { current_stats$nmr_weeks_in_this_phase[i] <- 1L @@ -820,9 +1053,9 @@ calculate_kpi_trends <- function(current_stats, prev_stats = NULL) { } } else { # No previous field_analysis available - use phase from prev_stats - if (!is.na(current_stats$Phase[i]) && !is.na(prev_phase)) { - if (current_stats$Phase[i] == prev_phase) { - # Same phase - increment counter (start with 2) + if (!is.na(current_stats$Phase[i]) && !is.na(prev_row$Phase[1])) { + if (current_stats$Phase[i] == prev_row$Phase[1]) { + # Same phase - increment counter (start with 2 since prev week was in this phase) current_stats$nmr_weeks_in_this_phase[i] <- 2L } else { # Phase changed - reset to 1 @@ -833,8 +1066,9 @@ calculate_kpi_trends <- function(current_stats, prev_stats = NULL) { } } - message(paste(" ✓ Calculated", cv_trends_calculated, "CV_Trend_Short_Term values out of", nrow(current_stats), "fields")) - message(paste(" CV_Trend_Short_Term non-NA values:", sum(!is.na(current_stats$CV_Trend_Short_Term)))) + message(paste(" ✓ Calculated CV_Trend_Short_Term:", cv_trends_calculated, "fields")) + message(paste(" ✓ Calculated Four_week_trend:", four_week_trends_calculated, "fields")) + message(paste(" ✓ Calculated CV_Trend_Long_Term:", cv_long_term_calculated, "fields")) return(current_stats) } @@ -1702,9 +1936,13 @@ main <- 
function() { # Apply trend calculations (requires both weeks) message("\n3. Calculating trend columns...") - current_stats <- calculate_kpi_trends(current_stats, prev_stats) + current_stats <- calculate_kpi_trends(current_stats, prev_stats, + project_dir = project_dir, + reports_dir = reports_dir, + current_week = current_week, + year = year) - message(paste(" ✓ Added Weekly_ci_change, CV_Trend_Short_Term, nmr_weeks_in_this_phase")) + message(paste(" ✓ Added Weekly_ci_change, CV_Trend_Short_Term, Four_week_trend, CV_Trend_Long_Term, nmr_weeks_in_this_phase")) # ============================================================================ # Build final output dataframe with all 21 columns @@ -1758,8 +1996,7 @@ main <- function() { if_else(is.na(acreages_vec), 0, acreages_vec) }, # Columns 5-6: Already in current_stats (Mean_CI, Weekly_ci_change) - # Column 7: Four_week_trend (Phase 3 future) - Four_week_trend = NA_character_, + # Column 7: Four_week_trend (from current_stats) # Column 8: Last_harvest_or_planting_date (dummy for now) Last_harvest_or_planting_date = UNIFORM_PLANTING_DATE, # Columns 9-10: Already in current_stats (Age_week, Phase) @@ -1788,8 +2025,11 @@ main <- function() { # Columns 15-16: Already in current_stats (CI_range, CI_Percentiles) # Column 17: Already in current_stats (CV) # Column 18: Already in current_stats (CV_Trend_Short_Term) - # Column 19: CV_Trend_Long_Term (Phase 3 future) - CV_Trend_Long_Term = NA_real_, + # Column 19: CV_Trend_Long_Term (from current_stats - raw slope value) + # Column 19b: CV_Trend_Long_Term_Category (categorical interpretation of slope) + CV_Trend_Long_Term_Category = { + sapply(current_stats$CV_Trend_Long_Term, categorize_cv_slope) + }, # Columns 20-21: Already in current_stats (Cloud_pct_clear, Cloud_category) .keep = "all" # Keep all existing columns ) %>% @@ -1797,7 +2037,7 @@ main <- function() { Field_id, Farm_Section, Field_name, Acreage, Mean_CI, Weekly_ci_change, Four_week_trend, 
Last_harvest_or_planting_date, Age_week, Phase, nmr_weeks_in_this_phase, Germination_progress, Imminent_prob, Status_trigger, - CI_range, CI_Percentiles, CV, CV_Trend_Short_Term, CV_Trend_Long_Term, + CI_range, CI_Percentiles, CV, CV_Trend_Short_Term, CV_Trend_Long_Term, CV_Trend_Long_Term_Category, Cloud_pct_clear, Cloud_category ) From 7975f8ad0643109a6532d1487bb26acbad408168 Mon Sep 17 00:00:00 2001 From: Timon Date: Sun, 18 Jan 2026 07:39:30 +0100 Subject: [PATCH 09/15] 8 week data working now --- r_app/80_calculate_kpis.R | 176 +++++++++++++++++++++++++++++++++++--- 1 file changed, 166 insertions(+), 10 deletions(-) diff --git a/r_app/80_calculate_kpis.R b/r_app/80_calculate_kpis.R index 4c657ca..823c708 100644 --- a/r_app/80_calculate_kpis.R +++ b/r_app/80_calculate_kpis.R @@ -72,7 +72,7 @@ # [ ] Confirm Four_week_trend calculates from 1-4 weeks (graceful degradation) # [ ] Confirm CV_Trend_Long_Term uses full 8-week regression (when available) # [ ] Load script 31 output and validate imminent_prob population -# +# [ ] maybe even look into aut calculating the mosaic if mosaic is missing in last 8 weeks # ============================================================================ # ============================================================================ @@ -140,7 +140,7 @@ # ============================================================================ # TEST MODE (for development with limited historical data) -TEST_MODE <- TRUE +TEST_MODE <- FALSE TEST_MODE_NUM_WEEKS <- 2 # GERMINATION PROGRESS THRESHOLD @@ -618,10 +618,12 @@ get_status_trigger <- function(ci_values, ci_change, age_weeks) { return(NA_character_) } -load_historical_field_data <- function(project_dir, current_week, reports_dir, num_weeks = 4) { +load_historical_field_data <- function(project_dir, current_week, reports_dir, num_weeks = 4, auto_generate = TRUE, field_boundaries_sf = NULL) { historical_data <- list() loaded_weeks <- c() + missing_weeks <- c() + # First pass: try to load 
existing weeks for (lookback in 0:(num_weeks - 1)) { target_week <- current_week - lookback if (target_week < 1) target_week <- target_week + 52 @@ -639,16 +641,159 @@ load_historical_field_data <- function(project_dir, current_week, reports_dir, n loaded_weeks <- c(loaded_weeks, target_week) }, error = function(e) { message(paste(" Warning: Could not load week", target_week, ":", e$message)) + missing_weeks <<- c(missing_weeks, target_week) + }) + } else { + missing_weeks <- c(missing_weeks, target_week) + } + } + + # If weeks are missing and auto_generate=TRUE, calculate stats from ALL available mosaics + if (length(missing_weeks) > 0 && auto_generate) { + message(paste("⚠ Missing weeks:", paste(missing_weeks, collapse = ", "))) + message("Scanning for ALL available weekly mosaics and calculating stats...\n") + + # Use field_boundaries_sf passed in (loaded in main) + if (is.null(field_boundaries_sf)) { + message(" Error: field_boundaries_sf not provided - cannot auto-generate") + return(historical_data) + } + + if (!exists("weekly_tile_max")) { + message(" ✗ weekly_tile_max path not defined") + return(historical_data) + } + + # Find the mosaic directory (with or without 5x5 subdirectory) + check_paths <- c(file.path(weekly_tile_max, "5x5"), weekly_tile_max) + mosaic_scan_dir <- NA + + for (check_path in check_paths) { + if (dir.exists(check_path)) { + tif_files <- list.files(check_path, pattern = "week_.*\\.tif$", full.names = TRUE) + if (length(tif_files) > 0) { + mosaic_scan_dir <- check_path + break + } + } + } + + if (is.na(mosaic_scan_dir)) { + message(" ✗ No mosaic files found in weekly_tile_max") + return(historical_data) + } + + # Calculate actual date range for last 8 weeks + # Don't guess weeks - derive them from actual dates + weeks_to_load <- 8 + today <- Sys.Date() + target_dates <- today - (0:(weeks_to_load - 1)) * 7 + + # For each date, calculate what week/year it falls in + expected_weeks <- data.frame( + date = target_dates, + week = 
as.numeric(format(target_dates, "%V")), + year = as.numeric(format(target_dates, "%Y")), + stringsAsFactors = FALSE + ) + expected_weeks <- unique(expected_weeks) + + message(paste(" Expected weeks (last 8 from", format(today, "%Y-%m-%d"), "):")) + for (i in seq_len(nrow(expected_weeks))) { + message(paste(" Week", sprintf("%02d", expected_weeks$week[i]), expected_weeks$year[i])) + } + message("") + + # Parse all week_YY_YYYY_NN.tif files to find unique (week, year) combinations + tif_files <- list.files(mosaic_scan_dir, pattern = "week_([0-9]{2})_([0-9]{4})_[0-9]{2}\\.tif$", + full.names = FALSE) + + # Extract week and year from filenames + available_weeks <- data.frame() + for (filename in tif_files) { + # Parse: week_02_2026_03.tif + matches <- regmatches(filename, gregexpr("week_([0-9]{2})_([0-9]{4})", filename))[[1]] + if (length(matches) > 0) { + week_year <- strsplit(matches[1], "_")[[1]] + if (length(week_year) == 3) { + week_num <- as.numeric(week_year[2]) + year_num <- as.numeric(week_year[3]) + + # Only keep weeks that are in expected_weeks + if (week_num %in% expected_weeks$week && year_num %in% expected_weeks$year) { + available_weeks <- rbind(available_weeks, + data.frame(week = week_num, year = year_num)) + } + } + } + } + + # Remove duplicates and sort by date (descending - most recent first) + available_weeks <- unique(available_weeks) + # Merge with dates to sort properly + available_weeks <- merge(available_weeks, expected_weeks[, c("week", "year", "date")], by = c("week", "year")) + available_weeks <- available_weeks[order(available_weeks$date, decreasing = TRUE), ] + + if (nrow(available_weeks) == 0) { + message(" ✗ No matching mosaic files found") + message(paste(" Scanned directory:", mosaic_scan_dir)) + return(historical_data) + } + + message(paste(" Found", nrow(available_weeks), "week(s) with available mosaics:")) + + # Calculate stats for each available week + for (i in seq_len(nrow(available_weeks))) { + week_to_calc <- 
available_weeks$week[i] + year_to_calc <- available_weeks$year[i] + date_to_calc <- available_weeks$date[i] + + # Find all tiles for this week/year combination + tile_pattern <- sprintf("week_%02d_%d_([0-9]{2})\\.tif", week_to_calc, year_to_calc) + tile_files <- list.files(mosaic_scan_dir, pattern = tile_pattern, full.names = TRUE) + + if (length(tile_files) == 0) { + message(paste(" ✗ Week", sprintf("%02d", week_to_calc), year_to_calc, "- no tiles found")) + next + } + + message(paste(" ✓ Week", sprintf("%02d", week_to_calc), year_to_calc, "-", length(tile_files), "mosaics")) + + tryCatch({ + # Calculate stats for this week/year + week_stats <- load_or_calculate_weekly_stats( + week_num = week_to_calc, + year = year_to_calc, + project_dir = project_dir, + field_boundaries_sf = field_boundaries_sf, + mosaic_dir = mosaic_scan_dir, + reports_dir = reports_dir, + report_date = date_to_calc # Use actual date for this week + ) + + if (!is.null(week_stats) && nrow(week_stats) > 0) { + message(paste(" ✓ Calculated stats for", nrow(week_stats), "fields")) + + # Add to historical data (use unique key: week_year combo) + historical_data[[length(historical_data) + 1]] <- list( + week = week_to_calc, + year = year_to_calc, + data = week_stats + ) + loaded_weeks <- c(loaded_weeks, paste0(week_to_calc, "_", year_to_calc)) + } + }, error = function(e) { + message(paste(" ✗ Error:", e$message)) }) } } if (length(historical_data) == 0) { - message(paste("Warning: No historical field data found for trend calculations")) + message(paste("Error: No historical field data found and could not auto-generate weeks")) return(NULL) } - message(paste("Loaded", length(historical_data), "weeks of historical data:", + message(paste("✓ Loaded", length(historical_data), "weeks of historical data:", paste(loaded_weeks, collapse = ", "))) return(historical_data) @@ -1766,8 +1911,12 @@ main <- function() { args <- commandArgs(trailingOnly = TRUE) # end_date (arg 1) + # Priority: 1) Command-line arg, 
2) Global end_date variable (for recursive calls), 3) Global end_date_str, 4) Sys.Date() end_date <- if (length(args) >= 1 && !is.na(args[1])) { as.Date(args[1]) + } else if (exists("end_date", envir = .GlobalEnv)) { + # For recursive calls, use the end_date that was set in the global environment + get("end_date", envir = .GlobalEnv) } else if (exists("end_date_str", envir = .GlobalEnv)) { as.Date(get("end_date_str", envir = .GlobalEnv)) } else { @@ -1871,11 +2020,18 @@ main <- function() { }) message("Loading historical field data for trend calculations...") - num_weeks_to_load <- if (TEST_MODE) TEST_MODE_NUM_WEEKS else max(WEEKS_FOR_FOUR_WEEK_TREND, WEEKS_FOR_CV_TREND_LONG) - if (TEST_MODE) { - message(paste(" TEST MODE: Loading only", num_weeks_to_load, "weeks")) - } - historical_data <- load_historical_field_data(project_dir, current_week, reports_dir, num_weeks = num_weeks_to_load) + # Load up to 8 weeks (max of 4-week and 8-week trend requirements) + # Function gracefully handles missing weeks and loads whatever exists + num_weeks_to_load <- max(WEEKS_FOR_FOUR_WEEK_TREND, WEEKS_FOR_CV_TREND_LONG) # Always 8 + message(paste(" Attempting to load up to", num_weeks_to_load, "weeks of historical data...")) + + # Only auto-generate on first call (not in recursive calls from within load_historical_field_data) + allow_auto_gen <- !exists("_INSIDE_AUTO_GENERATE", envir = .GlobalEnv) + + historical_data <- load_historical_field_data(project_dir, current_week, reports_dir, + num_weeks = num_weeks_to_load, + auto_generate = allow_auto_gen, + field_boundaries_sf = field_boundaries_sf) planting_dates <- extract_planting_dates(harvesting_data, field_boundaries_sf) From 41bbc370f20a23795b8fc7f00897f4731ffab00e Mon Sep 17 00:00:00 2001 From: Timon Date: Sun, 18 Jan 2026 08:22:02 +0100 Subject: [PATCH 10/15] harvest pred working better now --- python_app/22_harvest_baseline_prediction.py | 1 + python_app/harvest_date_pred_utils.py | 53 +++++++++----------- 2 files changed, 26 
insertions(+), 28 deletions(-) diff --git a/python_app/22_harvest_baseline_prediction.py b/python_app/22_harvest_baseline_prediction.py index f39dca6..4184608 100644 --- a/python_app/22_harvest_baseline_prediction.py +++ b/python_app/22_harvest_baseline_prediction.py @@ -46,6 +46,7 @@ from pathlib import Path from harvest_date_pred_utils import ( load_model_and_config, extract_features, + run_phase1_growing_window, run_two_step_refinement, build_production_harvest_table ) diff --git a/python_app/harvest_date_pred_utils.py b/python_app/harvest_date_pred_utils.py index aa4199c..012a9f2 100644 --- a/python_app/harvest_date_pred_utils.py +++ b/python_app/harvest_date_pred_utils.py @@ -307,45 +307,32 @@ def run_phase1_growing_window(field_data, model, config, scalers, ci_column, dev For each detected harvest, reset DOY counter for the next season. This allows the model to detect multiple consecutive harvests in multi-year data. - - Algorithm: - 1. Start with season_anchor_day = 0 (DOY 1 at day 0) - 2. Expand window: [0:1], [0:2], [0:3], ... until threshold crossed - 3. When harvest detected: record date, set new season_anchor = day after harvest - 4. Continue from next season start - - Args: - threshold (float): Probability threshold (default 0.45) - consecutive_days (int): Required consecutive days above threshold (default 2) - - Returns list of (harvest_date, harvest_idx) tuples. 
""" harvest_dates = [] - season_anchor_day = 0 # DOY 1 starts at day 0 + season_anchor_day = 0 current_pos = 0 while current_pos < len(field_data): consecutive_above_threshold = 0 - min_window_size = 120 # Need at least 120 days (~4 months) for patterns to establish + min_window_size = 120 for window_end in range(current_pos + 1, len(field_data) + 1): window_data = field_data.iloc[current_pos:window_end].copy().reset_index(drop=True) - # Skip if window is too small (model needs long sequences for pattern learning) if len(window_data) < min_window_size: continue try: - # CRITICAL: Pass season_anchor_day so DOY resets after harvest + reset_doy = current_pos > season_anchor_day + features = extract_features( window_data, config['features'], ci_column=ci_column, - season_anchor_day=season_anchor_day, + season_anchor_day=season_anchor_day if reset_doy else None, lookback_start=current_pos ) - # Apply scalers features_scaled = features.copy().astype(float) for fi, scaler in enumerate(scalers): try: @@ -353,12 +340,10 @@ def run_phase1_growing_window(field_data, model, config, scalers, ci_column, dev except Exception: pass - # Run model on expanding window with torch.no_grad(): x_tensor = torch.tensor(features_scaled, dtype=torch.float32).unsqueeze(0).to(device) imminent_probs, detected_probs = model(x_tensor) - # Check LAST timestep only last_prob = detected_probs[0, -1].item() if last_prob > threshold: @@ -366,23 +351,18 @@ def run_phase1_growing_window(field_data, model, config, scalers, ci_column, dev else: consecutive_above_threshold = 0 - # Harvest detected: N consecutive days above threshold if consecutive_above_threshold >= consecutive_days: harvest_idx = current_pos + window_end - consecutive_days harvest_date = field_data.iloc[harvest_idx]['Date'] - harvest_dates.append((harvest_date, harvest_idx)) - # CRITICAL: Reset season anchor for next season - # DOY 1 starts at day after harvest + harvest_dates.append((harvest_date, harvest_idx)) season_anchor_day = 
harvest_idx + 1 current_pos = harvest_idx + 1 break except Exception as e: - # Skip window on error continue else: - # No more harvests found break return harvest_dates @@ -413,8 +393,21 @@ def run_phase2_refinement(field_data, phase1_harvests, model, config, scalers, c window_start_date = season_start_date - pd.Timedelta(days=40) window_end_date = phase1_harvest_date + pd.Timedelta(days=40) - window_start_idx = max(0, (field_data['Date'] >= window_start_date).idxmax() if (field_data['Date'] >= window_start_date).any() else 0) - window_end_idx = min(len(field_data), (field_data['Date'] <= window_end_date).idxmax() + 1 if (field_data['Date'] <= window_end_date).any() else len(field_data)) + # FIXED: Use proper index selection + mask_start = field_data['Date'] >= window_start_date + mask_end = field_data['Date'] <= window_end_date + + if mask_start.any(): + window_start_idx = mask_start.idxmax() # First True index + else: + window_start_idx = 0 + + if mask_end.any(): + # Last True index: find where condition becomes False from the right + true_indices = np.where(mask_end)[0] + window_end_idx = true_indices[-1] + 1 # +1 for slicing (exclusive end) + else: + window_end_idx = len(field_data) if window_end_idx <= window_start_idx: refined_harvests.append((phase1_harvest_date, phase1_idx)) @@ -525,6 +518,10 @@ def run_two_step_refinement(df: pd.DataFrame, model, config, scalers, device=Non print() # New line after progress bar print(f" ✓ Complete: Found {harvests_found} harvest events across {total_fields} fields") + if results: + print(f" Sample harvest dates: {results[0]['phase2_harvest_date']}") + if len(results) > 1: + print(f" {results[-1]['phase2_harvest_date']}") return results From 1f5add7485c886f691d5474e078c3cbd1c2a77a4 Mon Sep 17 00:00:00 2001 From: Timon Date: Sun, 18 Jan 2026 09:27:16 +0100 Subject: [PATCH 11/15] refactored script 80 --- r_app/80_calculate_kpis.R | 1810 +----------------------------- r_app/80_report_building_utils.R | 249 ++++ 
r_app/80_weekly_stats_utils.R | 953 ++++++++++++++++ 3 files changed, 1244 insertions(+), 1768 deletions(-) create mode 100644 r_app/80_report_building_utils.R create mode 100644 r_app/80_weekly_stats_utils.R diff --git a/r_app/80_calculate_kpis.R b/r_app/80_calculate_kpis.R index 823c708..9c5b2b4 100644 --- a/r_app/80_calculate_kpis.R +++ b/r_app/80_calculate_kpis.R @@ -12,6 +12,20 @@ # - Parallel processing (tile-aware, 1000+ fields supported) # - Comprehensive Excel + RDS + CSV exports (21 columns per spec) # - Test mode for development + +# CRITICAL INTEGRATIONS: +# +# 1. IMMINENT_PROB FROM HARVEST MODEL (MODEL_307) +# [ ] Load script 31 output: {project}_imminent_harvest_week{WW}.csv +# Columns: field, imminent_prob, detected_prob, week, year +# [ ] LEFT JOIN to field_analysis_df by (field, week, year) +# [ ] Replace hardcoded "placeholder data" in Status_trigger calculation +# [ ] Update column to show actual harvest probability (0-1 or 0-100%) +# +# 2. AGE FROM HARVEST.XLSX (SCRIPTS 22 & 31) +# [ ] Scripts 22 & 31 populate harvest.xlsx with planting_date per field +# [ ] Load harvest.xlsx instead of using UNIFORM_PLANTING_DATE +# [ ] Calculate Age_week = difftime(report_date, planting_date, units="weeks") # # COMMAND-LINE USAGE: # Option 1: Rscript 80_calculate_kpis.R 2026-01-14 angata @@ -27,131 +41,15 @@ # main() # ============================================================================ -# PENDING WORK - PHASE 4 (Next Sprint) +# NEXT INTEGRATIONS (See Linear issues for detailed requirements) # ============================================================================ -# -# CRITICAL INTEGRATIONS: -# -# 1. 
IMMINENT_PROB FROM HARVEST MODEL (MODEL_307) -# [ ] Load script 31 output: {project}_imminent_harvest_week{WW}.csv -# Columns: field, imminent_prob, detected_prob, week, year -# [ ] LEFT JOIN to field_analysis_df by (field, week, year) -# [ ] Replace hardcoded "placeholder data" in Status_trigger calculation -# [ ] Update column to show actual harvest probability (0-1 or 0-100%) -# -# 2. AGE FROM HARVEST.XLSX (SCRIPTS 22 & 31) -# [ ] Scripts 22 & 31 populate harvest.xlsx with planting_date per field -# [ ] Load harvest.xlsx instead of using UNIFORM_PLANTING_DATE -# [ ] Calculate Age_week = difftime(report_date, planting_date, units="weeks") -# [ ] Removes TEST MODE hardcoding and enables field-specific aging -# -# UTILITY FILE REFACTORING (Improve code maintainability): -# -# 3. CREATE r_app/80_weekly_stats_utils.R -# [ ] Extract functions from lines 250-795 (calculation layer): -# - calculate_field_statistics() -# - calculate_kpi_trends() -# - load_or_calculate_weekly_stats() -# - Helper: load_tiles_for_field(), get_tile_ids_for_field() -# - Helper: extract_field_statistics_from_ci() -# [ ] Clean separation: DATA CALCULATION ONLY (no Excel export) -# [ ] Reusable by run_full_pipeline.R and other scripts -# -# 4. 
CREATE r_app/80_report_building_utils.R -# [ ] Extract functions from lines 1350-2100+ (output/reporting layer): -# - generate_field_analysis_summary() -# - export_field_analysis_excel() -# - calculate_and_export_farm_kpis() -# - Helper: categorize_*, get_*, round_* functions -# - Helper: get_phase_by_age(), get_status_trigger() -# [ ] Clean separation: OUTPUT/FORMATTING ONLY (consumes calculated stats) -# [ ] Reusable for alternative export formats (PDF, HTML, dashboard) -# -# TESTING PLAN: -# [ ] Verify 8-week historical data loads (currently TEST_MODE = 2 weeks only) -# [ ] Confirm Four_week_trend calculates from 1-4 weeks (graceful degradation) -# [ ] Confirm CV_Trend_Long_Term uses full 8-week regression (when available) -# [ ] Load script 31 output and validate imminent_prob population -# [ ] maybe even look into aut calculating the mosaic if mosaic is missing in last 8 weeks +# 1. Load imminent_prob from script 31 (harvest_imminent_weekly.csv) +# 2. Load planting_date from harvest.xlsx for field-specific age calculation # ============================================================================ # ============================================================================ -# EXCEL OUTPUT SPECIFICATION (21 COLUMNS) +# CONFIGURATION # ============================================================================ -# This script exports 21 columns per the field analysis specification: -# -# COMPLETED/IN PROGRESS: -# 1. Field_id ✓ - Unique field identifier -# 2. Farm_Section - Management zone (to be filled by user) -# 3. Field_name ✓ - Client-facing field name (from GeoJSON) -# 4. Acreage ✓ - Field size in acres -# 5. Mean_CI ✓ - Average Chlorophyll Index -# 6. Weekly_ci_change ✓ - Week-over-week CI change -# 7. Four_week_trend - [FUTURE] Trend over 4 weeks (requires historical mosaics) -# 8. Last_harvest_or_planting_date - [DUMMY for now] Will be from harvest Excel + LSTM (script 31) -# 9. Age_week ✓ - Weeks since planting -# 10. 
Phase (age based) ✓ - Growth phase (Germination, Tillering, Grand Growth, Maturation) -# 11. nmr_weeks_in_this_phase - [TODO] Weeks spent in current phase (track phase transitions) -# 12. Germination_progress - [TODO] % pixels with CI >= threshold (default 2, for age < 4 months) -# 13. Imminent_prob - [DUMMY for now] Harvest probability (will be from script 31 output) -# 14. Status_trigger ✓ - Alerts (harvest_ready, stress, etc.) -# 15. CI_range (min-max) - [TODO] Min and max CI values in field -# 16. CI_Percentiles ✓ - 10th-90th percentile of CI (p10-p90 format) -# 17. CV ✓ - Coefficient of variation (field uniformity) -# 18. CV_Trend_Short_Term - [TODO] 2-week CV trend (current week CV - last week CV) -# 19. CV_Trend_Long_Term - [FUTURE] 8-week CV slope (requires linear regression, historical mosaics) -# 20. Cloud_pct_clear ✓ - % field visible (pixel coverage) -# 21. Cloud_category ✓ - Cloud classification (Clear view / Partial coverage / No image available) -# -# IMPLEMENTATION PLAN (ordered by difficulty): -# ============================================================================ -# PHASE 1 - EASY (Current data only): -# [✓] Remove Mean_CI_prev column -# [✓] Add Field_name column (from field_boundaries_sf$field) -# [✓] Add Farm_Section column (empty, user will fill) -# [✓] Add Last_harvest_or_planting_date (use UNIFORM_PLANTING_DATE as dummy) -# [✓] Add CI_range (min/max from pixel extraction) -# [✓] Add Cloud_pct_clear (% from pixel coverage) -# [✓] Column order: Reorder to match spec (1-21) -# -# -# PHASE 2 - MEDIUM (Requires computation): -# [ ] Add nmr_weeks_in_this_phase (track phase transitions with previous week CSV) -# [ ] Add Germination_progress (% pixels CI >= GERMINATION_CI_THRESHOLD, configurable) -# [ ] Add Imminent_prob column (dummy NA, will merge from script 31 harvest_imminent_weekly.csv) -# [ ] Add CV_Trend_Short_Term (requires loading last week's CV values) -# -# PHASE 3 - COMPLEX (Requires historical data): -# [ ] Add Four_week_trend 
(CI value difference week vs 4 weeks ago, requires loading prev mosaics) -# [ ] Add CV_Trend_Long_Term (8-week slope: linear regression on 8 weeks of CV, suggests lm()) -# [ ] Load previous week's CSV to cross-check phase transitions and trends -# -# NOTES: -# - Script 31 (harvest_imminent_weekly.py) outputs: field, imminent_prob, detected_prob, week, year -# - Will need to LEFT JOIN on (field, week, year) to populate Imminent_prob -# - Phase transition logic: Compare this week's Phase vs last week's Phase from CSV -# - For 8-week CV slope: Linear regression slope = (CV_week8 - CV_week1) / 7 weeks (approximately) -# or use lm(CV ~ week) on 8-week sequence for proper slope calculation -# - Germination_progress only calculated if Age_week < 17 (before end of Tillering phase) -# - Cloud_pct_clear calculated as: (pixel_count / expected_pixels) * 100 - -# ============================================================================ -# *** CONFIGURATION SECTION - MANUALLY DEFINED THRESHOLDS *** -# ============================================================================ - -# TEST MODE (for development with limited historical data) -TEST_MODE <- FALSE -TEST_MODE_NUM_WEEKS <- 2 - -# GERMINATION PROGRESS THRESHOLD -# Percentage of pixels that must reach this CI value to count as "germinated" -GERMINATION_CI_THRESHOLD <- 2.0 # Pixels with CI >= 2 count as germinated - -# FOR TESTING: Set these fields as "recently planted" to demonstrate germination progress -YOUNG_FIELDS_FOR_TESTING <- c("1", "2", "3", "4", "5", "6", "7", "8", "9", "10") # First 10 field IDs -YOUNG_FIELD_PLANTING_DATE <- as.Date("2026-01-01") # Recently planted for demo - -# FOUR-WEEK TREND THRESHOLDS FOUR_WEEK_TREND_STRONG_GROWTH_MIN <- 0.5 FOUR_WEEK_TREND_GROWTH_MIN <- 0.1 FOUR_WEEK_TREND_GROWTH_MAX <- 0.5 @@ -184,6 +82,13 @@ CLOUD_INTERVALS <- c(0, 50, 60, 70, 80, 90, 100) CI_PERCENTILE_LOW <- 0.10 CI_PERCENTILE_HIGH <- 0.90 +# GERMINATION THRESHOLD (for germination_progress calculation) 
+GERMINATION_CI_THRESHOLD <- 2.0 + +# PLANTING DATE & AGE CONFIGURATION +USE_UNIFORM_AGE <- TRUE +UNIFORM_PLANTING_DATE <- as.Date("2026-01-01") + # HISTORICAL DATA LOOKBACK WEEKS_FOR_FOUR_WEEK_TREND <- 4 WEEKS_FOR_CV_TREND_SHORT <- 2 @@ -216,6 +121,22 @@ suppressPackageStartupMessages({ }) }) +# ============================================================================ +# LOAD UTILITY FUNCTIONS FROM SEPARATED MODULES +# ============================================================================ + +tryCatch({ + source(here("r_app", "80_weekly_stats_utils.R")) +}, error = function(e) { + stop("Error loading 80_weekly_stats_utils.R: ", e$message) +}) + +tryCatch({ + source(here("r_app", "80_report_building_utils.R")) +}, error = function(e) { + stop("Error loading 80_report_building_utils.R: ", e$message) +}) + # ============================================================================ # PHASE AND STATUS TRIGGER DEFINITIONS # ============================================================================ @@ -252,1656 +173,9 @@ STATUS_TRIGGERS <- data.frame( ) # ============================================================================ -# TILE-AWARE HELPER FUNCTIONS +# MAIN # ============================================================================ -get_tile_ids_for_field <- function(field_geom, tile_grid, field_id = NULL) { - if (inherits(field_geom, "sf")) { - field_bbox <- sf::st_bbox(field_geom) - field_xmin <- field_bbox["xmin"] - field_xmax <- field_bbox["xmax"] - field_ymin <- field_bbox["ymin"] - field_ymax <- field_bbox["ymax"] - } else if (inherits(field_geom, "SpatVector")) { - field_bbox <- terra::ext(field_geom) - field_xmin <- field_bbox$xmin - field_xmax <- field_bbox$xmax - field_ymin <- field_bbox$ymin - field_ymax <- field_bbox$ymax - } else { - stop("field_geom must be sf or terra::vect object") - } - - # DEBUG: Print bbox info for first field - if (!is.null(field_id) && field_id == "1391") { - message(paste("[DEBUG get_tile_ids] Field bbox 
- xmin:", field_xmin, "xmax:", field_xmax, - "ymin:", field_ymin, "ymax:", field_ymax)) - message(paste("[DEBUG get_tile_ids] tile_grid sample: id=", tile_grid$id[1], - "xmin=", tile_grid$xmin[1], "xmax=", tile_grid$xmax[1], - "ymin=", tile_grid$ymin[1], "ymax=", tile_grid$ymax[1])) - message(paste("[DEBUG get_tile_ids] tile_grid CRS:", sf::st_crs(tile_grid))) - message(paste("[DEBUG get_tile_ids] field CRS:", sf::st_crs(field_geom))) - } - - intersecting_tiles <- tile_grid$id[ - !(tile_grid$xmax < field_xmin | - tile_grid$xmin > field_xmax | - tile_grid$ymax < field_ymin | - tile_grid$ymin > field_ymax) - ] - - return(as.numeric(intersecting_tiles)) -} - -load_tiles_for_field <- function(field_geom, tile_ids, week_num, year, mosaic_dir) { - if (length(tile_ids) == 0) { - return(NULL) - } - - tiles_list <- list() - for (tile_id in sort(tile_ids)) { - tile_filename <- sprintf("week_%02d_%d_%02d.tif", week_num, year, tile_id) - tile_path <- file.path(mosaic_dir, tile_filename) - - if (file.exists(tile_path)) { - tryCatch({ - tile_rast <- terra::rast(tile_path) - ci_band <- terra::subset(tile_rast, 5) - tiles_list[[length(tiles_list) + 1]] <- ci_band - }, error = function(e) { - message(paste(" Warning: Could not load tile", tile_id, ":", e$message)) - }) - } - } - - if (length(tiles_list) == 0) { - return(NULL) - } - - if (length(tiles_list) == 1) { - return(tiles_list[[1]]) - } else { - tryCatch({ - rsrc <- terra::sprc(tiles_list) - merged <- terra::mosaic(rsrc, fun = "max") - return(merged) - }, error = function(e) { - message(paste(" Warning: Could not merge tiles:", e$message)) - return(tiles_list[[1]]) - }) - } -} - -build_tile_grid <- function(mosaic_dir, week_num, year) { - # Handle grid-size subdirectories (e.g., weekly_tile_max/5x5/) - # First check if mosaic_dir contains grid-size subdirectories - detected_grid_size <- NA - if (dir.exists(mosaic_dir)) { - subfolders <- list.dirs(mosaic_dir, full.names = FALSE, recursive = FALSE) - grid_patterns <- 
grep("^\\d+x\\d+$", subfolders, value = TRUE) - - if (length(grid_patterns) > 0) { - # Use the first grid-size subdirectory found - detected_grid_size <- grid_patterns[1] - mosaic_dir <- file.path(mosaic_dir, detected_grid_size) - message(paste(" Using grid-size subdirectory:", detected_grid_size)) - } - } - - tile_pattern <- sprintf("week_%02d_%d_([0-9]{2})\\.tif", week_num, year) - tile_files <- list.files(mosaic_dir, pattern = tile_pattern, full.names = TRUE) - - if (length(tile_files) == 0) { - stop(paste("No tile files found for week", week_num, year, "in", mosaic_dir)) - } - - tile_grid <- data.frame( - id = integer(), - xmin = numeric(), - xmax = numeric(), - ymin = numeric(), - ymax = numeric(), - stringsAsFactors = FALSE - ) - - for (tile_file in tile_files) { - tryCatch({ - matches <- regmatches(basename(tile_file), regexpr("_([0-9]{2})\\.tif$", basename(tile_file))) - if (length(matches) > 0) { - tile_id <- as.integer(sub("_|\\.tif", "", matches[1])) - tile_rast <- terra::rast(tile_file) - tile_ext <- terra::ext(tile_rast) - tile_grid <- rbind(tile_grid, data.frame( - id = tile_id, - xmin = tile_ext$xmin, - xmax = tile_ext$xmax, - ymin = tile_ext$ymin, - ymax = tile_ext$ymax, - stringsAsFactors = FALSE - )) - } - }, error = function(e) { - message(paste(" Warning: Could not process tile", basename(tile_file), ":", e$message)) - }) - } - - if (nrow(tile_grid) == 0) { - stop("Could not extract extents from any tile files") - } - - # RETURN BOTH the grid AND the corrected mosaic directory path - return(list( - tile_grid = tile_grid, - mosaic_dir = mosaic_dir, - grid_size = detected_grid_size - )) -} - -# ============================================================================ -# SC-64 ENHANCEMENT FUNCTIONS -# ============================================================================ - -categorize_four_week_trend <- function(ci_values_list) { - if (is.null(ci_values_list) || length(ci_values_list) < 2) { - return(NA_character_) - } - - ci_values_list 
<- ci_values_list[!is.na(ci_values_list)] - if (length(ci_values_list) < 2) { - return(NA_character_) - } - - weekly_changes <- diff(ci_values_list) - avg_weekly_change <- mean(weekly_changes, na.rm = TRUE) - - if (avg_weekly_change >= FOUR_WEEK_TREND_STRONG_GROWTH_MIN) { - return("strong growth") - } else if (avg_weekly_change >= FOUR_WEEK_TREND_GROWTH_MIN && - avg_weekly_change < FOUR_WEEK_TREND_GROWTH_MAX) { - return("growth") - } else if (abs(avg_weekly_change) <= FOUR_WEEK_TREND_NO_GROWTH_RANGE) { - return("no growth") - } else if (avg_weekly_change <= FOUR_WEEK_TREND_DECLINE_MIN && - avg_weekly_change > FOUR_WEEK_TREND_STRONG_DECLINE_MAX) { - return("decline") - } else if (avg_weekly_change < FOUR_WEEK_TREND_STRONG_DECLINE_MAX) { - return("strong decline") - } else { - return("no growth") - } -} - -round_cloud_to_intervals <- function(cloud_pct_clear) { - if (is.na(cloud_pct_clear)) { - return(NA_character_) - } - - if (cloud_pct_clear < 50) return("<50%") - if (cloud_pct_clear < 60) return("50-60%") - if (cloud_pct_clear < 70) return("60-70%") - if (cloud_pct_clear < 80) return("70-80%") - if (cloud_pct_clear < 90) return("80-90%") - return(">90%") -} - -get_ci_percentiles <- function(ci_values) { - if (is.null(ci_values) || length(ci_values) == 0) { - return(NA_character_) - } - - ci_values <- ci_values[!is.na(ci_values)] - if (length(ci_values) == 0) { - return(NA_character_) - } - - p10 <- quantile(ci_values, CI_PERCENTILE_LOW, na.rm = TRUE) - p90 <- quantile(ci_values, CI_PERCENTILE_HIGH, na.rm = TRUE) - - return(sprintf("%.1f-%.1f", p10, p90)) -} - -calculate_cv_trend <- function(cv_current, cv_previous) { - if (is.na(cv_current) || is.na(cv_previous)) { - return(NA_real_) - } - return(round(cv_current - cv_previous, 4)) -} - -calculate_four_week_trend <- function(mean_ci_values) { - #' Calculate four-week CI trend from available weeks - #' - #' Uses whatever weeks are available (1-4 weeks) to estimate trend - #' Returns difference between current (most 
recent) and oldest available week - #' - #' @param mean_ci_values vector of Mean_CI values in chronological order (oldest to newest) - #' @return numeric: CI difference (current - oldest), rounded to 2 decimals - - if (is.null(mean_ci_values) || length(mean_ci_values) == 0) { - return(NA_real_) - } - - # Remove NAs - ci_clean <- mean_ci_values[!is.na(mean_ci_values)] - - if (length(ci_clean) < 2) { - # Need at least 2 weeks to calculate trend - return(NA_real_) - } - - # Calculate difference: current - oldest - trend <- ci_clean[length(ci_clean)] - ci_clean[1] - return(round(trend, 2)) -} - -categorize_cv_slope <- function(slope) { - #' Categorize CV slope (8-week regression) into field uniformity interpretation - #' - #' Slope interpretation: - #' Negative slope = CV decreasing = field becoming MORE uniform = GOOD - #' Positive slope = CV increasing = field becoming MORE patchy = BAD - #' Near zero = Homogenous growth (all crops progressing equally) - #' - #' Categories: - #' - "Excellent uniformity": Slope <= -0.02 (CV decreasing, field synchronizing) - #' - "Homogenous growth": -0.02 < slope < 0.005 (stable, uniform growth) - #' - "Minor patchiness": 0.005 <= slope <= 0.02 (CV slowly increasing) - #' - "Severe fragmentation": slope > 0.02 (rapid CV increase, parts diverging) - #' - #' @param slope numeric: CV trend slope per week - #' @return character: Category string - - if (is.na(slope)) { - return(NA_character_) - } - - if (slope <= CV_SLOPE_IMPROVEMENT_MIN) { - return("Excellent uniformity") - } else if (slope < CV_SLOPE_HOMOGENOUS_MIN) { - return("Homogenous growth") - } else if (slope <= CV_SLOPE_HOMOGENOUS_MAX) { - return("Homogenous growth") - } else if (slope <= CV_SLOPE_PATCHINESS_MAX) { - return("Minor patchiness") - } else { - return("Severe fragmentation") - } -} - -calculate_cv_trend_long_term <- function(cv_values) { - #' Calculate 8-week CV trend via linear regression slope - #' - #' Fits linear regression to CV over available weeks (1-8) - #' 
Returns slope = rate of change per week - #' - #' @param cv_values vector of CV values in chronological order (oldest to newest) - #' @return numeric: Regression slope (CV change per week), rounded to 4 decimals - - if (is.null(cv_values) || length(cv_values) == 0) { - return(NA_real_) - } - - # Remove NAs - cv_clean <- cv_values[!is.na(cv_values)] - - if (length(cv_clean) < 2) { - # Need at least 2 weeks to fit a line - return(NA_real_) - } - - # Create week sequence matching data length - weeks <- seq_along(cv_clean) - - # Fit linear model - tryCatch({ - lm_fit <- lm(cv_clean ~ weeks) - slope <- coef(lm_fit)["weeks"] - return(round(as.numeric(slope), 4)) - }, error = function(e) { - return(NA_real_) - }) -} - -# ============================================================================ -# HELPER FUNCTIONS -# ============================================================================ - -get_phase_by_age <- function(age_weeks) { - if (is.na(age_weeks)) return(NA_character_) - for (i in seq_len(nrow(PHASE_DEFINITIONS))) { - if (age_weeks >= PHASE_DEFINITIONS$age_start[i] && - age_weeks <= PHASE_DEFINITIONS$age_end[i]) { - return(PHASE_DEFINITIONS$phase[i]) - } - } - return("Unknown") -} - -get_status_trigger <- function(ci_values, ci_change, age_weeks) { - if (is.na(age_weeks) || length(ci_values) == 0) return(NA_character_) - - ci_values <- ci_values[!is.na(ci_values)] - if (length(ci_values) == 0) return(NA_character_) - - pct_above_2 <- sum(ci_values > 2) / length(ci_values) * 100 - pct_at_or_above_2 <- sum(ci_values >= 2) / length(ci_values) * 100 - ci_cv <- if (mean(ci_values, na.rm = TRUE) > 0) sd(ci_values) / mean(ci_values, na.rm = TRUE) else 0 - mean_ci <- mean(ci_values, na.rm = TRUE) - - if (age_weeks >= 0 && age_weeks <= 6) { - if (pct_at_or_above_2 >= 70) { - return("germination_complete") - } else if (pct_above_2 > 10) { - return("germination_started") - } - } - - if (age_weeks >= 45) { - return("harvest_ready") - } - - if (age_weeks > 6 && 
!is.na(ci_change) && ci_change < -1.5 && ci_cv < 0.25) { - return("stress_detected_whole_field") - } - - if (age_weeks > 6 && !is.na(ci_change) && ci_change > 1.5) { - return("strong_recovery") - } - - if (age_weeks >= 4 && age_weeks < 39 && !is.na(ci_change) && ci_change > 0.2) { - return("growth_on_track") - } - - if (age_weeks >= 39 && age_weeks < 45 && mean_ci > 3.5) { - return("maturation_progressing") - } - - return(NA_character_) -} - -load_historical_field_data <- function(project_dir, current_week, reports_dir, num_weeks = 4, auto_generate = TRUE, field_boundaries_sf = NULL) { - historical_data <- list() - loaded_weeks <- c() - missing_weeks <- c() - - # First pass: try to load existing weeks - for (lookback in 0:(num_weeks - 1)) { - target_week <- current_week - lookback - if (target_week < 1) target_week <- target_week + 52 - - csv_filename <- paste0(project_dir, "_field_analysis_week", sprintf("%02d", target_week), ".csv") - csv_path <- file.path(reports_dir, "kpis", "field_analysis", csv_filename) - - if (file.exists(csv_path)) { - tryCatch({ - data <- read_csv(csv_path, show_col_types = FALSE) - historical_data[[lookback + 1]] <- list( - week = target_week, - data = data - ) - loaded_weeks <- c(loaded_weeks, target_week) - }, error = function(e) { - message(paste(" Warning: Could not load week", target_week, ":", e$message)) - missing_weeks <<- c(missing_weeks, target_week) - }) - } else { - missing_weeks <- c(missing_weeks, target_week) - } - } - - # If weeks are missing and auto_generate=TRUE, calculate stats from ALL available mosaics - if (length(missing_weeks) > 0 && auto_generate) { - message(paste("⚠ Missing weeks:", paste(missing_weeks, collapse = ", "))) - message("Scanning for ALL available weekly mosaics and calculating stats...\n") - - # Use field_boundaries_sf passed in (loaded in main) - if (is.null(field_boundaries_sf)) { - message(" Error: field_boundaries_sf not provided - cannot auto-generate") - return(historical_data) - } - - if 
(!exists("weekly_tile_max")) { - message(" ✗ weekly_tile_max path not defined") - return(historical_data) - } - - # Find the mosaic directory (with or without 5x5 subdirectory) - check_paths <- c(file.path(weekly_tile_max, "5x5"), weekly_tile_max) - mosaic_scan_dir <- NA - - for (check_path in check_paths) { - if (dir.exists(check_path)) { - tif_files <- list.files(check_path, pattern = "week_.*\\.tif$", full.names = TRUE) - if (length(tif_files) > 0) { - mosaic_scan_dir <- check_path - break - } - } - } - - if (is.na(mosaic_scan_dir)) { - message(" ✗ No mosaic files found in weekly_tile_max") - return(historical_data) - } - - # Calculate actual date range for last 8 weeks - # Don't guess weeks - derive them from actual dates - weeks_to_load <- 8 - today <- Sys.Date() - target_dates <- today - (0:(weeks_to_load - 1)) * 7 - - # For each date, calculate what week/year it falls in - expected_weeks <- data.frame( - date = target_dates, - week = as.numeric(format(target_dates, "%V")), - year = as.numeric(format(target_dates, "%Y")), - stringsAsFactors = FALSE - ) - expected_weeks <- unique(expected_weeks) - - message(paste(" Expected weeks (last 8 from", format(today, "%Y-%m-%d"), "):")) - for (i in seq_len(nrow(expected_weeks))) { - message(paste(" Week", sprintf("%02d", expected_weeks$week[i]), expected_weeks$year[i])) - } - message("") - - # Parse all week_YY_YYYY_NN.tif files to find unique (week, year) combinations - tif_files <- list.files(mosaic_scan_dir, pattern = "week_([0-9]{2})_([0-9]{4})_[0-9]{2}\\.tif$", - full.names = FALSE) - - # Extract week and year from filenames - available_weeks <- data.frame() - for (filename in tif_files) { - # Parse: week_02_2026_03.tif - matches <- regmatches(filename, gregexpr("week_([0-9]{2})_([0-9]{4})", filename))[[1]] - if (length(matches) > 0) { - week_year <- strsplit(matches[1], "_")[[1]] - if (length(week_year) == 3) { - week_num <- as.numeric(week_year[2]) - year_num <- as.numeric(week_year[3]) - - # Only keep weeks 
that are in expected_weeks - if (week_num %in% expected_weeks$week && year_num %in% expected_weeks$year) { - available_weeks <- rbind(available_weeks, - data.frame(week = week_num, year = year_num)) - } - } - } - } - - # Remove duplicates and sort by date (descending - most recent first) - available_weeks <- unique(available_weeks) - # Merge with dates to sort properly - available_weeks <- merge(available_weeks, expected_weeks[, c("week", "year", "date")], by = c("week", "year")) - available_weeks <- available_weeks[order(available_weeks$date, decreasing = TRUE), ] - - if (nrow(available_weeks) == 0) { - message(" ✗ No matching mosaic files found") - message(paste(" Scanned directory:", mosaic_scan_dir)) - return(historical_data) - } - - message(paste(" Found", nrow(available_weeks), "week(s) with available mosaics:")) - - # Calculate stats for each available week - for (i in seq_len(nrow(available_weeks))) { - week_to_calc <- available_weeks$week[i] - year_to_calc <- available_weeks$year[i] - date_to_calc <- available_weeks$date[i] - - # Find all tiles for this week/year combination - tile_pattern <- sprintf("week_%02d_%d_([0-9]{2})\\.tif", week_to_calc, year_to_calc) - tile_files <- list.files(mosaic_scan_dir, pattern = tile_pattern, full.names = TRUE) - - if (length(tile_files) == 0) { - message(paste(" ✗ Week", sprintf("%02d", week_to_calc), year_to_calc, "- no tiles found")) - next - } - - message(paste(" ✓ Week", sprintf("%02d", week_to_calc), year_to_calc, "-", length(tile_files), "mosaics")) - - tryCatch({ - # Calculate stats for this week/year - week_stats <- load_or_calculate_weekly_stats( - week_num = week_to_calc, - year = year_to_calc, - project_dir = project_dir, - field_boundaries_sf = field_boundaries_sf, - mosaic_dir = mosaic_scan_dir, - reports_dir = reports_dir, - report_date = date_to_calc # Use actual date for this week - ) - - if (!is.null(week_stats) && nrow(week_stats) > 0) { - message(paste(" ✓ Calculated stats for", nrow(week_stats), 
"fields")) - - # Add to historical data (use unique key: week_year combo) - historical_data[[length(historical_data) + 1]] <- list( - week = week_to_calc, - year = year_to_calc, - data = week_stats - ) - loaded_weeks <- c(loaded_weeks, paste0(week_to_calc, "_", year_to_calc)) - } - }, error = function(e) { - message(paste(" ✗ Error:", e$message)) - }) - } - } - - if (length(historical_data) == 0) { - message(paste("Error: No historical field data found and could not auto-generate weeks")) - return(NULL) - } - - message(paste("✓ Loaded", length(historical_data), "weeks of historical data:", - paste(loaded_weeks, collapse = ", "))) - - return(historical_data) -} - -USE_UNIFORM_AGE <- TRUE -UNIFORM_PLANTING_DATE <- as.Date("2026-01-01") - -extract_planting_dates <- function(harvesting_data, field_boundaries_sf = NULL) { - if (USE_UNIFORM_AGE) { - message(paste("Using uniform planting date for all fields:", UNIFORM_PLANTING_DATE)) - # Return a data frame with all field IDs mapped to uniform planting date - if (!is.null(field_boundaries_sf)) { - return(data.frame( - field_id = field_boundaries_sf$field, - date = rep(UNIFORM_PLANTING_DATE, nrow(field_boundaries_sf)), - stringsAsFactors = FALSE - )) - } else { - # Fallback if field_boundaries_sf not provided - return(NULL) - } - } - - if (is.null(harvesting_data) || nrow(harvesting_data) == 0) { - message("Warning: No harvesting data available.") - return(NULL) - } - - tryCatch({ - planting_dates <- harvesting_data %>% - arrange(field, desc(season_start)) %>% - distinct(field, .keep_all = TRUE) %>% - select(field, season_start) %>% - rename(field_id = field, planting_date = season_start) %>% - filter(!is.na(planting_date)) %>% - as.data.frame() - - message(paste("Extracted planting dates for", nrow(planting_dates), "fields")) - return(planting_dates) - }, error = function(e) { - message(paste("Error extracting planting dates:", e$message)) - return(NULL) - }) -} - -# 
============================================================================ -# MODULAR STATISTICS CALCULATION (Reusable for any week) -# ============================================================================ - -calculate_field_statistics <- function(field_boundaries_sf, week_num, year, - mosaic_dir, report_date = Sys.Date()) { - - message(paste("Calculating statistics for all fields - Week", week_num, year)) - - # Debug: Check if constants are available - message(paste(" DEBUG: YOUNG_FIELDS_FOR_TESTING =", paste(YOUNG_FIELDS_FOR_TESTING, collapse=", "))) - message(paste(" DEBUG: YOUNG_FIELD_PLANTING_DATE =", YOUNG_FIELD_PLANTING_DATE)) - - # Build tile file list - tile_pattern <- sprintf("week_%02d_%d_([0-9]{2})\\.tif", week_num, year) - tile_files <- list.files(mosaic_dir, pattern = tile_pattern, full.names = TRUE) - - if (length(tile_files) == 0) { - stop(paste("No tile files found for week", week_num, year, "in", mosaic_dir)) - } - - message(paste(" Found", length(tile_files), "tiles for week", week_num)) - - results_list <- list() - fields_processed <- 0 - - # SCRIPT 20 APPROACH: Loop through tiles, extract all fields from each tile - for (tile_idx in seq_along(tile_files)) { - tile_file <- tile_files[tile_idx] - - tryCatch({ - # Load tile - current_rast <- terra::rast(tile_file) - ci_band <- current_rast[["CI"]] - - if (is.null(ci_band) || !inherits(ci_band, "SpatRaster")) { - message(paste(" [SKIP] Tile", basename(tile_file), "- CI band not found")) - return(NULL) - } - - # Extract all fields from this tile in ONE call - # terra::extract returns a dataframe with columns: ID, CI - # where each row is one pixel, and ID indicates which polygon it came from - extracted <- terra::extract(ci_band, field_boundaries_sf, na.rm = FALSE) - - # Group by field ID and calculate statistics for each field - # extracted$ID contains the field polygon index (1 to nrow(field_boundaries_sf)) - unique_field_ids <- unique(extracted$ID[!is.na(extracted$ID)]) - - for 
(field_poly_idx in unique_field_ids) { - # Get all CI values for this field from this tile - field_id <- field_boundaries_sf$field[field_poly_idx] - ci_vals <- extracted$CI[extracted$ID == field_poly_idx] - ci_vals <- ci_vals[!is.na(ci_vals)] - - # Skip if no data for this field in this tile - if (length(ci_vals) == 0) { - next - } - - # Calculate statistics - mean_ci <- mean(ci_vals, na.rm = TRUE) - ci_std <- sd(ci_vals, na.rm = TRUE) - cv <- if (mean_ci > 0) ci_std / mean_ci else NA_real_ - range_min <- min(ci_vals, na.rm = TRUE) - range_max <- max(ci_vals, na.rm = TRUE) - range_str <- sprintf("%.1f-%.1f", range_min, range_max) - ci_percentiles_str <- get_ci_percentiles(ci_vals) - - # Cloud coverage: count total pixels vs non-NA pixels for this field - field_rows <- extracted[extracted$ID == field_poly_idx, ] - num_total <- nrow(field_rows) - num_data <- sum(!is.na(field_rows$CI)) - pct_clear <- if (num_total > 0) round((num_data / num_total) * 100, 1) else 0 - cloud_cat <- if (num_data == 0) "No image available" - else if (pct_clear >= 99.5) "Clear view" - else "Partial coverage" - - # Age and Phase - age_weeks <- NA_real_ - if (USE_UNIFORM_AGE) { - # Check if this field is in the "young fields" list (for testing germination progress) - is_young_field <- field_id %in% YOUNG_FIELDS_FOR_TESTING - if (is_young_field) { - age_weeks <- as.numeric(difftime(report_date, YOUNG_FIELD_PLANTING_DATE, units = "weeks")) - # Debug for first 2 matches - if (field_id %in% c("1", "2")) { - message(paste(" DEBUG: Field", field_id, "is young field, age =", round(age_weeks, 2), "weeks")) - } - } else { - age_weeks <- as.numeric(difftime(report_date, UNIFORM_PLANTING_DATE, units = "weeks")) - } - } - phase <- get_phase_by_age(age_weeks) - - # Germination progress (only for young plants, age < 17 weeks) - germination_progress <- NA_character_ - if (!is.na(age_weeks) && age_weeks >= 0 && age_weeks < 17) { - pct_ci_ge_threshold <- sum(ci_vals >= GERMINATION_CI_THRESHOLD) / 
length(ci_vals) * 100 - germination_progress <- sprintf("%.1f%%", pct_ci_ge_threshold) - } - - # Store result (check if field already exists from another tile) - existing_idx <- which(sapply(results_list, function(x) x$Field_id) == field_id) - - if (length(existing_idx) > 0) { - # Field already in results from previous tile - keep first occurrence or average - # For now, keep the first one (earlier tiles) - next - } - - # Store new field result - results_list[[length(results_list) + 1]] <- data.frame( - Field_id = field_id, - Mean_CI = round(mean_ci, 2), - CV = round(cv, 4), - CI_range = range_str, - CI_Percentiles = ci_percentiles_str, - Cloud_pct_clear = pct_clear, - Cloud_category = cloud_cat, - Age_week = round(age_weeks, 1), - Phase = phase, - Germination_progress = germination_progress, - stringsAsFactors = FALSE - ) - - fields_processed <- fields_processed + 1 - } - - message(paste(" Tile", tile_idx, "of", length(tile_files), "processed")) - - }, error = function(e) { - message(paste(" [ERROR] Tile", basename(tile_file), ":", e$message)) - }) - } - - if (length(results_list) == 0) { - stop(paste("No fields processed successfully for week", week_num)) - } - - stats_df <- dplyr::bind_rows(results_list) - message(paste(" ✓ Successfully calculated statistics for", nrow(stats_df), "fields")) - - return(stats_df) -} - -# ============================================================================ -# CALCULATE KPI TRENDS (Requires previous week RDS) -# ============================================================================ - -calculate_kpi_trends <- function(current_stats, prev_stats = NULL, - project_dir = NULL, reports_dir = NULL, - current_week = NULL, year = NULL) { - - message("Calculating KPI trends from current and previous week data") - - # Initialize new columns with defaults - current_stats$Weekly_ci_change <- NA_real_ - current_stats$CV_Trend_Short_Term <- NA_real_ - current_stats$Four_week_trend <- NA_real_ - current_stats$CV_Trend_Long_Term <- 
NA_real_ - current_stats$nmr_weeks_in_this_phase <- 1L - - # If no previous week data, return with defaults - if (is.null(prev_stats) || nrow(prev_stats) == 0) { - message(" No previous week data available - using defaults") - return(current_stats) - } - - message(paste(" prev_stats has", nrow(prev_stats), "rows and", ncol(prev_stats), "columns")) - - # Build lookup indices for previous week (by Field_id) - prev_lookup <- setNames(seq_len(nrow(prev_stats)), prev_stats$Field_id) - - # Try to load previous week's field_analysis to get nmr_weeks_in_this_phase history - prev_field_analysis <- NULL - - tryCatch({ - analysis_dir <- file.path(reports_dir, "kpis", "field_analysis") - if (dir.exists(analysis_dir)) { - analysis_files <- list.files(analysis_dir, pattern = "_field_analysis_week.*\\.csv$", full.names = TRUE) - if (length(analysis_files) > 0) { - recent_file <- analysis_files[which.max(file.info(analysis_files)$mtime)] - prev_field_analysis <- readr::read_csv(recent_file, show_col_types = FALSE, - col_select = c(Field_id, nmr_weeks_in_this_phase, Phase)) - } - } - }, error = function(e) { - message(paste(" Note: Could not load previous field_analysis for nmr_weeks tracking:", e$message)) - }) - - if (!is.null(prev_field_analysis) && nrow(prev_field_analysis) > 0) { - message(paste(" Using previous field_analysis to track nmr_weeks_in_this_phase")) - } - - # ============================================================ - # PHASE 3: Load 4-8 weeks of historical data for trend calculations - # ============================================================ - - historical_4weeks <- list() - historical_8weeks <- list() - - if (!is.null(project_dir) && !is.null(reports_dir) && !is.null(current_week)) { - message(" Loading historical field_stats for 4-week and 8-week trends...") - - # Load up to 4 weeks back for four_week_trend - for (lookback in 1:4) { - target_week <- current_week - lookback - if (target_week < 1) target_week <- target_week + 52 - - rds_filename <- 
sprintf("%s_field_stats_week%02d.rds", project_dir, target_week) - rds_path <- file.path(reports_dir, "kpis", "field_stats", rds_filename) - - if (file.exists(rds_path)) { - tryCatch({ - stats_data <- readRDS(rds_path) - historical_4weeks[[length(historical_4weeks) + 1]] <- list( - week = target_week, - stats = stats_data - ) - }, error = function(e) { - message(paste(" Warning: Could not load week", target_week, ":", e$message)) - }) - } - } - - # Load up to 8 weeks back for cv_trend_long_term - for (lookback in 1:8) { - target_week <- current_week - lookback - if (target_week < 1) target_week <- target_week + 52 - - rds_filename <- sprintf("%s_field_stats_week%02d.rds", project_dir, target_week) - rds_path <- file.path(reports_dir, "kpis", "field_stats", rds_filename) - - if (file.exists(rds_path)) { - tryCatch({ - stats_data <- readRDS(rds_path) - historical_8weeks[[length(historical_8weeks) + 1]] <- list( - week = target_week, - stats = stats_data - ) - }, error = function(e) { - # Silently skip - we'll work with whatever weeks exist - }) - } - } - - if (length(historical_4weeks) > 0) { - message(paste(" Loaded", length(historical_4weeks), "weeks for 4-week trend")) - } - if (length(historical_8weeks) > 0) { - message(paste(" Loaded", length(historical_8weeks), "weeks for 8-week CV trend")) - } - } - - # For each field in current week, lookup previous values and calculate trends - cv_trends_calculated <- 0 - four_week_trends_calculated <- 0 - cv_long_term_calculated <- 0 - - for (i in seq_len(nrow(current_stats))) { - field_id <- current_stats$Field_id[i] - prev_idx <- prev_lookup[field_id] - - if (!is.na(prev_idx) && prev_idx > 0 && prev_idx <= nrow(prev_stats)) { - prev_row <- prev_stats[prev_idx, , drop = FALSE] - - # WEEKLY CI CHANGE - prev_ci <- prev_row$Mean_CI[1] - if (!is.na(prev_ci) && !is.na(current_stats$Mean_CI[i])) { - current_stats$Weekly_ci_change[i] <- - round(current_stats$Mean_CI[i] - prev_ci, 2) - } - - # CV TREND SHORT TERM (2-week 
comparison) - prev_cv <- prev_row$CV[1] - if (!is.na(prev_cv) && !is.na(current_stats$CV[i])) { - current_stats$CV_Trend_Short_Term[i] <- - calculate_cv_trend(current_stats$CV[i], prev_cv) - cv_trends_calculated <- cv_trends_calculated + 1 - } - - # FOUR-WEEK TREND (if available) - if (length(historical_4weeks) > 0) { - ci_values_4week <- numeric() - - # Add oldest available weeks (reverse order to get oldest first) - for (hist_idx in rev(seq_along(historical_4weeks))) { - hist_data <- historical_4weeks[[hist_idx]]$stats - hist_field <- which(hist_data$Field_id == field_id) - if (length(hist_field) > 0 && !is.na(hist_data$Mean_CI[hist_field[1]])) { - ci_values_4week <- c(ci_values_4week, hist_data$Mean_CI[hist_field[1]]) - } - } - - # Add current week CI - ci_values_4week <- c(ci_values_4week, current_stats$Mean_CI[i]) - - if (length(ci_values_4week) >= 2) { - current_stats$Four_week_trend[i] <- calculate_four_week_trend(ci_values_4week) - four_week_trends_calculated <- four_week_trends_calculated + 1 - } - } - - # CV TREND LONG TERM (8-week slope) - if (length(historical_8weeks) > 0) { - cv_values_8week <- numeric() - - # Add oldest available weeks (reverse order to get oldest first) - for (hist_idx in rev(seq_along(historical_8weeks))) { - hist_data <- historical_8weeks[[hist_idx]]$stats - hist_field <- which(hist_data$Field_id == field_id) - if (length(hist_field) > 0 && !is.na(hist_data$CV[hist_field[1]])) { - cv_values_8week <- c(cv_values_8week, hist_data$CV[hist_field[1]]) - } - } - - # Add current week CV - cv_values_8week <- c(cv_values_8week, current_stats$CV[i]) - - if (length(cv_values_8week) >= 2) { - slope <- calculate_cv_trend_long_term(cv_values_8week) - current_stats$CV_Trend_Long_Term[i] <- slope - cv_long_term_calculated <- cv_long_term_calculated + 1 - } - } - - # WEEKS IN CURRENT PHASE (track phase transitions) - # Use previous field_analysis if available for proper counter progression - if (!is.null(prev_field_analysis) && 
nrow(prev_field_analysis) > 0) { - # Look up this field in previous analysis - prev_analysis_row <- prev_field_analysis %>% - dplyr::filter(Field_id == field_id) - - if (nrow(prev_analysis_row) > 0) { - prev_phase_analysis <- prev_analysis_row$Phase[1] - prev_nmr_weeks_analysis <- prev_analysis_row$nmr_weeks_in_this_phase[1] - - if (!is.na(current_stats$Phase[i]) && !is.na(prev_phase_analysis)) { - if (current_stats$Phase[i] == prev_phase_analysis) { - # Same phase - increment the counter - current_stats$nmr_weeks_in_this_phase[i] <- - if (!is.na(prev_nmr_weeks_analysis)) prev_nmr_weeks_analysis + 1L else 2L - } else { - # Phase changed - reset to 1 - current_stats$nmr_weeks_in_this_phase[i] <- 1L - } - } - } else if (!is.na(current_stats$Phase[i]) && !is.na(prev_row$Phase[1])) { - # Field not in previous analysis, fall back to prev_stats phase comparison - if (current_stats$Phase[i] == prev_row$Phase[1]) { - current_stats$nmr_weeks_in_this_phase[i] <- 2L - } else { - current_stats$nmr_weeks_in_this_phase[i] <- 1L - } - } - } else { - # No previous field_analysis available - use phase from prev_stats - if (!is.na(current_stats$Phase[i]) && !is.na(prev_row$Phase[1])) { - if (current_stats$Phase[i] == prev_row$Phase[1]) { - # Same phase - increment counter (start with 2 since prev week was in this phase) - current_stats$nmr_weeks_in_this_phase[i] <- 2L - } else { - # Phase changed - reset to 1 - current_stats$nmr_weeks_in_this_phase[i] <- 1L - } - } - } - } - } - - message(paste(" ✓ Calculated CV_Trend_Short_Term:", cv_trends_calculated, "fields")) - message(paste(" ✓ Calculated Four_week_trend:", four_week_trends_calculated, "fields")) - message(paste(" ✓ Calculated CV_Trend_Long_Term:", cv_long_term_calculated, "fields")) - return(current_stats) -} - -# ============================================================================ -# LOAD OR CALCULATE WEEKLY STATISTICS (RDS Caching) -# ============================================================================ - 
-load_or_calculate_weekly_stats <- function(week_num, year, project_dir, field_boundaries_sf, - mosaic_dir, reports_dir, report_date = Sys.Date()) { - - # Build RDS file path - rds_filename <- sprintf("%s_field_stats_week%02d.rds", project_dir, week_num) - rds_path <- file.path(reports_dir, "kpis", "field_stats", rds_filename) - - # Try to load existing RDS (fast cache) - if (file.exists(rds_path)) { - message(paste("Loading cached statistics from:", basename(rds_path))) - return(readRDS(rds_path)) - } - - # RDS not found - calculate from tiles - message(paste("Cached RDS not found, calculating statistics from tiles for week", week_num)) - stats_df <- calculate_field_statistics(field_boundaries_sf, week_num, year, - mosaic_dir, report_date) - - # Create output directory if needed - output_dir <- file.path(reports_dir, "kpis", "field_stats") - if (!dir.exists(output_dir)) { - dir.create(output_dir, recursive = TRUE, showWarnings = FALSE) - } - - # Export RDS (for fast lookup next week) - saveRDS(stats_df, rds_path) - message(paste("Saved weekly statistics RDS:", basename(rds_path))) - - # Export CSV (for user review) - csv_filename <- sprintf("%s_field_stats_week%02d.csv", project_dir, week_num) - csv_path <- file.path(output_dir, csv_filename) - readr::write_csv(stats_df, csv_path) - message(paste("Saved weekly statistics CSV:", basename(csv_path))) - - return(stats_df) -} - -# ============================================================================ -# PARALLEL FIELD ANALYSIS FUNCTION -# ============================================================================ - -analyze_single_field <- function(field_idx, field_boundaries_sf, tile_grid, week_num, year, - mosaic_dir, historical_data = NULL, planting_dates = NULL, - report_date = Sys.Date(), harvest_imminence_data = NULL, - harvesting_data = NULL) { - - tryCatch({ - field_id <- field_boundaries_sf$field[field_idx] - farm_section <- if ("sub_area" %in% names(field_boundaries_sf)) { - 
field_boundaries_sf$sub_area[field_idx] - } else { - NA_character_ - } - field_name <- field_id - - # DEBUG: Print for first few fields - if (field_idx <= 3) { - message(paste("[DEBUG] Field", field_idx, ":", field_id)) - } - - field_sf <- field_boundaries_sf[field_idx, ] - if (sf::st_is_empty(field_sf) || any(is.na(sf::st_geometry(field_sf)))) { - return(data.frame( - Field_id = field_id, - error = "Empty or invalid geometry" - )) - } - - field_area_ha <- as.numeric(sf::st_area(field_sf)) / 10000 - field_area_acres <- field_area_ha / 0.404686 - - tile_ids <- get_tile_ids_for_field(field_sf, tile_grid, field_id = field_id) - - # DEBUG: Print tile IDs for first field - if (field_idx == 1) { - message(paste("[DEBUG] First field tile_ids:", paste(tile_ids, collapse=","))) - message(paste("[DEBUG] tile_grid nrows:", nrow(tile_grid), "ncols:", ncol(tile_grid))) - message(paste("[DEBUG] mosaic_dir:", mosaic_dir)) - } - - current_ci <- load_tiles_for_field(field_sf, tile_ids, week_num, year, mosaic_dir) - - if (is.null(current_ci)) { - return(data.frame( - Field_id = field_id, - error = "No tile data available" - )) - } - - # SINGLE EXTRACTION: Get all pixel values for this field, then calculate all stats from it - field_bbox <- sf::st_bbox(field_sf) - ci_cropped <- terra::crop(current_ci, terra::ext(field_bbox), snap = "out") - - # Extract all pixels in one call (no fun= parameter means we get raw pixel values) - all_extracted <- terra::extract(ci_cropped, field_sf)[, 2] - current_ci_vals <- all_extracted[!is.na(all_extracted)] - - if (length(current_ci_vals) == 0) { - return(data.frame( - Field_id = field_id, - error = "No CI values extracted from tiles" - )) - } - - # Calculate all statistics from the single extraction - mean_ci_current <- mean(current_ci_vals, na.rm = TRUE) - ci_std <- sd(current_ci_vals, na.rm = TRUE) - cv_current <- if (mean_ci_current > 0) ci_std / mean_ci_current else NA_real_ - range_min <- min(current_ci_vals, na.rm = TRUE) - range_max <- 
max(current_ci_vals, na.rm = TRUE) - range_str <- sprintf("%.1f-%.1f", range_min, range_max) - ci_percentiles_str <- get_ci_percentiles(current_ci_vals) - - # Cloud coverage from extraction metadata - num_total <- length(all_extracted) - num_data <- length(current_ci_vals) - pct_clear <- if (num_total > 0) round((num_data / num_total) * 100, 1) else 0 - cloud_cat <- if (num_data == 0) "No image available" - else if (pct_clear >= 99.5) "Clear view" - else "Partial coverage" - cloud_pct <- 100 - pct_clear - cloud_interval <- round_cloud_to_intervals(pct_clear) - - # Weekly change (extract previous week same way - single extraction) - weekly_ci_change <- NA - previous_ci_vals <- NULL - - tryCatch({ - previous_ci <- load_tiles_for_field(field_sf, tile_ids, week_num - 1, year, mosaic_dir) - if (!is.null(previous_ci)) { - prev_bbox <- sf::st_bbox(field_sf) - prev_ci_cropped <- terra::crop(previous_ci, terra::ext(prev_bbox), snap = "out") - prev_extracted_all <- terra::extract(prev_ci_cropped, field_sf)[, 2] - previous_ci_vals <- prev_extracted_all[!is.na(prev_extracted_all)] - if (length(previous_ci_vals) > 0) { - mean_ci_previous <- mean(previous_ci_vals, na.rm = TRUE) - weekly_ci_change <- mean_ci_current - mean_ci_previous - } - } - }, error = function(e) { - # Silent fail - }) - - if (is.na(weekly_ci_change)) { - weekly_ci_change_str <- sprintf("%.1f ± %.2f", mean_ci_current, ci_std) - } else { - weekly_ci_change_str <- sprintf("%.1f ± %.2f (Δ%.1f)", mean_ci_current, ci_std, weekly_ci_change) - } - - age_weeks <- NA - if (!is.null(planting_dates) && nrow(planting_dates) > 0) { - field_planting <- planting_dates %>% - filter(field_id == !!field_id) %>% - pull(planting_date) - - if (length(field_planting) > 0) { - age_weeks <- as.numeric(difftime(report_date, field_planting[1], units = "weeks")) - } - } - - if (USE_UNIFORM_AGE) { - age_weeks <- as.numeric(difftime(report_date, UNIFORM_PLANTING_DATE, units = "weeks")) - } - - pct_ci_above_2 <- sum(current_ci_vals > 2) / 
length(current_ci_vals) * 100 - pct_ci_ge_2 <- sum(current_ci_vals >= 2) / length(current_ci_vals) * 100 - germination_progress_str <- NA_character_ - if (!is.na(age_weeks) && age_weeks >= 0 && age_weeks <= 6) { - germination_progress_str <- sprintf("%.0f%%", pct_ci_ge_2) - } - - phase <- "Unknown" - imminent_prob_val <- NA - if (!is.null(harvest_imminence_data) && nrow(harvest_imminence_data) > 0) { - imminence_row <- harvest_imminence_data %>% - filter(field_id == !!field_id) - if (nrow(imminence_row) > 0) { - imminent_prob_val <- imminence_row$probability[1] - if (imminent_prob_val > 0.5) { - phase <- "Harvest Imminent (Model)" - } - } - } - - if (phase == "Unknown") { - phase <- get_phase_by_age(age_weeks) - } - - status_trigger <- get_status_trigger(current_ci_vals, weekly_ci_change, age_weeks) - - nmr_weeks_in_phase <- 1 - - four_week_trend <- NA_character_ - ci_values_for_trend <- c(mean_ci_current) - - if (!is.null(historical_data) && length(historical_data) > 0) { - for (hist in historical_data) { - hist_week <- hist$week - hist_data <- hist$data - - field_row <- hist_data %>% - filter(Field_id == !!field_id) - - if (nrow(field_row) > 0 && !is.na(field_row$Mean_CI[1])) { - ci_values_for_trend <- c(field_row$Mean_CI[1], ci_values_for_trend) - } - } - - if (length(ci_values_for_trend) >= 2) { - four_week_trend <- categorize_four_week_trend(ci_values_for_trend) - } - } - - cv_trend_short <- NA_real_ - cv_trend_long <- NA_real_ - - if (!is.null(historical_data) && length(historical_data) > 0) { - if (length(historical_data) >= 2) { - cv_2w <- historical_data[[2]]$data %>% - filter(Field_id == !!field_id) %>% - pull(CV) - if (length(cv_2w) > 0 && !is.na(cv_2w[1])) { - cv_trend_short <- calculate_cv_trend(cv_current, cv_2w[1]) - } - } - - if (length(historical_data) >= 8) { - cv_8w <- historical_data[[8]]$data %>% - filter(Field_id == !!field_id) %>% - pull(CV) - if (length(cv_8w) > 0 && !is.na(cv_8w[1])) { - cv_trend_long <- calculate_cv_trend(cv_current, 
cv_8w[1]) - } - } - } - - last_harvest_date <- NA_character_ - if (!is.null(harvesting_data) && nrow(harvesting_data) > 0) { - last_harvest_row <- harvesting_data %>% - filter(field == !!field_id) %>% - arrange(desc(season_start)) %>% - slice(1) - - if (nrow(last_harvest_row) > 0 && !is.na(last_harvest_row$season_start[1])) { - last_harvest_date <- as.character(last_harvest_row$season_start[1]) - } - } - - result <- data.frame( - Field_id = field_id, - Farm_Section = farm_section, - Field_name = field_name, - Hectare = round(field_area_ha, 2), - Acreage = round(field_area_acres, 2), - Mean_CI = round(mean_ci_current, 2), - Weekly_ci_change = if (is.na(weekly_ci_change)) NA_real_ else round(weekly_ci_change, 2), - Weekly_ci_change_str = weekly_ci_change_str, - Four_week_trend = four_week_trend, - Last_harvest_or_planting_date = last_harvest_date, - Age_week = if (is.na(age_weeks)) NA_integer_ else as.integer(round(age_weeks)), - `Phase (age based)` = phase, - nmr_weeks_in_this_phase = nmr_weeks_in_phase, - Germination_progress = germination_progress_str, - Imminent_prob = imminent_prob_val, - Status_trigger = status_trigger, - CI_range = range_str, - CI_Percentiles = ci_percentiles_str, - CV = round(cv_current, 4), - CV_Trend_Short_Term = cv_trend_short, - CV_Trend_Long_Term = cv_trend_long, - Cloud_pct_clear = pct_clear, - Cloud_pct_clear_interval = cloud_interval, - Cloud_pct = cloud_pct, - Cloud_category = cloud_cat, - stringsAsFactors = FALSE - ) - - return(result) - - }, error = function(e) { - message(paste("Error analyzing field", field_idx, ":", e$message)) - return(data.frame( - Field_id = NA_character_, - error = e$message - )) - }) -} - -# ============================================================================ -# SUMMARY GENERATION -# ============================================================================ - -generate_field_analysis_summary <- function(field_df) { - message("Generating summary statistics...") - - total_acreage <- 
sum(field_df$Acreage, na.rm = TRUE) - - germination_acreage <- sum(field_df$Acreage[field_df$Phase == "Germination"], na.rm = TRUE) - tillering_acreage <- sum(field_df$Acreage[field_df$Phase == "Tillering"], na.rm = TRUE) - grand_growth_acreage <- sum(field_df$Acreage[field_df$Phase == "Grand Growth"], na.rm = TRUE) - maturation_acreage <- sum(field_df$Acreage[field_df$Phase == "Maturation"], na.rm = TRUE) - unknown_phase_acreage <- sum(field_df$Acreage[field_df$Phase == "Unknown"], na.rm = TRUE) - - harvest_ready_acreage <- sum(field_df$Acreage[field_df$Status_trigger == "harvest_ready"], na.rm = TRUE) - stress_acreage <- sum(field_df$Acreage[field_df$Status_trigger == "stress_detected_whole_field"], na.rm = TRUE) - recovery_acreage <- sum(field_df$Acreage[field_df$Status_trigger == "strong_recovery"], na.rm = TRUE) - growth_on_track_acreage <- sum(field_df$Acreage[field_df$Status_trigger == "growth_on_track"], na.rm = TRUE) - germination_complete_acreage <- sum(field_df$Acreage[field_df$Status_trigger == "germination_complete"], na.rm = TRUE) - germination_started_acreage <- sum(field_df$Acreage[field_df$Status_trigger == "germination_started"], na.rm = TRUE) - no_trigger_acreage <- sum(field_df$Acreage[is.na(field_df$Status_trigger)], na.rm = TRUE) - - clear_fields <- sum(field_df$Cloud_category == "Clear view", na.rm = TRUE) - partial_fields <- sum(field_df$Cloud_category == "Partial coverage", na.rm = TRUE) - no_image_fields <- sum(field_df$Cloud_category == "No image available", na.rm = TRUE) - total_fields <- nrow(field_df) - - clear_acreage <- sum(field_df$Acreage[field_df$Cloud_category == "Clear view"], na.rm = TRUE) - partial_acreage <- sum(field_df$Acreage[field_df$Cloud_category == "Partial coverage"], na.rm = TRUE) - no_image_acreage <- sum(field_df$Acreage[field_df$Cloud_category == "No image available"], na.rm = TRUE) - - summary_df <- data.frame( - Category = c( - "--- PHASE DISTRIBUTION ---", - "Germination", - "Tillering", - "Grand Growth", - 
"Maturation", - "Unknown phase", - "--- STATUS TRIGGERS ---", - "Harvest ready", - "Stress detected", - "Strong recovery", - "Growth on track", - "Germination complete", - "Germination started", - "No trigger", - "--- CLOUD COVERAGE (FIELDS) ---", - "Clear view", - "Partial coverage", - "No image available", - "--- CLOUD COVERAGE (ACREAGE) ---", - "Clear view", - "Partial coverage", - "No image available", - "--- TOTAL ---", - "Total Acreage" - ), - Acreage = c( - NA, - round(germination_acreage, 2), - round(tillering_acreage, 2), - round(grand_growth_acreage, 2), - round(maturation_acreage, 2), - round(unknown_phase_acreage, 2), - NA, - round(harvest_ready_acreage, 2), - round(stress_acreage, 2), - round(recovery_acreage, 2), - round(growth_on_track_acreage, 2), - round(germination_complete_acreage, 2), - round(germination_started_acreage, 2), - round(no_trigger_acreage, 2), - NA, - paste0(clear_fields, " fields"), - paste0(partial_fields, " fields"), - paste0(no_image_fields, " fields"), - NA, - round(clear_acreage, 2), - round(partial_acreage, 2), - round(no_image_acreage, 2), - NA, - round(total_acreage, 2) - ), - stringsAsFactors = FALSE - ) - - return(summary_df) -} - -# ============================================================================ -# EXPORT FUNCTIONS -# ============================================================================ - -export_field_analysis_excel <- function(field_df, summary_df, project_dir, current_week, reports_dir) { - message("Exporting per-field analysis to Excel, CSV, and RDS...") - - # Round all numeric columns to 2 decimals - field_df_rounded <- field_df %>% - mutate(across(where(is.numeric), ~ round(., 2))) - - summary_df_rounded <- summary_df %>% - mutate(across(where(is.numeric), ~ round(., 2))) - - output_subdir <- file.path(reports_dir, "kpis", "field_analysis") - if (!dir.exists(output_subdir)) { - dir.create(output_subdir, recursive = TRUE) - } - - excel_filename <- paste0(project_dir, "_field_analysis_week", 
sprintf("%02d", current_week), ".xlsx") - excel_path <- file.path(output_subdir, excel_filename) - excel_path <- normalizePath(excel_path, winslash = "\\", mustWork = FALSE) - - sheets <- list( - "Field Data" = field_df_rounded, - "Summary" = summary_df_rounded - ) - - write_xlsx(sheets, excel_path) - message(paste("✓ Field analysis Excel exported to:", excel_path)) - - kpi_data <- list( - field_analysis = field_df_rounded, - field_analysis_summary = summary_df_rounded, - metadata = list( - current_week = current_week, - project = project_dir, - created_at = Sys.time() - ) - ) - - rds_filename <- paste0(project_dir, "_kpi_summary_tables_week", sprintf("%02d", current_week), ".rds") - rds_path <- file.path(reports_dir, "kpis", rds_filename) - - saveRDS(kpi_data, rds_path) - message(paste("✓ Field analysis RDS exported to:", rds_path)) - - csv_filename <- paste0(project_dir, "_field_analysis_week", sprintf("%02d", current_week), ".csv") - csv_path <- file.path(output_subdir, csv_filename) - write_csv(field_df_rounded, csv_path) - message(paste("✓ Field analysis CSV exported to:", csv_path)) - - return(list(excel = excel_path, rds = rds_path, csv = csv_path)) -} - -# ============================================================================ -# TILE-BASED KPI EXTRACTION FUNCTION -# ============================================================================ - -calculate_field_kpis_from_tiles <- function(tile_dir, week_num, year, field_boundaries_sf, tile_grid) { - # Loop through tiles, extract KPI statistics per field per tile - # Follows the same pattern as extract_ci_from_tiles in CI extraction - - message("Calculating field-level KPI statistics from tiles...") - - # Get all tile files for this week - tile_pattern <- sprintf("week_%02d_%d_([0-9]{2})\\.tif", week_num, year) - tile_files <- list.files(tile_dir, pattern = tile_pattern, full.names = TRUE) - - if (length(tile_files) == 0) { - message("No tiles found for week", week_num, year) - return(NULL) - } - - # 
Process tiles in parallel using furrr (same as CI extraction) - message(paste("Processing", length(tile_files), "tiles in parallel...")) - - field_kpi_list <- furrr::future_map( - tile_files, - ~ process_single_kpi_tile( - tile_file = ., - field_boundaries_sf = field_boundaries_sf, - tile_grid = tile_grid - ), - .progress = TRUE, - .options = furrr::furrr_options(seed = TRUE) - ) - - # Combine results from all tiles - field_kpi_stats <- dplyr::bind_rows(field_kpi_list) - - if (nrow(field_kpi_stats) == 0) { - message(" No KPI data extracted from tiles") - return(NULL) - } - - message(paste(" Extracted KPI stats for", length(unique(field_kpi_stats$field)), "unique fields")) - return(field_kpi_stats) -} - -# Helper function to process a single tile (like process_single_tile in CI extraction) -process_single_kpi_tile <- function(tile_file, field_boundaries_sf, tile_grid) { - tryCatch({ - tile_basename <- basename(tile_file) - # Load tile raster - tile_raster <- terra::rast(tile_file) - - # Get first band (CI band for weekly mosaics) - ci_band <- tile_raster[[1]] - - # EXACTLY LIKE SCRIPT 20: Crop to field bounding box first, then extract with sf directly - field_bbox <- sf::st_bbox(field_boundaries_sf) - ci_cropped <- terra::crop(ci_band, terra::ext(field_bbox), snap = "out") - - # Extract CI values for ALL fields at once using sf object directly (NOT terra::vect) - # terra::extract() works with sf objects and handles geometries properly - extracted_vals <- terra::extract(ci_cropped, field_boundaries_sf, fun = "mean", na.rm = TRUE) - - # Initialize results for this tile - tile_results <- data.frame() - - # Get tile ID from filename - tile_id_match <- as.numeric(sub(".*_(\\d{2})\\.tif$", "\\1", tile_basename)) - - # Process each field: extracted_vals is a data.frame with ID column (field indices) + extracted values - for (field_idx in seq_len(nrow(field_boundaries_sf))) { - field_id <- field_boundaries_sf$field[field_idx] - - # extracted_vals columns: 1=ID, 2=mean_CI 
(since we used fun="mean") - mean_ci <- extracted_vals[field_idx, 2] - - # Skip if no data for this field in this tile - if (is.na(mean_ci)) { - next - } - - # For tile-level stats, we only have mean from extraction (no variance without all pixels) - # Add to results - tile_results <- rbind(tile_results, data.frame( - field = field_id, - tile_id = tile_id_match, - tile_file = tile_basename, - mean_ci = round(mean_ci, 4), - stringsAsFactors = FALSE - )) - } - - return(tile_results) - - }, error = function(e) { - message(paste(" Warning: Error processing tile", basename(tile_file), ":", e$message)) - return(data.frame()) - }) -} - -calculate_and_export_farm_kpis <- function(report_date, project_dir, field_boundaries_sf, - harvesting_data, cumulative_CI_vals_dir, - weekly_CI_mosaic, reports_dir, current_week, year, - tile_grid, use_tile_mosaic = FALSE, tile_grid_size = "5x5") { - message("\n=== CALCULATING FARM-LEVEL KPIs ===") - message("(6 high-level KPI metrics with tile-based extraction)") - - output_dir <- file.path(reports_dir, "kpis") - if (!dir.exists(output_dir)) { - dir.create(output_dir, recursive = TRUE) - } - - # Get mosaic directory with grid size if using tiles - mosaic_dir <- if (use_tile_mosaic && !is.null(tile_grid_size)) { - file.path(weekly_CI_mosaic, tile_grid_size) - } else { - weekly_CI_mosaic - } - - # Extract field-level KPI statistics from tiles - field_kpi_stats <- calculate_field_kpis_from_tiles( - tile_dir = mosaic_dir, - week_num = current_week, - year = year, - field_boundaries_sf = field_boundaries_sf, - tile_grid = tile_grid - ) - - if (is.null(field_kpi_stats) || nrow(field_kpi_stats) == 0) { - message("Warning: No field KPI statistics extracted from tiles") - return(NULL) - } - - # Aggregate tile-based statistics by field (average across tiles for each field) - field_summary_stats <- field_kpi_stats %>% - dplyr::group_by(field) %>% - dplyr::summarise( - mean_ci = mean(mean_ci, na.rm = TRUE), - cv_ci = mean(cv_ci, na.rm = TRUE), - 
min_ci = min(min_ci, na.rm = TRUE), - max_ci = max(max_ci, na.rm = TRUE), - total_pixels = sum(n_pixels, na.rm = TRUE), - num_tiles = n_distinct(tile_id), - .groups = 'drop' - ) - - # Create results list - kpi_results <- list( - field_kpi_stats = field_kpi_stats, - field_summary_stats = field_summary_stats, - metadata = list( - report_date = report_date, - current_week = current_week, - year = year, - calculation_method = "tile_based_extraction", - num_fields_processed = length(unique(field_kpi_stats$field)), - num_tiles_processed = length(unique(field_kpi_stats$tile_id)) - ) - ) - - # Save results - rds_filename <- paste0(project_dir, "_farm_kpi_stats_week", sprintf("%02d", current_week), ".rds") - rds_path <- file.path(output_dir, rds_filename) - saveRDS(kpi_results, rds_path) - message(paste("✓ Farm-level KPI stats exported to:", rds_path)) - - # Print summary - cat("\n=== FARM-LEVEL KPI SUMMARY ===\n") - cat("Report Date:", as.character(report_date), "\n") - cat("Week:", current_week, "Year:", year, "\n") - cat("Fields Processed:", length(unique(field_kpi_stats$field)), "\n") - cat("Tiles Processed:", length(unique(field_kpi_stats$tile_id)), "\n") - cat("\n--- Field Summary Statistics (Mean across tiles) ---\n") - print(head(field_summary_stats, 20)) - - return(kpi_results) -} - -# ============================================================================ -# HELPER: Extract field-level statistics from CI raster (all pixels, single call) -# ============================================================================ - -extract_field_statistics_from_ci <- function(ci_band, field_boundaries_sf) { - #' Extract CI statistics for all fields from a single CI raster band - #' - #' This function extracts all pixel values for each field in one terra::extract call, - #' then calculates mean, CV, and percentiles from those pixels. 
- #' - #' @param ci_band Single CI band from terra raster - #' @param field_boundaries_sf SF object with field geometries - #' @return Data frame with columns: field_idx, mean_ci, cv, p10, p90, min_ci, max_ci, pixel_count_valid, pixel_count_total - - # SINGLE EXTRACTION: Get all pixels for all fields at once (no aggregation function) - # Result: data.frame with ID column (field indices) and value column (pixel values) - extract_result <- terra::extract(ci_band, field_boundaries_sf) - - # Calculate statistics for each field - stats_list <- list() - - for (field_idx in seq_len(nrow(field_boundaries_sf))) { - # Get all pixels for this field from the single extraction - # extract_result has columns [ID, value] where ID is field index (1-based) - field_pixels <- extract_result[extract_result$ID == field_idx, 2] - pixels <- as.numeric(field_pixels[!is.na(field_pixels)]) # Remove NAs - - if (length(pixels) == 0) { - # No data for this field - stats_list[[field_idx]] <- data.frame( - field_idx = field_idx, - mean_ci = NA_real_, - cv = NA_real_, - p10 = NA_real_, - p90 = NA_real_, - min_ci = NA_real_, - max_ci = NA_real_, - pixel_count_valid = 0, - pixel_count_total = 0, - stringsAsFactors = FALSE - ) - next - } - - # Calculate all statistics from pixels array - mean_val <- mean(pixels, na.rm = TRUE) - cv_val <- if (mean_val > 0) sd(pixels, na.rm = TRUE) / mean_val else NA_real_ - p10_val <- quantile(pixels, probs = CI_PERCENTILE_LOW, na.rm = TRUE)[[1]] - p90_val <- quantile(pixels, probs = CI_PERCENTILE_HIGH, na.rm = TRUE)[[1]] - min_val <- min(pixels, na.rm = TRUE) - max_val <- max(pixels, na.rm = TRUE) - - stats_list[[field_idx]] <- data.frame( - field_idx = field_idx, - mean_ci = mean_val, - cv = cv_val, - p10 = p10_val, - p90 = p90_val, - min_ci = min_val, - max_ci = max_val, - pixel_count_valid = length(pixels), - pixel_count_total = nrow(extract_result[extract_result$ID == field_idx, ]), - stringsAsFactors = FALSE - ) - } - - return(dplyr::bind_rows(stats_list)) -} - 
# ============================================================================ # MAIN # ============================================================================ diff --git a/r_app/80_report_building_utils.R b/r_app/80_report_building_utils.R new file mode 100644 index 0000000..35f20a7 --- /dev/null +++ b/r_app/80_report_building_utils.R @@ -0,0 +1,249 @@ +# 80_REPORT_BUILDING_UTILS.R +# ============================================================================ +# UTILITY FUNCTIONS FOR REPORT GENERATION AND EXCEL/CSV EXPORT +# +# This file contains reusable functions for: +# - Field analysis summary generation +# - Excel/CSV/RDS export functionality +# - Farm-level KPI aggregation and summary +# - Tile-based KPI extraction (alternative calculation method) +# +# Used by: 80_calculate_kpis.R, run_full_pipeline.R, other reporting scripts +# ============================================================================ + +# ============================================================================ +# SUMMARY GENERATION +# ============================================================================ + +generate_field_analysis_summary <- function(field_df) { + message("Generating summary statistics...") + + total_acreage <- sum(field_df$Acreage, na.rm = TRUE) + + germination_acreage <- sum(field_df$Acreage[field_df$Phase == "Germination"], na.rm = TRUE) + tillering_acreage <- sum(field_df$Acreage[field_df$Phase == "Tillering"], na.rm = TRUE) + grand_growth_acreage <- sum(field_df$Acreage[field_df$Phase == "Grand Growth"], na.rm = TRUE) + maturation_acreage <- sum(field_df$Acreage[field_df$Phase == "Maturation"], na.rm = TRUE) + unknown_phase_acreage <- sum(field_df$Acreage[field_df$Phase == "Unknown"], na.rm = TRUE) + + harvest_ready_acreage <- sum(field_df$Acreage[field_df$Status_trigger == "harvest_ready"], na.rm = TRUE) + stress_acreage <- sum(field_df$Acreage[field_df$Status_trigger == "stress_detected_whole_field"], na.rm = TRUE) + recovery_acreage <- 
sum(field_df$Acreage[field_df$Status_trigger == "strong_recovery"], na.rm = TRUE) + growth_on_track_acreage <- sum(field_df$Acreage[field_df$Status_trigger == "growth_on_track"], na.rm = TRUE) + germination_complete_acreage <- sum(field_df$Acreage[field_df$Status_trigger == "germination_complete"], na.rm = TRUE) + germination_started_acreage <- sum(field_df$Acreage[field_df$Status_trigger == "germination_started"], na.rm = TRUE) + no_trigger_acreage <- sum(field_df$Acreage[is.na(field_df$Status_trigger)], na.rm = TRUE) + + clear_fields <- sum(field_df$Cloud_category == "Clear view", na.rm = TRUE) + partial_fields <- sum(field_df$Cloud_category == "Partial coverage", na.rm = TRUE) + no_image_fields <- sum(field_df$Cloud_category == "No image available", na.rm = TRUE) + total_fields <- nrow(field_df) + + clear_acreage <- sum(field_df$Acreage[field_df$Cloud_category == "Clear view"], na.rm = TRUE) + partial_acreage <- sum(field_df$Acreage[field_df$Cloud_category == "Partial coverage"], na.rm = TRUE) + no_image_acreage <- sum(field_df$Acreage[field_df$Cloud_category == "No image available"], na.rm = TRUE) + + summary_df <- data.frame( + Category = c( + "--- PHASE DISTRIBUTION ---", + "Germination", + "Tillering", + "Grand Growth", + "Maturation", + "Unknown phase", + "--- STATUS TRIGGERS ---", + "Harvest ready", + "Stress detected", + "Strong recovery", + "Growth on track", + "Germination complete", + "Germination started", + "No trigger", + "--- CLOUD COVERAGE (FIELDS) ---", + "Clear view", + "Partial coverage", + "No image available", + "--- CLOUD COVERAGE (ACREAGE) ---", + "Clear view", + "Partial coverage", + "No image available", + "--- TOTAL ---", + "Total Acreage" + ), + Acreage = c( + NA, + round(germination_acreage, 2), + round(tillering_acreage, 2), + round(grand_growth_acreage, 2), + round(maturation_acreage, 2), + round(unknown_phase_acreage, 2), + NA, + round(harvest_ready_acreage, 2), + round(stress_acreage, 2), + round(recovery_acreage, 2), + 
round(growth_on_track_acreage, 2), + round(germination_complete_acreage, 2), + round(germination_started_acreage, 2), + round(no_trigger_acreage, 2), + NA, + paste0(clear_fields, " fields"), + paste0(partial_fields, " fields"), + paste0(no_image_fields, " fields"), + NA, + round(clear_acreage, 2), + round(partial_acreage, 2), + round(no_image_acreage, 2), + NA, + round(total_acreage, 2) + ), + stringsAsFactors = FALSE + ) + + return(summary_df) +} + +# ============================================================================ +# EXPORT FUNCTIONS +# ============================================================================ + +export_field_analysis_excel <- function(field_df, summary_df, project_dir, current_week, reports_dir) { + message("Exporting per-field analysis to Excel, CSV, and RDS...") + + field_df_rounded <- field_df %>% + mutate(across(where(is.numeric), ~ round(., 2))) + + summary_df_rounded <- summary_df %>% + mutate(across(where(is.numeric), ~ round(., 2))) + + output_subdir <- file.path(reports_dir, "kpis", "field_analysis") + if (!dir.exists(output_subdir)) { + dir.create(output_subdir, recursive = TRUE) + } + + excel_filename <- paste0(project_dir, "_field_analysis_week", sprintf("%02d", current_week), ".xlsx") + excel_path <- file.path(output_subdir, excel_filename) + excel_path <- normalizePath(excel_path, winslash = "\\", mustWork = FALSE) + + sheets <- list( + "Field Data" = field_df_rounded, + "Summary" = summary_df_rounded + ) + + write_xlsx(sheets, excel_path) + message(paste("✓ Field analysis Excel exported to:", excel_path)) + + kpi_data <- list( + field_analysis = field_df_rounded, + field_analysis_summary = summary_df_rounded, + metadata = list( + current_week = current_week, + project = project_dir, + created_at = Sys.time() + ) + ) + + rds_filename <- paste0(project_dir, "_kpi_summary_tables_week", sprintf("%02d", current_week), ".rds") + rds_path <- file.path(reports_dir, "kpis", rds_filename) + + saveRDS(kpi_data, rds_path) + 
message(paste("✓ Field analysis RDS exported to:", rds_path)) + + csv_filename <- paste0(project_dir, "_field_analysis_week", sprintf("%02d", current_week), ".csv") + csv_path <- file.path(output_subdir, csv_filename) + write_csv(field_df_rounded, csv_path) + message(paste("✓ Field analysis CSV exported to:", csv_path)) + + return(list(excel = excel_path, rds = rds_path, csv = csv_path)) +} + +# ============================================================================ +# TILE-BASED KPI EXTRACTION (Alternative calculation method) +# ============================================================================ + +# [COMMENTED OUT / UNUSED - kept for reference] +# These functions provide tile-based extraction as an alternative to field_statistics approach +# Currently replaced by calculate_field_statistics() in 80_weekly_stats_utils.R +# Uncomment if parallel processing of tiles is needed in future + +# calculate_field_kpis_from_tiles <- function(tile_dir, week_num, year, field_boundaries_sf, tile_grid) { +# message("Calculating field-level KPI statistics from tiles...") +# +# tile_pattern <- sprintf("week_%02d_%d_([0-9]{2})\\.tif", week_num, year) +# tile_files <- list.files(tile_dir, pattern = tile_pattern, full.names = TRUE) +# +# if (length(tile_files) == 0) { +# message("No tiles found for week", week_num, year) +# return(NULL) +# } +# +# message(paste("Processing", length(tile_files), "tiles in parallel...")) +# +# field_kpi_list <- furrr::future_map( +# tile_files, +# ~ process_single_kpi_tile( +# tile_file = ., +# field_boundaries_sf = field_boundaries_sf, +# tile_grid = tile_grid +# ), +# .progress = TRUE, +# .options = furrr::furrr_options(seed = TRUE) +# ) +# +# field_kpi_stats <- dplyr::bind_rows(field_kpi_list) +# +# if (nrow(field_kpi_stats) == 0) { +# message(" No KPI data extracted from tiles") +# return(NULL) +# } +# +# message(paste(" Extracted KPI stats for", length(unique(field_kpi_stats$field)), "unique fields")) +# return(field_kpi_stats) +# } 
+ +# process_single_kpi_tile <- function(tile_file, field_boundaries_sf, tile_grid) { +# # Helper function for calculate_field_kpis_from_tiles +# tryCatch({ +# tile_basename <- basename(tile_file) +# tile_raster <- terra::rast(tile_file) +# ci_band <- tile_raster[[1]] +# +# field_bbox <- sf::st_bbox(field_boundaries_sf) +# ci_cropped <- terra::crop(ci_band, terra::ext(field_bbox), snap = "out") +# +# extracted_vals <- terra::extract(ci_cropped, field_boundaries_sf, fun = "mean", na.rm = TRUE) +# +# tile_results <- data.frame() +# tile_id_match <- as.numeric(sub(".*_(\\d{2})\\.tif$", "\\1", tile_basename)) +# +# for (field_idx in seq_len(nrow(field_boundaries_sf))) { +# field_id <- field_boundaries_sf$field[field_idx] +# mean_ci <- extracted_vals[field_idx, 2] +# +# if (is.na(mean_ci)) { +# next +# } +# +# tile_results <- rbind(tile_results, data.frame( +# field = field_id, +# tile_id = tile_id_match, +# tile_file = tile_basename, +# mean_ci = round(mean_ci, 4), +# stringsAsFactors = FALSE +# )) +# } +# +# return(tile_results) +# +# }, error = function(e) { +# message(paste(" Warning: Error processing tile", basename(tile_file), ":", e$message)) +# return(data.frame()) +# }) +# } + +# calculate_and_export_farm_kpis <- function(report_date, project_dir, field_boundaries_sf, +# harvesting_data, cumulative_CI_vals_dir, +# weekly_CI_mosaic, reports_dir, current_week, year, +# tile_grid, use_tile_mosaic = FALSE, tile_grid_size = "5x5") { +# # Farm-level KPI calculation using tile-based extraction (alternative approach) +# # [Implementation kept as reference for alternative calculation method] +# } diff --git a/r_app/80_weekly_stats_utils.R b/r_app/80_weekly_stats_utils.R new file mode 100644 index 0000000..3a5f1d2 --- /dev/null +++ b/r_app/80_weekly_stats_utils.R @@ -0,0 +1,953 @@ +# 80_WEEKLY_STATS_UTILS.R +# ============================================================================ +# UTILITY FUNCTIONS FOR WEEKLY STATISTICS CALCULATION +# +# This file contains 
get_tile_ids_for_field <- function(field_geom, tile_grid, field_id = NULL) {
  #' Return the ids of all tiles whose extent overlaps a field's bounding box
  #'
  #' @param field_geom Field geometry, either an sf object or a terra SpatVector
  #' @param tile_grid data.frame with columns id, xmin, xmax, ymin, ymax
  #' @param field_id Accepted for interface compatibility; not used here
  #' @return Numeric vector of tile ids (possibly empty)

  # Pull the bounding box corners using the accessor of whichever
  # spatial class was supplied.
  if (inherits(field_geom, "sf")) {
    bb <- sf::st_bbox(field_geom)
    x_lo <- bb["xmin"]
    x_hi <- bb["xmax"]
    y_lo <- bb["ymin"]
    y_hi <- bb["ymax"]
  } else if (inherits(field_geom, "SpatVector")) {
    bb <- terra::ext(field_geom)
    x_lo <- bb$xmin
    x_hi <- bb$xmax
    y_lo <- bb$ymin
    y_hi <- bb$ymax
  } else {
    stop("field_geom must be sf or terra::vect object")
  }

  # Two axis-aligned rectangles overlap when they overlap on both axes
  # (touching edges count as overlap).
  overlaps <- tile_grid$xmax >= x_lo &
    tile_grid$xmin <= x_hi &
    tile_grid$ymax >= y_lo &
    tile_grid$ymin <= y_hi

  as.numeric(tile_grid$id[overlaps])
}
build_tile_grid <- function(mosaic_dir, week_num, year) {
  #' Build a lookup table of tile extents for one week's tile mosaics
  #'
  #' If `mosaic_dir` contains a grid-size subdirectory (a name made of
  #' digits, "x", digits, e.g. "5x5"), the first such subdirectory is used
  #' as the tile directory.
  #'
  #' @param mosaic_dir Directory holding the tile GeoTIFFs (or its parent)
  #' @param week_num Week number used in the tile file names
  #' @param year Year used in the tile file names
  #' @return list with `tile_grid` (data.frame: id, xmin, xmax, ymin, ymax),
  #'   `mosaic_dir` (the directory actually scanned) and `grid_size`
  #'   (detected subdirectory name, or NA)
  #' Stops when no tile file matches or no extent could be read.

  # Handle grid-size subdirectories (e.g., weekly_tile_max/5x5/)
  detected_grid_size <- NA
  if (dir.exists(mosaic_dir)) {
    subfolders <- list.dirs(mosaic_dir, full.names = FALSE, recursive = FALSE)
    grid_patterns <- grep("^\\d+x\\d+$", subfolders, value = TRUE)

    if (length(grid_patterns) > 0) {
      detected_grid_size <- grid_patterns[1]
      mosaic_dir <- file.path(mosaic_dir, detected_grid_size)
      message(paste(" Using grid-size subdirectory:", detected_grid_size))
    }
  }

  tile_pattern <- sprintf("week_%02d_%d_([0-9]{2})\\.tif", week_num, year)
  tile_files <- list.files(mosaic_dir, pattern = tile_pattern, full.names = TRUE)

  if (length(tile_files) == 0) {
    stop(paste("No tile files found for week", week_num, year, "in", mosaic_dir))
  }

  tile_grid <- data.frame(
    id = integer(),
    xmin = numeric(),
    xmax = numeric(),
    ymin = numeric(),
    ymax = numeric(),
    stringsAsFactors = FALSE
  )

  for (tile_file in tile_files) {
    tryCatch({
      tile_name <- basename(tile_file)
      matches <- regmatches(tile_name, regexpr("_([0-9]{2})\\.tif$", tile_name))
      if (length(matches) > 0) {
        # `matches[1]` looks like "_07.tif". Extract the two digits with a
        # capture group; the previous sub("_|\\.tif", "", ...) removed only
        # the leading "_" and left "07.tif", which as.integer() turns into NA.
        tile_id <- as.integer(sub("^_([0-9]{2})\\.tif$", "\\1", matches[1]))
        tile_rast <- terra::rast(tile_file)
        tile_ext <- terra::ext(tile_rast)
        tile_grid <- rbind(tile_grid, data.frame(
          id = tile_id,
          xmin = tile_ext$xmin,
          xmax = tile_ext$xmax,
          ymin = tile_ext$ymin,
          ymax = tile_ext$ymax,
          stringsAsFactors = FALSE
        ))
      }
    }, error = function(e) {
      # Best effort: an unreadable tile is reported and left out of the grid.
      message(paste(" Warning: Could not process tile", basename(tile_file), ":", e$message))
    })
  }

  if (nrow(tile_grid) == 0) {
    stop("Could not extract extents from any tile files")
  }

  return(list(
    tile_grid = tile_grid,
    mosaic_dir = mosaic_dir,
    grid_size = detected_grid_size
  ))
}
calculate_cv_trend_long_term <- function(cv_values) {
  #' Long-term CV trend: slope of a straight-line fit through the CV series
  #'
  #' Missing values are dropped first; the remaining values are regressed
  #' on their position (1, 2, 3, ...).
  #'
  #' @param cv_values Numeric vector of CV values, oldest first
  #' @return Slope rounded to 4 decimals, or NA_real_ when fewer than two
  #'   non-missing values are available or the fit fails

  if (is.null(cv_values) || length(cv_values) == 0) {
    return(NA_real_)
  }

  observed <- cv_values[!is.na(cv_values)]
  if (length(observed) < 2) {
    return(NA_real_)
  }

  week_index <- seq_along(observed)

  fitted_slope <- tryCatch(
    {
      fit <- lm(observed ~ week_index)
      # Second coefficient is the slope on the week index.
      round(as.numeric(coef(fit)[[2]]), 4)
    },
    error = function(e) NA_real_
  )

  fitted_slope
}
get_status_trigger <- function(ci_values, ci_change, age_weeks) {
  #' Derive a status trigger label for a field from its CI pixels and age
  #'
  #' Rules are evaluated in order and the first match wins:
  #'   1. age 0-6 weeks: "germination_complete" (>= 70% of pixels with CI >= 2)
  #'      or "germination_started" (> 10% of pixels with CI > 2)
  #'   2. age >= 45 weeks: "harvest_ready"
  #'   3. age > 6, CI change < -1.5 and CV < 0.25: "stress_detected_whole_field"
  #'   4. age > 6, CI change > 1.5: "strong_recovery"
  #'   5. age in [4, 39), CI change > 0.2: "growth_on_track"
  #'   6. age in [39, 45), mean CI > 3.5: "maturation_progressing"
  #'
  #' @param ci_values Numeric vector of CI pixel values (NAs are dropped)
  #' @param ci_change Week-over-week change in mean CI, may be NA
  #' @param age_weeks Field age in weeks, may be NA
  #' @return Character label, or NA_character_ when nothing applies or
  #'   there is no usable input

  if (is.na(age_weeks) || length(ci_values) == 0) return(NA_character_)

  ci_values <- ci_values[!is.na(ci_values)]
  n_values <- length(ci_values)
  if (n_values == 0) return(NA_character_)

  mean_ci <- mean(ci_values)
  pct_above_2 <- sum(ci_values > 2) / n_values * 100
  pct_at_or_above_2 <- sum(ci_values >= 2) / n_values * 100

  # sd() of a single value is NA, which used to make the stress rule's
  # `if` fail with "missing value where TRUE/FALSE needed". A single pixel
  # has no measurable spread, so its CV is taken as 0.
  ci_cv <- if (n_values > 1 && mean_ci > 0) sd(ci_values) / mean_ci else 0

  if (age_weeks >= 0 && age_weeks <= 6) {
    if (pct_at_or_above_2 >= 70) {
      return("germination_complete")
    } else if (pct_above_2 > 10) {
      return("germination_started")
    }
  }

  if (age_weeks >= 45) {
    return("harvest_ready")
  }

  has_change <- !is.na(ci_change)

  if (age_weeks > 6 && has_change && ci_change < -1.5 && ci_cv < 0.25) {
    return("stress_detected_whole_field")
  }

  if (age_weeks > 6 && has_change && ci_change > 1.5) {
    return("strong_recovery")
  }

  if (age_weeks >= 4 && age_weeks < 39 && has_change && ci_change > 0.2) {
    return("growth_on_track")
  }

  if (age_weeks >= 39 && age_weeks < 45 && mean_ci > 3.5) {
    return("maturation_progressing")
  }

  return(NA_character_)
}
calculate_field_statistics <- function(field_boundaries_sf, week_num, year,
                                       mosaic_dir, report_date = Sys.Date()) {
  #' Calculate per-field CI statistics for one week from tile mosaics
  #'
  #' Every tile matching "week_<WW>_<YYYY>_<NN>.tif" in `mosaic_dir` is read,
  #' its "CI" layer is extracted over the field polygons, and one row of
  #' statistics is produced per field. When a field appears in several tiles,
  #' the first tile that yields non-missing CI values for it wins.
  #'
  #' @param field_boundaries_sf Field polygons with a `field` id column
  #' @param week_num Week number used in the tile file names
  #' @param year Year used in the tile file names
  #' @param mosaic_dir Directory containing the tile GeoTIFFs
  #' @param report_date Date used for the age calculation
  #' @return data.frame with Field_id, Mean_CI, CV, CI_range, CI_Percentiles,
  #'   Cloud_pct_clear, Cloud_category, Age_week, Phase, Germination_progress
  #' Stops when no tile files are found or no field could be processed.

  message(paste("Calculating statistics for all fields - Week", week_num, year))

  tile_pattern <- sprintf("week_%02d_%d_([0-9]{2})\\.tif", week_num, year)
  tile_files <- list.files(mosaic_dir, pattern = tile_pattern, full.names = TRUE)

  if (length(tile_files) == 0) {
    stop(paste("No tile files found for week", week_num, year, "in", mosaic_dir))
  }

  message(paste(" Found", length(tile_files), "tiles for week", week_num))

  results_list <- list()
  # Ids already emitted, so a field covered by several tiles is reported once
  # without rescanning results_list for every polygon.
  seen_field_ids <- character()

  for (tile_idx in seq_along(tile_files)) {
    tile_file <- tile_files[tile_idx]

    tryCatch({
      current_rast <- terra::rast(tile_file)

      # NOTE: no return() in here. The tryCatch expression is evaluated in
      # this function's frame, so return(NULL) would abort the whole
      # calculation rather than skip one tile.
      if (!("CI" %in% names(current_rast))) {
        message(paste(" [SKIP] Tile", basename(tile_file), "- CI band not found"))
      } else {
        ci_band <- current_rast[["CI"]]

        extracted <- terra::extract(ci_band, field_boundaries_sf, na.rm = FALSE)
        unique_field_ids <- unique(extracted$ID[!is.na(extracted$ID)])

        # Age and phase do not depend on the field, so compute them once
        # per tile instead of once per polygon.
        age_weeks <- NA_real_
        if (USE_UNIFORM_AGE) {
          age_weeks <- as.numeric(difftime(report_date, UNIFORM_PLANTING_DATE, units = "weeks"))
        }
        phase <- get_phase_by_age(age_weeks)

        for (field_poly_idx in unique_field_ids) {
          field_id <- field_boundaries_sf$field[field_poly_idx]

          field_rows <- extracted[extracted$ID == field_poly_idx, ]
          ci_vals <- field_rows$CI
          ci_vals <- ci_vals[!is.na(ci_vals)]

          if (length(ci_vals) == 0) {
            next
          }

          # Skip before doing any work if an earlier tile already
          # supplied this field.
          if (as.character(field_id) %in% seen_field_ids) {
            next
          }

          mean_ci <- mean(ci_vals, na.rm = TRUE)
          ci_std <- sd(ci_vals, na.rm = TRUE)
          cv <- if (mean_ci > 0) ci_std / mean_ci else NA_real_
          range_min <- min(ci_vals, na.rm = TRUE)
          range_max <- max(ci_vals, na.rm = TRUE)
          range_str <- sprintf("%.1f-%.1f", range_min, range_max)
          ci_percentiles_str <- get_ci_percentiles(ci_vals)

          num_total <- nrow(field_rows)
          num_data <- sum(!is.na(field_rows$CI))
          pct_clear <- if (num_total > 0) round((num_data / num_total) * 100, 1) else 0
          cloud_cat <- if (num_data == 0) {
            "No image available"
          } else if (pct_clear >= 99.5) {
            "Clear view"
          } else {
            "Partial coverage"
          }

          germination_progress <- NA_character_
          if (!is.na(age_weeks) && age_weeks >= 0 && age_weeks < 17) {
            pct_ci_ge_threshold <- sum(ci_vals >= GERMINATION_CI_THRESHOLD) / length(ci_vals) * 100
            germination_progress <- sprintf("%.1f%%", pct_ci_ge_threshold)
          }

          results_list[[length(results_list) + 1]] <- data.frame(
            Field_id = field_id,
            Mean_CI = round(mean_ci, 2),
            CV = round(cv, 4),
            CI_range = range_str,
            CI_Percentiles = ci_percentiles_str,
            Cloud_pct_clear = pct_clear,
            Cloud_category = cloud_cat,
            Age_week = round(age_weeks, 1),
            Phase = phase,
            Germination_progress = germination_progress,
            stringsAsFactors = FALSE
          )
          seen_field_ids <- c(seen_field_ids, as.character(field_id))
        }

        message(paste(" Tile", tile_idx, "of", length(tile_files), "processed"))
      }
    }, error = function(e) {
      # Best effort: a failing tile is reported and the next one is tried.
      message(paste(" [ERROR] Tile", basename(tile_file), ":", e$message))
    })
  }

  if (length(results_list) == 0) {
    stop(paste("No fields processed successfully for week", week_num))
  }

  stats_df <- dplyr::bind_rows(results_list)
  message(paste(" ✓ Successfully calculated statistics for", nrow(stats_df), "fields"))

  return(stats_df)
}
(!is.null(prev_field_analysis) && nrow(prev_field_analysis) > 0) { + message(paste(" Using previous field_analysis to track nmr_weeks_in_this_phase")) + } + + historical_4weeks <- list() + historical_8weeks <- list() + + if (!is.null(project_dir) && !is.null(reports_dir) && !is.null(current_week)) { + message(" Loading historical field_stats for 4-week and 8-week trends...") + + for (lookback in 1:4) { + target_week <- current_week - lookback + if (target_week < 1) target_week <- target_week + 52 + + rds_filename <- sprintf("%s_field_stats_week%02d.rds", project_dir, target_week) + rds_path <- file.path(reports_dir, "kpis", "field_stats", rds_filename) + + if (file.exists(rds_path)) { + tryCatch({ + stats_data <- readRDS(rds_path) + historical_4weeks[[length(historical_4weeks) + 1]] <- list( + week = target_week, + stats = stats_data + ) + }, error = function(e) { + message(paste(" Warning: Could not load week", target_week, ":", e$message)) + }) + } + } + + for (lookback in 1:8) { + target_week <- current_week - lookback + if (target_week < 1) target_week <- target_week + 52 + + rds_filename <- sprintf("%s_field_stats_week%02d.rds", project_dir, target_week) + rds_path <- file.path(reports_dir, "kpis", "field_stats", rds_filename) + + if (file.exists(rds_path)) { + tryCatch({ + stats_data <- readRDS(rds_path) + historical_8weeks[[length(historical_8weeks) + 1]] <- list( + week = target_week, + stats = stats_data + ) + }, error = function(e) { + # Silently skip + }) + } + } + + if (length(historical_4weeks) > 0) { + message(paste(" Loaded", length(historical_4weeks), "weeks for 4-week trend")) + } + if (length(historical_8weeks) > 0) { + message(paste(" Loaded", length(historical_8weeks), "weeks for 8-week CV trend")) + } + } + + cv_trends_calculated <- 0 + four_week_trends_calculated <- 0 + cv_long_term_calculated <- 0 + + for (i in seq_len(nrow(current_stats))) { + field_id <- current_stats$Field_id[i] + prev_idx <- prev_lookup[field_id] + + if (!is.na(prev_idx) 
&& prev_idx > 0 && prev_idx <= nrow(prev_stats)) { + prev_row <- prev_stats[prev_idx, , drop = FALSE] + + prev_ci <- prev_row$Mean_CI[1] + if (!is.na(prev_ci) && !is.na(current_stats$Mean_CI[i])) { + current_stats$Weekly_ci_change[i] <- + round(current_stats$Mean_CI[i] - prev_ci, 2) + } + + prev_cv <- prev_row$CV[1] + if (!is.na(prev_cv) && !is.na(current_stats$CV[i])) { + current_stats$CV_Trend_Short_Term[i] <- + calculate_cv_trend(current_stats$CV[i], prev_cv) + cv_trends_calculated <- cv_trends_calculated + 1 + } + + if (length(historical_4weeks) > 0) { + ci_values_4week <- numeric() + + for (hist_idx in rev(seq_along(historical_4weeks))) { + hist_data <- historical_4weeks[[hist_idx]]$stats + hist_field <- which(hist_data$Field_id == field_id) + if (length(hist_field) > 0 && !is.na(hist_data$Mean_CI[hist_field[1]])) { + ci_values_4week <- c(ci_values_4week, hist_data$Mean_CI[hist_field[1]]) + } + } + + ci_values_4week <- c(ci_values_4week, current_stats$Mean_CI[i]) + + if (length(ci_values_4week) >= 2) { + current_stats$Four_week_trend[i] <- calculate_four_week_trend(ci_values_4week) + four_week_trends_calculated <- four_week_trends_calculated + 1 + } + } + + if (length(historical_8weeks) > 0) { + cv_values_8week <- numeric() + + for (hist_idx in rev(seq_along(historical_8weeks))) { + hist_data <- historical_8weeks[[hist_idx]]$stats + hist_field <- which(hist_data$Field_id == field_id) + if (length(hist_field) > 0 && !is.na(hist_data$CV[hist_field[1]])) { + cv_values_8week <- c(cv_values_8week, hist_data$CV[hist_field[1]]) + } + } + + cv_values_8week <- c(cv_values_8week, current_stats$CV[i]) + + if (length(cv_values_8week) >= 2) { + slope <- calculate_cv_trend_long_term(cv_values_8week) + current_stats$CV_Trend_Long_Term[i] <- slope + cv_long_term_calculated <- cv_long_term_calculated + 1 + } + } + + if (!is.null(prev_field_analysis) && nrow(prev_field_analysis) > 0) { + prev_analysis_row <- prev_field_analysis %>% + dplyr::filter(Field_id == field_id) + + if 
(nrow(prev_analysis_row) > 0) { + prev_phase_analysis <- prev_analysis_row$Phase[1] + prev_nmr_weeks_analysis <- prev_analysis_row$nmr_weeks_in_this_phase[1] + + if (!is.na(current_stats$Phase[i]) && !is.na(prev_phase_analysis)) { + if (current_stats$Phase[i] == prev_phase_analysis) { + current_stats$nmr_weeks_in_this_phase[i] <- + if (!is.na(prev_nmr_weeks_analysis)) prev_nmr_weeks_analysis + 1L else 2L + } else { + current_stats$nmr_weeks_in_this_phase[i] <- 1L + } + } + } else if (!is.na(current_stats$Phase[i]) && !is.na(prev_row$Phase[1])) { + if (current_stats$Phase[i] == prev_row$Phase[1]) { + current_stats$nmr_weeks_in_this_phase[i] <- 2L + } else { + current_stats$nmr_weeks_in_this_phase[i] <- 1L + } + } + } else { + if (!is.na(current_stats$Phase[i]) && !is.na(prev_row$Phase[1])) { + if (current_stats$Phase[i] == prev_row$Phase[1]) { + current_stats$nmr_weeks_in_this_phase[i] <- 2L + } else { + current_stats$nmr_weeks_in_this_phase[i] <- 1L + } + } + } + } + } + + message(paste(" ✓ Calculated CV_Trend_Short_Term:", cv_trends_calculated, "fields")) + message(paste(" ✓ Calculated Four_week_trend:", four_week_trends_calculated, "fields")) + message(paste(" ✓ Calculated CV_Trend_Long_Term:", cv_long_term_calculated, "fields")) + return(current_stats) +} + +# ============================================================================ +# LOAD OR CALCULATE WEEKLY STATISTICS +# ============================================================================ + +load_or_calculate_weekly_stats <- function(week_num, year, project_dir, field_boundaries_sf, + mosaic_dir, reports_dir, report_date = Sys.Date()) { + + rds_filename <- sprintf("%s_field_stats_week%02d.rds", project_dir, week_num) + rds_path <- file.path(reports_dir, "kpis", "field_stats", rds_filename) + + if (file.exists(rds_path)) { + message(paste("Loading cached statistics from:", basename(rds_path))) + return(readRDS(rds_path)) + } + + message(paste("Cached RDS not found, calculating statistics from tiles 
load_historical_field_data <- function(project_dir, current_week, reports_dir, num_weeks = 4, auto_generate = TRUE, field_boundaries_sf = NULL) {
  #' Load recent weekly field data, optionally regenerating it from mosaics
  #'
  #' First tries the cached "<project>_field_analysis_weekWW.csv" files for
  #' the current week and the `num_weeks - 1` weeks before it. If any week is
  #' missing and `auto_generate` is TRUE, weekly statistics are calculated
  #' from the tile mosaics found under the global `weekly_tile_max` path for
  #' the last 8 calendar weeks and appended to the result.
  #'
  #' @param project_dir Project name used as file-name prefix
  #' @param current_week Week number of the most recent week to load
  #' @param reports_dir Reports root containing "kpis/field_analysis"
  #' @param num_weeks Number of weeks (including the current one) to look for
  #' @param auto_generate Whether to calculate stats from mosaics when weeks
  #'   are missing
  #' @param field_boundaries_sf Field polygons; required for auto-generation
  #' @return List of entries list(week, data) (auto-generated entries also
  #'   carry `year`), without NULL gaps; NULL when nothing could be loaded

  historical_data <- list()
  loaded_weeks <- c()
  missing_weeks <- c()

  # seq_len() keeps the loop empty for num_weeks = 0 (0:(num_weeks - 1)
  # would iterate over 0 and -1).
  for (lookback in seq_len(num_weeks) - 1) {
    target_week <- current_week - lookback
    if (target_week < 1) target_week <- target_week + 52

    csv_filename <- paste0(project_dir, "_field_analysis_week", sprintf("%02d", target_week), ".csv")
    csv_path <- file.path(reports_dir, "kpis", "field_analysis", csv_filename)

    if (file.exists(csv_path)) {
      tryCatch({
        data <- readr::read_csv(csv_path, show_col_types = FALSE)
        # Append instead of assigning at [[lookback + 1]]: positional
        # assignment left NULL placeholders whenever an earlier week was
        # missing, inflating length() and breaking consumers that iterate.
        historical_data[[length(historical_data) + 1]] <- list(
          week = target_week,
          data = data
        )
        loaded_weeks <- c(loaded_weeks, target_week)
      }, error = function(e) {
        message(paste(" Warning: Could not load week", target_week, ":", e$message))
        missing_weeks <<- c(missing_weeks, target_week)
      })
    } else {
      missing_weeks <- c(missing_weeks, target_week)
    }
  }

  if (length(missing_weeks) > 0 && auto_generate) {
    message(paste("⚠ Missing weeks:", paste(missing_weeks, collapse = ", ")))
    message("Scanning for ALL available weekly mosaics and calculating stats...\n")

    if (is.null(field_boundaries_sf)) {
      message(" Error: field_boundaries_sf not provided - cannot auto-generate")
      return(historical_data)
    }

    # `weekly_tile_max` is expected to be defined by the calling script.
    if (!exists("weekly_tile_max")) {
      message(" ✗ weekly_tile_max path not defined")
      return(historical_data)
    }

    check_paths <- c(file.path(weekly_tile_max, "5x5"), weekly_tile_max)
    mosaic_scan_dir <- NA

    for (check_path in check_paths) {
      if (dir.exists(check_path)) {
        tif_files <- list.files(check_path, pattern = "week_.*\\.tif$", full.names = TRUE)
        if (length(tif_files) > 0) {
          mosaic_scan_dir <- check_path
          break
        }
      }
    }

    if (is.na(mosaic_scan_dir)) {
      message(" ✗ No mosaic files found in weekly_tile_max")
      return(historical_data)
    }

    weeks_to_load <- 8
    today <- Sys.Date()
    target_dates <- today - (0:(weeks_to_load - 1)) * 7

    # NOTE(review): "%V" is the ISO week but "%Y" is the calendar year; for
    # dates around New Year these can disagree ("%G" is the ISO year).
    # Left unchanged because it must match how the tile files are named -
    # confirm against the mosaic-writing script.
    expected_weeks <- data.frame(
      date = target_dates,
      week = as.numeric(format(target_dates, "%V")),
      year = as.numeric(format(target_dates, "%Y")),
      stringsAsFactors = FALSE
    )
    expected_weeks <- unique(expected_weeks)

    message(paste(" Expected weeks (last 8 from", format(today, "%Y-%m-%d"), "):"))
    for (i in seq_len(nrow(expected_weeks))) {
      message(paste(" Week", sprintf("%02d", expected_weeks$week[i]), expected_weeks$year[i]))
    }
    message("")

    tif_files <- list.files(mosaic_scan_dir, pattern = "week_([0-9]{2})_([0-9]{4})_[0-9]{2}\\.tif$",
                            full.names = FALSE)

    # Start with typed, empty columns: merge() below errors on a
    # zero-column data.frame, which hid the "no matching files" branch.
    available_weeks <- data.frame(week = numeric(), year = numeric())
    for (filename in tif_files) {
      matches <- regmatches(filename, gregexpr("week_([0-9]{2})_([0-9]{4})", filename))[[1]]
      if (length(matches) > 0) {
        week_year <- strsplit(matches[1], "_")[[1]]
        if (length(week_year) == 3) {
          week_num <- as.numeric(week_year[2])
          year_num <- as.numeric(week_year[3])

          if (week_num %in% expected_weeks$week && year_num %in% expected_weeks$year) {
            available_weeks <- rbind(available_weeks,
                                     data.frame(week = week_num, year = year_num))
          }
        }
      }
    }

    available_weeks <- unique(available_weeks)
    # The merge keeps only (week, year) pairs that are actually expected
    # and attaches the matching date.
    available_weeks <- merge(available_weeks, expected_weeks[, c("week", "year", "date")], by = c("week", "year"))
    available_weeks <- available_weeks[order(available_weeks$date, decreasing = TRUE), ]

    if (nrow(available_weeks) == 0) {
      message(" ✗ No matching mosaic files found")
      message(paste(" Scanned directory:", mosaic_scan_dir))
      return(historical_data)
    }

    message(paste(" Found", nrow(available_weeks), "week(s) with available mosaics:"))

    for (i in seq_len(nrow(available_weeks))) {
      week_to_calc <- available_weeks$week[i]
      year_to_calc <- available_weeks$year[i]
      date_to_calc <- available_weeks$date[i]

      tile_pattern <- sprintf("week_%02d_%d_([0-9]{2})\\.tif", week_to_calc, year_to_calc)
      tile_files <- list.files(mosaic_scan_dir, pattern = tile_pattern, full.names = TRUE)

      if (length(tile_files) == 0) {
        message(paste(" ✗ Week", sprintf("%02d", week_to_calc), year_to_calc, "- no tiles found"))
        next
      }

      message(paste(" ✓ Week", sprintf("%02d", week_to_calc), year_to_calc, "-", length(tile_files), "mosaics"))

      tryCatch({
        week_stats <- load_or_calculate_weekly_stats(
          week_num = week_to_calc,
          year = year_to_calc,
          project_dir = project_dir,
          field_boundaries_sf = field_boundaries_sf,
          mosaic_dir = mosaic_scan_dir,
          reports_dir = reports_dir,
          report_date = date_to_calc
        )

        if (!is.null(week_stats) && nrow(week_stats) > 0) {
          message(paste(" ✓ Calculated stats for", nrow(week_stats), "fields"))

          historical_data[[length(historical_data) + 1]] <- list(
            week = week_to_calc,
            year = year_to_calc,
            data = week_stats
          )
          loaded_weeks <- c(loaded_weeks, paste0(week_to_calc, "_", year_to_calc))
        }
      }, error = function(e) {
        message(paste(" ✗ Error:", e$message))
      })
    }
  }

  if (length(historical_data) == 0) {
    message(paste("Error: No historical field data found and could not auto-generate weeks"))
    return(NULL)
  }

  message(paste("✓ Loaded", length(historical_data), "weeks of historical data:",
                paste(loaded_weeks, collapse = ", ")))

  return(historical_data)
}
extract_field_statistics_from_ci <- function(ci_band, field_boundaries_sf) {
  #' Summarise CI pixel values per field for a single CI raster band
  #'
  #' @param ci_band Raster band passed to terra::extract()
  #' @param field_boundaries_sf Field polygons, one row per field
  #' @return One row per field (in polygon order) with field_idx, mean_ci,
  #'   cv, p10, p90, min_ci, max_ci, pixel_count_valid, pixel_count_total.
  #'   Fields without any non-missing pixel get NA statistics and zero
  #'   pixel counts.

  extract_result <- terra::extract(ci_band, field_boundaries_sf)

  # Build the one-row summary for the polygon at position `idx`.
  summarise_field <- function(idx) {
    rows_for_field <- extract_result[extract_result$ID == idx, , drop = FALSE]
    raw_values <- rows_for_field[, 2]
    valid <- as.numeric(raw_values[!is.na(raw_values)])

    if (length(valid) == 0) {
      return(data.frame(
        field_idx = idx,
        mean_ci = NA_real_,
        cv = NA_real_,
        p10 = NA_real_,
        p90 = NA_real_,
        min_ci = NA_real_,
        max_ci = NA_real_,
        pixel_count_valid = 0,
        pixel_count_total = 0,
        stringsAsFactors = FALSE
      ))
    }

    avg <- mean(valid, na.rm = TRUE)

    data.frame(
      field_idx = idx,
      mean_ci = avg,
      # CV is undefined for a non-positive mean.
      cv = if (avg > 0) sd(valid, na.rm = TRUE) / avg else NA_real_,
      p10 = quantile(valid, probs = CI_PERCENTILE_LOW, na.rm = TRUE)[[1]],
      p90 = quantile(valid, probs = CI_PERCENTILE_HIGH, na.rm = TRUE)[[1]],
      min_ci = min(valid, na.rm = TRUE),
      max_ci = max(valid, na.rm = TRUE),
      pixel_count_valid = length(valid),
      pixel_count_total = nrow(rows_for_field),
      stringsAsFactors = FALSE
    )
  }

  per_field <- lapply(seq_len(nrow(field_boundaries_sf)), summarise_field)

  dplyr::bind_rows(per_field)
}
mosaic_dir, historical_data = NULL, planting_dates = NULL, +# report_date = Sys.Date(), harvest_imminence_data = NULL, +# harvesting_data = NULL) { +# # [Function kept as reference for parallel field analysis] +# # Currently replaced by calculate_field_statistics() for efficiency +# } From 9f312131d7ba24fb2f227b5e78fc791f167166a7 Mon Sep 17 00:00:00 2001 From: Timon Date: Sun, 18 Jan 2026 10:14:08 +0100 Subject: [PATCH 12/15] integration lstm 31 and 80 complete - angata kpi file now dynamic --- r_app/80_calculate_kpis.R | 174 ++++++++++++++++++++++++------- r_app/80_report_building_utils.R | 9 +- r_app/80_weekly_stats_utils.R | 53 ++++------ 3 files changed, 166 insertions(+), 70 deletions(-) diff --git a/r_app/80_calculate_kpis.R b/r_app/80_calculate_kpis.R index 9c5b2b4..410ed40 100644 --- a/r_app/80_calculate_kpis.R +++ b/r_app/80_calculate_kpis.R @@ -16,16 +16,15 @@ # CRITICAL INTEGRATIONS: # # 1. IMMINENT_PROB FROM HARVEST MODEL (MODEL_307) -# [ ] Load script 31 output: {project}_imminent_harvest_week{WW}.csv +# [✓] Load script 31 output: {project}_week_{WW}_{YYYY}.csv # Columns: field, imminent_prob, detected_prob, week, year -# [ ] LEFT JOIN to field_analysis_df by (field, week, year) -# [ ] Replace hardcoded "placeholder data" in Status_trigger calculation -# [ ] Update column to show actual harvest probability (0-1 or 0-100%) +# [✓] LEFT JOIN to field_analysis_df by field +# [✓] Use actual harvest probability data instead of placeholder # -# 2. AGE FROM HARVEST.XLSX (SCRIPTS 22 & 31) -# [ ] Scripts 22 & 31 populate harvest.xlsx with planting_date per field -# [ ] Load harvest.xlsx instead of using UNIFORM_PLANTING_DATE -# [ ] Calculate Age_week = difftime(report_date, planting_date, units="weeks") +# 2. 
AGE FROM HARVEST.XLSX (SCRIPTS 22 & 23) +# [✓] Load harvest.xlsx with planting_date (season_start) +# [✓] Extract planting dates per field +# [✓] Calculate Age_week = difftime(report_date, planting_date, units="weeks") # # COMMAND-LINE USAGE: # Option 1: Rscript 80_calculate_kpis.R 2026-01-14 angata @@ -43,8 +42,9 @@ # ============================================================================ # NEXT INTEGRATIONS (See Linear issues for detailed requirements) # ============================================================================ -# 1. Load imminent_prob from script 31 (harvest_imminent_weekly.csv) -# 2. Load planting_date from harvest.xlsx for field-specific age calculation +# 1. [✓] Load imminent_prob from script 31 (week_WW_YYYY.csv) +# 2. [✓] Load planting_date from harvest.xlsx for field-specific age calculation +# 3. [ ] Improve Status_trigger logic to use actual imminent_prob values # ============================================================================ # ============================================================================ @@ -86,8 +86,7 @@ CI_PERCENTILE_HIGH <- 0.90 GERMINATION_CI_THRESHOLD <- 2.0 # PLANTING DATE & AGE CONFIGURATION -USE_UNIFORM_AGE <- TRUE -UNIFORM_PLANTING_DATE <- as.Date("2026-01-01") +# Load from harvest.xlsx (scripts 22 & 23) - no fallback to uniform dates # HISTORICAL DATA LOOKBACK WEEKS_FOR_FOUR_WEEK_TREND <- 4 @@ -307,6 +306,26 @@ main <- function() { auto_generate = allow_auto_gen, field_boundaries_sf = field_boundaries_sf) + # Load harvest.xlsx for planting dates (season_start) + message("\nLoading harvest data from harvest.xlsx for planting dates...") + harvest_file_path <- file.path(data_dir, "harvest.xlsx") + + harvesting_data <- tryCatch({ + if (file.exists(harvest_file_path)) { + harvest_raw <- readxl::read_excel(harvest_file_path) + harvest_raw$season_start <- as.Date(harvest_raw$season_start) + harvest_raw$season_end <- as.Date(harvest_raw$season_end) + message(paste(" ✓ Loaded harvest data:", 
nrow(harvest_raw), "rows")) + harvest_raw + } else { + message(paste(" WARNING: harvest.xlsx not found at", harvest_file_path)) + NULL + } + }, error = function(e) { + message(paste(" ERROR loading harvest.xlsx:", e$message)) + NULL + }) + planting_dates <- extract_planting_dates(harvesting_data, field_boundaries_sf) # Validate planting_dates @@ -374,6 +393,28 @@ main <- function() { message(paste(" ✓ Added Weekly_ci_change, CV_Trend_Short_Term, Four_week_trend, CV_Trend_Long_Term, nmr_weeks_in_this_phase")) + # Load weekly harvest probabilities from script 31 (if available) + message("\n4. Loading harvest probabilities from script 31...") + harvest_prob_file <- file.path(reports_dir, "kpis", "field_stats", + sprintf("%s_harvest_imminent_week_%02d_%d.csv", project_dir, current_week, year)) + message(paste(" Looking for:", harvest_prob_file)) + + imminent_prob_data <- tryCatch({ + if (file.exists(harvest_prob_file)) { + prob_df <- readr::read_csv(harvest_prob_file, show_col_types = FALSE) + message(paste(" ✓ Loaded harvest probabilities for", nrow(prob_df), "fields")) + prob_df %>% + select(field, imminent_prob, detected_prob) %>% + rename(Field_id = field, Imminent_prob_actual = imminent_prob, Detected_prob = detected_prob) + } else { + message(paste(" INFO: Harvest probabilities not available (script 31 not run)")) + NULL + } + }, error = function(e) { + message(paste(" WARNING: Could not load harvest probabilities:", e$message)) + NULL + }) + # ============================================================================ # Build final output dataframe with all 21 columns # ============================================================================ @@ -427,32 +468,91 @@ main <- function() { }, # Columns 5-6: Already in current_stats (Mean_CI, Weekly_ci_change) # Column 7: Four_week_trend (from current_stats) - # Column 8: Last_harvest_or_planting_date (dummy for now) - Last_harvest_or_planting_date = UNIFORM_PLANTING_DATE, - # Columns 9-10: Already in 
current_stats (Age_week, Phase) - # Column 11: nmr_weeks_in_this_phase (already calculated) - # Column 12: Germination_progress (already calculated) - # Column 13: Imminent_prob (placeholder) - Imminent_prob = "placeholder data", - # Column 14: Status_trigger (need to add) + # Column 8: Last_harvest_or_planting_date (from harvest.xlsx - season_start) + Last_harvest_or_planting_date = { + planting_dates$planting_date[match(Field_id, planting_dates$field_id)] + }, + # Column 9: Age_week (calculated from report date and planting date) + Age_week = { + sapply(seq_len(nrow(current_stats)), function(idx) { + planting_dt <- Last_harvest_or_planting_date[idx] + if (is.na(planting_dt)) { + return(NA_real_) + } + round(as.numeric(difftime(end_date, planting_dt, units = "weeks")), 1) + }) + }, + # Column 10: Phase (recalculate based on updated Age_week) + Phase = { + sapply(Age_week, function(age) { + if (is.na(age)) return(NA_character_) + if (age >= 0 & age < 4) return("Germination") + if (age >= 4 & age < 17) return("Tillering") + if (age >= 17 & age < 39) return("Grand Growth") + if (age >= 39) return("Maturation") + NA_character_ + }) + }, + # Column 11: nmr_weeks_in_this_phase (already in current_stats from calculate_kpi_trends) + # Column 12: Germination_progress (calculated here from CI values) + Germination_progress = { + sapply(seq_len(nrow(current_stats)), function(idx) { + age_w <- Age_week[idx] + mean_ci_val <- Mean_CI[idx] + + # Only relevant for germination phase (0-4 weeks) + if (is.na(age_w) || age_w < 0 || age_w >= 4) { + return(NA_character_) + } + + # Estimate % of field with CI >= germination threshold + # Based on mean CI, estimate germination percentage + if (mean_ci_val >= 0.4) { + return(">80%") + } else if (mean_ci_val >= 0.25) { + return("50-80%") + } else if (mean_ci_val >= 0.1) { + return("20-50%") + } else { + return("<20%") + } + }) + }, + # Column 13: Imminent_prob (from script 31 or NA if not available) + Imminent_prob = { + if 
(!is.null(imminent_prob_data)) { + imminent_prob_data$Imminent_prob_actual[match(Field_id, imminent_prob_data$Field_id)] + } else { + rep(NA_real_, nrow(current_stats)) + } + }, + # Column 14: Status_trigger (based on harvest probability + growth status) Status_trigger = { triggers <- sapply(seq_len(nrow(current_stats)), function(idx) { - field_id <- current_stats$Field_id[idx] - field_idx <- which(field_boundaries_sf$field == field_id)[1] - if (is.na(field_idx)) return(NA_character_) + imminent_prob <- Imminent_prob[idx] + age_w <- Age_week[idx] + ci_change <- Weekly_ci_change[idx] + phase <- Phase[idx] - # Reconstruct CI values from Mean_CI for status trigger logic - # For now, use simplified approach - age_w <- current_stats$Age_week[idx] - ci_change <- current_stats$Weekly_ci_change[idx] + # Priority 1: Harvest imminent (high probability) + if (!is.na(imminent_prob) && imminent_prob > 0.5) { + return("harvest_imminent") + } - # Using mean CI as proxy (could be improved with pixel distribution) - ci_vals <- rep(current_stats$Mean_CI[idx], 100) - get_status_trigger(ci_vals, ci_change, age_w) + # Priority 2: Age-based triggers + if (!is.na(age_w)) { + if (age_w >= 45) return("harvest_ready") + if (age_w >= 39) return("maturation_progressing") + if (age_w >= 4 & age_w < 39) return("growth_on_track") + if (age_w < 4) return("germination_started") + } + + # Fallback + NA_character_ }) triggers }, - # Columns 15-16: Already in current_stats (CI_range, CI_Percentiles) + # Columns 15-16: CI-based columns already in current_stats (CI_range, CI_Percentiles) # Column 17: Already in current_stats (CV) # Column 18: Already in current_stats (CV_Trend_Short_Term) # Column 19: CV_Trend_Long_Term (from current_stats - raw slope value) @@ -464,11 +564,12 @@ main <- function() { .keep = "all" # Keep all existing columns ) %>% select( - Field_id, Farm_Section, Field_name, Acreage, Mean_CI, Weekly_ci_change, - Four_week_trend, Last_harvest_or_planting_date, Age_week, Phase, - 
nmr_weeks_in_this_phase, Germination_progress, Imminent_prob, Status_trigger, - CI_range, CI_Percentiles, CV, CV_Trend_Short_Term, CV_Trend_Long_Term, CV_Trend_Long_Term_Category, - Cloud_pct_clear, Cloud_category + all_of(c("Field_id", "Farm_Section", "Field_name", "Acreage", "Mean_CI", "Weekly_ci_change", + "Four_week_trend", "Last_harvest_or_planting_date", "Age_week", "Phase", + "nmr_weeks_in_this_phase", "Germination_progress", "Imminent_prob", "Status_trigger", + "CV", "CV_Trend_Short_Term", "CV_Trend_Long_Term", "CV_Trend_Long_Term_Category", + "Cloud_pct_clear", "Cloud_category")), + any_of(c("CI_range", "CI_Percentiles")) ) message(paste("✓ Built final output with", nrow(field_analysis_df), "fields and 21 columns")) @@ -480,6 +581,7 @@ main <- function() { summary_statistics_df, project_dir, current_week, + year, reports_dir ) diff --git a/r_app/80_report_building_utils.R b/r_app/80_report_building_utils.R index 35f20a7..7b7f4e9 100644 --- a/r_app/80_report_building_utils.R +++ b/r_app/80_report_building_utils.R @@ -106,7 +106,7 @@ generate_field_analysis_summary <- function(field_df) { # EXPORT FUNCTIONS # ============================================================================ -export_field_analysis_excel <- function(field_df, summary_df, project_dir, current_week, reports_dir) { +export_field_analysis_excel <- function(field_df, summary_df, project_dir, current_week, year, reports_dir) { message("Exporting per-field analysis to Excel, CSV, and RDS...") field_df_rounded <- field_df %>% @@ -120,7 +120,7 @@ export_field_analysis_excel <- function(field_df, summary_df, project_dir, curre dir.create(output_subdir, recursive = TRUE) } - excel_filename <- paste0(project_dir, "_field_analysis_week", sprintf("%02d", current_week), ".xlsx") + excel_filename <- paste0(project_dir, "_field_analysis_week", sprintf("%02d_%d", current_week, year), ".xlsx") excel_path <- file.path(output_subdir, excel_filename) excel_path <- normalizePath(excel_path, winslash = 
"\\", mustWork = FALSE) @@ -137,18 +137,19 @@ export_field_analysis_excel <- function(field_df, summary_df, project_dir, curre field_analysis_summary = summary_df_rounded, metadata = list( current_week = current_week, + year = year, project = project_dir, created_at = Sys.time() ) ) - rds_filename <- paste0(project_dir, "_kpi_summary_tables_week", sprintf("%02d", current_week), ".rds") + rds_filename <- paste0(project_dir, "_kpi_summary_tables_week", sprintf("%02d_%d", current_week, year), ".rds") rds_path <- file.path(reports_dir, "kpis", rds_filename) saveRDS(kpi_data, rds_path) message(paste("✓ Field analysis RDS exported to:", rds_path)) - csv_filename <- paste0(project_dir, "_field_analysis_week", sprintf("%02d", current_week), ".csv") + csv_filename <- paste0(project_dir, "_field_analysis_week", sprintf("%02d_%d", current_week, year), ".csv") csv_path <- file.path(output_subdir, csv_filename) write_csv(field_df_rounded, csv_path) message(paste("✓ Field analysis CSV exported to:", csv_path)) diff --git a/r_app/80_weekly_stats_utils.R b/r_app/80_weekly_stats_utils.R index 3a5f1d2..fb5dc8b 100644 --- a/r_app/80_weekly_stats_utils.R +++ b/r_app/80_weekly_stats_utils.R @@ -335,21 +335,18 @@ get_status_trigger <- function(ci_values, ci_change, age_weeks) { } extract_planting_dates <- function(harvesting_data, field_boundaries_sf = NULL) { - if (USE_UNIFORM_AGE) { - message(paste("Using uniform planting date for all fields:", UNIFORM_PLANTING_DATE)) + # Extract planting dates from harvest.xlsx (season_start column) + # Returns: data.frame with columns (field_id, planting_date) + + if (is.null(harvesting_data) || nrow(harvesting_data) == 0) { + message("Warning: No harvesting data available - planting dates will be NA.") if (!is.null(field_boundaries_sf)) { return(data.frame( field_id = field_boundaries_sf$field, - date = rep(UNIFORM_PLANTING_DATE, nrow(field_boundaries_sf)), + planting_date = rep(as.Date(NA), nrow(field_boundaries_sf)), stringsAsFactors = FALSE )) - 
} else { - return(NULL) } - } - - if (is.null(harvesting_data) || nrow(harvesting_data) == 0) { - message("Warning: No harvesting data available.") return(NULL) } @@ -362,7 +359,7 @@ extract_planting_dates <- function(harvesting_data, field_boundaries_sf = NULL) filter(!is.na(planting_date)) %>% as.data.frame() - message(paste("Extracted planting dates for", nrow(planting_dates), "fields")) + message(paste("Extracted planting dates for", nrow(planting_dates), "fields from harvest.xlsx")) return(planting_dates) }, error = function(e) { message(paste("Error extracting planting dates:", e$message)) @@ -431,17 +428,8 @@ calculate_field_statistics <- function(field_boundaries_sf, week_num, year, else if (pct_clear >= 99.5) "Clear view" else "Partial coverage" - age_weeks <- NA_real_ - if (USE_UNIFORM_AGE) { - age_weeks <- as.numeric(difftime(report_date, UNIFORM_PLANTING_DATE, units = "weeks")) - } - phase <- get_phase_by_age(age_weeks) - - germination_progress <- NA_character_ - if (!is.na(age_weeks) && age_weeks >= 0 && age_weeks < 17) { - pct_ci_ge_threshold <- sum(ci_vals >= GERMINATION_CI_THRESHOLD) / length(ci_vals) * 100 - germination_progress <- sprintf("%.1f%%", pct_ci_ge_threshold) - } + # Age_week and Phase are now calculated in main script using actual planting dates + # Germination_progress is calculated in main script after Age_week is known existing_idx <- which(sapply(results_list, function(x) x$Field_id) == field_id) @@ -457,9 +445,6 @@ calculate_field_statistics <- function(field_boundaries_sf, week_num, year, CI_Percentiles = ci_percentiles_str, Cloud_pct_clear = pct_clear, Cloud_category = cloud_cat, - Age_week = round(age_weeks, 1), - Phase = phase, - Germination_progress = germination_progress, stringsAsFactors = FALSE ) @@ -536,9 +521,13 @@ calculate_kpi_trends <- function(current_stats, prev_stats = NULL, for (lookback in 1:4) { target_week <- current_week - lookback - if (target_week < 1) target_week <- target_week + 52 + target_year <- year + 
if (target_week < 1) { + target_week <- target_week + 52 + target_year <- target_year - 1 + } - rds_filename <- sprintf("%s_field_stats_week%02d.rds", project_dir, target_week) + rds_filename <- sprintf("%s_field_stats_week%02d_%d.rds", project_dir, target_week, target_year) rds_path <- file.path(reports_dir, "kpis", "field_stats", rds_filename) if (file.exists(rds_path)) { @@ -556,9 +545,13 @@ calculate_kpi_trends <- function(current_stats, prev_stats = NULL, for (lookback in 1:8) { target_week <- current_week - lookback - if (target_week < 1) target_week <- target_week + 52 + target_year <- year + if (target_week < 1) { + target_week <- target_week + 52 + target_year <- target_year - 1 + } - rds_filename <- sprintf("%s_field_stats_week%02d.rds", project_dir, target_week) + rds_filename <- sprintf("%s_field_stats_week%02d_%d.rds", project_dir, target_week, target_year) rds_path <- file.path(reports_dir, "kpis", "field_stats", rds_filename) if (file.exists(rds_path)) { @@ -693,7 +686,7 @@ calculate_kpi_trends <- function(current_stats, prev_stats = NULL, load_or_calculate_weekly_stats <- function(week_num, year, project_dir, field_boundaries_sf, mosaic_dir, reports_dir, report_date = Sys.Date()) { - rds_filename <- sprintf("%s_field_stats_week%02d.rds", project_dir, week_num) + rds_filename <- sprintf("%s_field_stats_week%02d_%d.rds", project_dir, week_num, year) rds_path <- file.path(reports_dir, "kpis", "field_stats", rds_filename) if (file.exists(rds_path)) { @@ -713,7 +706,7 @@ load_or_calculate_weekly_stats <- function(week_num, year, project_dir, field_bo saveRDS(stats_df, rds_path) message(paste("Saved weekly statistics RDS:", basename(rds_path))) - csv_filename <- sprintf("%s_field_stats_week%02d.csv", project_dir, week_num) + csv_filename <- sprintf("%s_field_stats_week%02d_%d.csv", project_dir, week_num, year) csv_path <- file.path(output_dir, csv_filename) readr::write_csv(stats_df, csv_path) message(paste("Saved weekly statistics CSV:", 
basename(csv_path))) From c11b10a73f2bb556df350e2b72c377b1d1191bdb Mon Sep 17 00:00:00 2001 From: Timon Date: Sun, 18 Jan 2026 11:25:31 +0100 Subject: [PATCH 13/15] working on visualisation, not working too well... --- python_app/31_harvest_imminent_weekly.py | 106 ++++--- python_app/batch_rgb_validation_top_fields.py | 288 +++++++++++++++++ python_app/debug_all_tiles_for_date.py | 107 +++++++ python_app/debug_field_mask.py | 102 ++++++ python_app/debug_tiff_inspect.py | 47 +++ .../batch_plot_fields_rgb.py | 299 ++++++++++++++++++ .../experiments => }/rgb_visualization.py | 256 ++++++++++----- r_app/10_create_master_grid_and_split_tiffs.R | 146 ++++++--- r_app/40_mosaic_creation.R | 2 - 9 files changed, 1182 insertions(+), 171 deletions(-) create mode 100644 python_app/batch_rgb_validation_top_fields.py create mode 100644 python_app/debug_all_tiles_for_date.py create mode 100644 python_app/debug_field_mask.py create mode 100644 python_app/debug_tiff_inspect.py create mode 100644 python_app/harvest_detection_experiments/experiment_framework/04_production_export/batch_plot_fields_rgb.py rename python_app/{harvest_detection_experiments/experiment_framework/experiments => }/rgb_visualization.py (68%) diff --git a/python_app/31_harvest_imminent_weekly.py b/python_app/31_harvest_imminent_weekly.py index 09cf3de..8722eda 100644 --- a/python_app/31_harvest_imminent_weekly.py +++ b/python_app/31_harvest_imminent_weekly.py @@ -7,18 +7,19 @@ is approaching harvest. 
Use this for operational decision-making and real-time a RUN FREQUENCY: Weekly (or daily if required) INPUT: - - ci_data_for_python.csv (recent CI data from 02b_convert_rds_to_csv.R) + - harvest.xlsx (baseline from scripts 22+23 - contains last harvest date per field) + Location: laravel_app/storage/app/{project}/Data/harvest.xlsx + - ci_data_for_python.csv (complete CI data from R script) Location: laravel_app/storage/app/{project}/Data/extracted_ci/ci_data_for_python/ci_data_for_python.csv - - harvest_production_export.xlsx (baseline from script 01 - optional, for reference) OUTPUT: - - harvest_imminent_weekly.csv (weekly probabilities: field, imminent_prob, detected_prob, week, year) + - reports/kpis/field_stats/{project}_harvest_imminent_week_{WW}_{YYYY}.csv (weekly probabilities: field, imminent_prob, detected_prob, week, year) Workflow: -1. Load harvest_production_export.xlsx (baseline dates - optional, for context) -2. Load ci_data_for_python.csv (recent CI data) -3. For each field, extract last 300 days of history -4. Run Model 307 inference on full sequence (last timestep probabilities) -5. Export harvest_imminent_weekly.csv with probabilities +1. Load harvest.xlsx to find last harvest date (season_end) per field +2. Load ci_data_for_python.csv (complete CI data) +3. For each field, extract all CI data AFTER last harvest (complete current season) +4. Run Model 307 inference on full season sequence (last timestep probabilities) +5. 
Export week_WW_YYYY.csv with probabilities Output Columns: - field: Field ID @@ -61,33 +62,34 @@ from harvest_date_pred_utils import ( def load_harvest_dates(harvest_file): - """Load latest harvest end dates from Excel file (from harvest_production_export.xlsx).""" - print("[1/5] Loading harvest dates...") + """Load last harvest end dates from harvest.xlsx (output from scripts 22+23).""" + print("[1/5] Loading harvest data for season boundaries...") if not Path(harvest_file).exists(): print(f" ERROR: {harvest_file} not found") - print(" Using 180-day lookback as default") + print(f" harvest.xlsx is required to determine current season boundaries") return None try: harvest_df = pd.read_excel(harvest_file) - print(f" Loaded {len(harvest_df)} field-season records") + print(f" Loaded {len(harvest_df)} season records") - # Use season_end_date column (output from harvest prediction script) - harvest_df['season_end_date'] = pd.to_datetime(harvest_df['season_end_date']) + # season_end contains the last harvest date for each season + harvest_df['season_end'] = pd.to_datetime(harvest_df['season_end']) + harvest_df['field'] = harvest_df['field'].astype(str).str.strip() - # Group by field and get the latest season_end_date + # Group by field and get the LATEST season_end_date (most recent harvest) + # This marks the start of the current season harvest_dates = {} for field_id, group in harvest_df.groupby('field'): - latest_end = group['season_end_date'].max() - harvest_dates[str(field_id).strip()] = latest_end + latest_harvest = group['season_end'].max() + harvest_dates[field_id] = latest_harvest print(f" Successfully mapped {len(harvest_dates)} fields") - print(f" Harvest end dates range: {min(harvest_dates.values()).date()} to {max(harvest_dates.values()).date()}") + print(f" Last harvest dates range: {min(harvest_dates.values()).date()} to {max(harvest_dates.values()).date()}") return harvest_dates except Exception as e: - print(f" ERROR loading harvest file: {e}") - 
print(f" Using 180-day lookback instead") + print(f" ERROR loading harvest.xlsx: {e}") return None @@ -212,26 +214,37 @@ def main(): # Get project name from command line or use default project_name = sys.argv[1] if len(sys.argv) > 1 else "angata" - # Construct paths - base_storage = Path("../laravel_app/storage/app") / project_name / "Data" + # Construct paths - work from either python_app/ or root smartcane/ directory + # Try root first (laravel_app/...), then fall back to ../ (running from python_app/) + if Path("laravel_app/storage/app").exists(): + base_storage = Path("laravel_app/storage/app") / project_name / "Data" + else: + base_storage = Path("../laravel_app/storage/app") / project_name / "Data" + ci_data_dir = base_storage / "extracted_ci" / "ci_data_for_python" CI_DATA_FILE = ci_data_dir / "ci_data_for_python.csv" - harvest_data_dir = base_storage / "HarvestData" - BASELINE_FILE = harvest_data_dir / "harvest_production_export.xlsx" - OUTPUT_CSV = harvest_data_dir / "harvest_imminent_weekly.csv" - harvest_data_dir.mkdir(parents=True, exist_ok=True) # Create if doesn't exist + HARVEST_FILE = base_storage / "harvest.xlsx" # Output from scripts 22+23 + + # Determine week and year from current date for timestamped export + today = datetime.now() + week_num = int(today.strftime('%V')) + year_num = int(today.strftime('%Y')) + + # Output directory: reports/kpis/field_stats/ + reports_dir = base_storage.parent / "reports" / "kpis" / "field_stats" + reports_dir.mkdir(parents=True, exist_ok=True) + OUTPUT_CSV = reports_dir / f"{project_name}_harvest_imminent_week_{week_num:02d}_{year_num}.csv" print("="*80) print(f"HARVEST IMMINENT PROBABILITY - WEEKLY MONITORING ({project_name})") print("="*80) - # [1] Load harvest dates (optional - for projects with predictions) - harvest_dates = None - if BASELINE_FILE.exists(): - harvest_dates = load_harvest_dates(BASELINE_FILE) - else: - print("[1/5] Loading harvest dates...") - print(f" INFO: {BASELINE_FILE} not found 
(optional for weekly monitoring)") + # [1] Load harvest dates (required to determine season boundaries) + harvest_dates = load_harvest_dates(HARVEST_FILE) + if harvest_dates is None or len(harvest_dates) == 0: + print(f"ERROR: Cannot run without harvest.xlsx - required to determine current season boundaries") + print(f" Please run scripts 22 (baseline prediction) and 23 (format conversion) first") + return # [2] Load CI data print(f"\n[2/5] Loading CI data...") @@ -271,6 +284,9 @@ def main(): count = 0 for field_id in ci_data['field'].unique(): + # Convert field_id to string for consistency + field_id_str = str(field_id).strip() + # Get metadata meta = field_meta[field_meta['field'] == field_id] if len(meta) == 0: @@ -279,18 +295,21 @@ def main(): sub_field = meta['sub_field'].iloc[0] latest_date = meta['latest_date'].iloc[0] - # Use recent CI history (last 300 days from latest available data) + # Get last harvest date for this field (start of current season) + last_harvest = harvest_dates.get(field_id_str) + if last_harvest is None: + continue + + # Extract all CI data AFTER last harvest (complete current season) field_data = ci_data[ci_data['field'] == field_id].copy() field_data = field_data.sort_values('Date') + field_data = field_data[field_data['Date'] > last_harvest] # After last harvest - # Keep last 300 days of history for inference - if len(field_data) > 300: - field_data = field_data.iloc[-300:] - + # Need at least 30 days of data since planting if len(field_data) < 30: continue - # Run inference on recent history to predict next 28 days + # Run inference on full current season to predict next 28 days imminent_prob, detected_prob = run_inference_on_season( field_data, model, config, scalers, device, ci_column ) @@ -338,10 +357,11 @@ def main(): print(f" WARNING: No results exported - check CI data availability") print(f"\nStorage structure:") - print(f" Input CI: laravel_app/storage/app/{project_name}/Data/extracted_ci/ci_data_for_python/") - print(f" 
Input baseline: laravel_app/storage/app/{project_name}/Data/HarvestData/harvest_production_export.xlsx") - print(f" Output: laravel_app/storage/app/{project_name}/Data/HarvestData/") - print(f"\nReady to load into 09b field analysis report") + print(f" Input harvest: laravel_app/storage/app/{project_name}/Data/harvest.xlsx") + print(f" Input CI: laravel_app/storage/app/{project_name}/Data/extracted_ci/ci_data_for_python/") + print(f" Output: laravel_app/storage/app/{project_name}/reports/kpis/field_stats/") + print(f" Filename: {project_name}_harvest_imminent_week_{week_num:02d}_{year_num}.csv") + print(f"\nReady to load into 80_calculate_kpis.R") if __name__ == "__main__": diff --git a/python_app/batch_rgb_validation_top_fields.py b/python_app/batch_rgb_validation_top_fields.py new file mode 100644 index 0000000..cccb9ae --- /dev/null +++ b/python_app/batch_rgb_validation_top_fields.py @@ -0,0 +1,288 @@ +#!/usr/bin/env python +""" +Batch RGB Validation for Top 50 Largest Fields + +Generates 5x3 RGB temporal grids for the latest complete harvest season of the 50 largest fields. +Uses actual season_end dates from harvest.xlsx for visual validation of field conditions at harvest. 
+ +Configuration: +- GeoJSON: pivot.geojson (defines field boundaries and sizes) +- Harvest data: harvest.xlsx (season_end dates for completed harvests) +- CI data: ci_data_for_python.csv +- Output: RGB directory with field_name_YYYYMMDD_harvest_rgb.png + +Usage: + python batch_rgb_validation_top_fields.py + +Output: + - Saves 5x3 RGB grids to: laravel_app/storage/app/angata/RGB/ + - Filenames: field___harvest_rgb.png + - Each grid shows 15 images at 7-day intervals around the season_end date +""" + +import json +import numpy as np +import pandas as pd +from pathlib import Path +from datetime import datetime, timedelta +import sys + +# Add parent directory to path for imports +sys.path.insert(0, str(Path(__file__).parent)) + +from rgb_visualization import generate_rgb_grids + + +def load_geojson_and_calculate_areas(geojson_path): + """ + Load GeoJSON and calculate area for each field. + + Returns: + pd.DataFrame: Columns [field, field_name, area_m2] sorted by area descending + """ + geojson_path = Path(geojson_path) + + if not geojson_path.exists(): + print(f"✗ GeoJSON not found: {geojson_path}") + return None + + print(f"Loading GeoJSON: {geojson_path}") + + with open(geojson_path) as f: + geojson_data = json.load(f) + + fields = [] + + for feature in geojson_data.get('features', []): + props = feature.get('properties', {}) + field_id = str(props.get('field', '')) + field_name = props.get('name', f"field_{field_id}") + + geometry = feature.get('geometry', {}) + geom_type = geometry.get('type', '') + coordinates = geometry.get('coordinates', []) + + # Simple area calculation using Shoelace formula + area_m2 = 0 + if geom_type == 'Polygon' and coordinates: + coords = coordinates[0] # Exterior ring + area_m2 = calculate_polygon_area(coords) + elif geom_type == 'MultiPolygon' and coordinates: + for poly_coords in coordinates: + area_m2 += calculate_polygon_area(poly_coords[0]) + + if area_m2 > 0: + fields.append({ + 'field': field_id, + 'field_name': field_name, + 
'area_m2': area_m2, + 'area_hectares': area_m2 / 10000 + }) + + df = pd.DataFrame(fields) + df = df.sort_values('area_m2', ascending=False) + + print(f" ✓ Loaded {len(df)} fields") + print(f" Top 10 largest fields (hectares):") + for i, row in df.head(10).iterrows(): + print(f" {row['field_name']:30s} ({row['field']:>6s}): {row['area_hectares']:>8.2f} ha") + + return df + + +def calculate_polygon_area(coords): + """ + Calculate area of polygon using Shoelace formula (in m²). + Assumes coordinates are in lat/lon (roughly converts to meters). + """ + if len(coords) < 3: + return 0 + + # Rough conversion: at equator, 1 degree ≈ 111 km + # For lat/lon coordinates, use average latitude + lats = [c[1] for c in coords] + avg_lat = np.mean(lats) + lat_m_per_deg = 111000 + lon_m_per_deg = 111000 * np.cos(np.radians(avg_lat)) + + # Convert to meters + coords_m = [] + for lon, lat in coords: + x = (lon - coords[0][0]) * lon_m_per_deg + y = (lat - coords[0][1]) * lat_m_per_deg + coords_m.append((x, y)) + + # Shoelace formula + area = 0 + for i in range(len(coords_m)): + j = (i + 1) % len(coords_m) + area += coords_m[i][0] * coords_m[j][1] + area -= coords_m[j][0] * coords_m[i][1] + + return abs(area) / 2 + + +def load_harvest_dates_from_xlsx(harvest_xlsx_path, top_50_fields_df): + """ + Load harvest data from Excel file and get latest completed season for each field. + + Returns season_end date for each field (latest complete season where season_end is not null). 
+ + Args: + harvest_xlsx_path (Path): Path to harvest.xlsx + top_50_fields_df (pd.DataFrame): DataFrame with 'field' column for filtering + + Returns: + dict: {field_id: {'field_name': str, 'harvest_date': pd.Timestamp}} + """ + harvest_xlsx_path = Path(harvest_xlsx_path) + + if not harvest_xlsx_path.exists(): + print(f"✗ Harvest Excel file not found: {harvest_xlsx_path}") + return {} + + print(f"Loading harvest data: {harvest_xlsx_path}") + + try: + harvest_df = pd.read_excel(harvest_xlsx_path) + + # Ensure date columns are datetime + if 'season_end' in harvest_df.columns: + harvest_df['season_end'] = pd.to_datetime(harvest_df['season_end'], errors='coerce') + + # Filter to top 50 fields and get only rows with season_end filled in + top_50_field_ids = set(top_50_fields_df['field'].astype(str).str.strip()) + harvest_df['field'] = harvest_df['field'].astype(str).str.strip() + harvest_df = harvest_df[harvest_df['field'].isin(top_50_field_ids)] + harvest_df = harvest_df[harvest_df['season_end'].notna()] + + # Group by field and get the LATEST (most recent) season_end + latest_harvests = {} + + for field_id in top_50_field_ids: + field_records = harvest_df[harvest_df['field'] == field_id] + + if len(field_records) > 0: + # Get row with latest season_end + latest_idx = field_records['season_end'].idxmax() + latest_row = field_records.loc[latest_idx] + + # Get field name from top_50_fields_df + field_info = top_50_fields_df[top_50_fields_df['field'] == field_id] + if len(field_info) > 0: + field_name = field_info.iloc[0]['field_name'] + else: + field_name = f"field_{field_id}" + + latest_harvests[field_id] = { + 'field_name': field_name, + 'harvest_date': latest_row['season_end'] + } + + print(f" ✓ Loaded latest complete seasons for {len(latest_harvests)} fields") + + return latest_harvests + + except Exception as e: + print(f"✗ Error loading harvest data: {e}") + return {} + + +def main(): + print("="*90) + print("BATCH RGB VALIDATION - TOP 50 LARGEST FIELDS") + 
print("Visual inspection of latest harvest dates from harvest.xlsx using RGB imagery") + print("="*90) + + # Configuration + geojson_path = Path("laravel_app/storage/app/angata/Data/pivot.geojson") + harvest_xlsx = Path("laravel_app/storage/app/angata/Data/harvest.xlsx") + output_dir = Path("laravel_app/storage/app/angata/RGB") + tiff_dir = Path("laravel_app/storage/app/angata/merged_final_tif/5x5") + + # Verify paths + if not geojson_path.exists(): + print(f"✗ GeoJSON not found: {geojson_path}") + return + if not harvest_xlsx.exists(): + print(f"✗ Harvest Excel not found: {harvest_xlsx}") + return + if not tiff_dir.exists(): + print(f"✗ TIFF directory not found: {tiff_dir}") + return + + output_dir.mkdir(parents=True, exist_ok=True) + + # Step 1: Load GeoJSON and get top 50 largest fields + print("\n[1/4] Loading GeoJSON and identifying top 50 largest fields...") + fields_df = load_geojson_and_calculate_areas(geojson_path) + if fields_df is None: + return + + top_50_fields = fields_df.head(50) + print(f" ✓ Selected {len(top_50_fields)} largest fields for processing") + + # Step 2: Load harvest dates from Excel + print("\n[2/4] Loading harvest dates from Excel (latest complete seasons)...") + harvest_dates = load_harvest_dates_from_xlsx(harvest_xlsx, top_50_fields) + + if len(harvest_dates) == 0: + print("✗ No harvest dates found in Excel file") + return + + print(f" ✓ Found {len(harvest_dates)} fields with completed seasons") + for field_id, info in list(harvest_dates.items())[:5]: + print(f" - {info['field_name']:30s}: {info['harvest_date'].strftime('%Y-%m-%d')}") + if len(harvest_dates) > 5: + print(f" ... 
and {len(harvest_dates) - 5} more") + + # Step 3: Generate RGB grids for each field + print("\n[3/4] Generating RGB validation grids...") + rgb_count = 0 + + for idx, (field_id, harvest_info) in enumerate(harvest_dates.items(), 1): + field_name = harvest_info['field_name'] + harvest_date = harvest_info['harvest_date'] + + try: + # Run RGB visualization (harvest dates only, no registered/predicted distinction) + results = generate_rgb_grids( + field_data=None, # Not needed - just for function compatibility + field_id=field_id, + registered_harvest_dates=[], # Empty - using harvest.xlsx instead + predicted_harvest_dates=[ + { + 'harvest_date': harvest_date, + 'model_name': 'harvest_xlsx' + } + ], + output_dir=str(output_dir), # All PNGs in same folder + tiff_dir=str(tiff_dir), + geojson_path=str(geojson_path) + ) + + if results['predicted']: + rgb_count += 1 + print(f" [{idx:2d}/{len(harvest_dates)}] {field_name}: ✓ {harvest_date.strftime('%Y-%m-%d')}") + else: + print(f" [{idx:2d}/{len(harvest_dates)}] {field_name}: ⚠ No RGB grid (no imagery available)") + + except Exception as e: + print(f" [{idx:2d}/{len(harvest_dates)}] {field_name}: ✗ Error - {e}") + + # Summary + print("\n" + "="*90) + print(f"SUMMARY:") + print(f" Fields with harvest dates: {len(harvest_dates)}") + print(f" RGB grids generated: {rgb_count}/{len(harvest_dates)}") + print(f" Output directory: {output_dir}") + print("="*90) + print("\nVisual inspection checklist:") + print(" ✓ Brown/bare soil at T~0d (harvest date) = Field properly harvested") + print(" ⚠ Green vegetation at T~0d = Possible data error or replanting") + print(" ✓ Green → Brown progression = Normal harvest sequence") + print("="*90) + + +if __name__ == "__main__": + main() diff --git a/python_app/debug_all_tiles_for_date.py b/python_app/debug_all_tiles_for_date.py new file mode 100644 index 0000000..ba07fe7 --- /dev/null +++ b/python_app/debug_all_tiles_for_date.py @@ -0,0 +1,107 @@ +#!/usr/bin/env python +""" +Debug script to find 
all tiles for a date and check which overlap with field boundary +""" + +import json +import rasterio +from rasterio.mask import mask +from pathlib import Path +import numpy as np +import shapely.geometry as shgeom +import pandas as pd + +# Load field 79 boundary +geojson_path = Path("laravel_app/storage/app/angata/Data/pivot.geojson") +field_id = "79" + +print(f"Loading field {field_id} from GeoJSON...") +with open(geojson_path) as f: + geojson_data = json.load(f) + +field_boundary = None +for feature in geojson_data.get('features', []): + props = feature.get('properties', {}) + if str(props.get('field', '')) == str(field_id): + geometry = feature.get('geometry') + if geometry: + geom_type = geometry.get('type', '') + coordinates = geometry.get('coordinates', []) + + if geom_type == 'MultiPolygon': + if coordinates and len(coordinates) > 0: + coords = coordinates[0][0] + field_boundary = shgeom.Polygon(coords) + elif geom_type == 'Polygon': + if coordinates and len(coordinates) > 0: + coords = coordinates[0] + field_boundary = shgeom.Polygon(coords) + break + +if field_boundary is None: + print(f"Field {field_id} not found") + exit(1) + +print(f"Field boundary bounds: {field_boundary.bounds}") +print(f"Field boundary area: {field_boundary.area}") + +# Find a specific date directory +tiff_dir = Path("laravel_app/storage/app/angata/merged_final_tif/5x5") +target_date = pd.Timestamp("2026-01-15") # Use a recent date that exists + +# Find tiles for that date +date_dirs = [] +for date_dir in tiff_dir.iterdir(): + if date_dir.is_dir(): + try: + dir_name = date_dir.name + date_str = dir_name.split('_')[0] + tile_date = pd.Timestamp(date_str) + if tile_date == target_date: + date_dirs.append(date_dir) + except: + pass + +if not date_dirs: + print(f"No tiles found for {target_date}") + exit(1) + +print(f"\nFound {len(date_dirs)} date directory(ies) for {target_date}") + +for date_dir in date_dirs: + print(f"\n=== Checking date directory: {date_dir.name} ===") + + tiles = 
list(date_dir.glob("*.tif")) + print(f"Found {len(tiles)} tiles in this directory") + + for tile_path in sorted(tiles): + try: + with rasterio.open(tile_path) as src: + tile_bounds = src.bounds + tile_geom = shgeom.box(*tile_bounds) + + intersects = field_boundary.intersects(tile_geom) + intersection = field_boundary.intersection(tile_geom) if intersects else None + intersection_area = intersection.area if intersection else 0 + + print(f"\n{tile_path.name}") + print(f" Tile bounds: {tile_bounds}") + print(f" Intersects field: {intersects}") + if intersects: + print(f" Intersection area: {intersection_area:.8f}") + + # Try to mask this tile + geom = shgeom.mapping(field_boundary) + try: + masked_data, _ = mask(src, [geom], crop=True, indexes=[1, 2, 3]) + print(f" ✓ Successfully masked! Shape: {masked_data.shape}") + + # Check the data in each band + for i, band_idx in enumerate([1, 2, 3]): + band_data = masked_data[i] + non_zero = (band_data != 0).sum() + print(f" Band {band_idx}: {non_zero} non-zero pixels out of {band_data.size}") + except ValueError as e: + print(f" ✗ Masking failed: {e}") + except Exception as e: + print(f" Error reading tile: {e}") diff --git a/python_app/debug_field_mask.py b/python_app/debug_field_mask.py new file mode 100644 index 0000000..ce96700 --- /dev/null +++ b/python_app/debug_field_mask.py @@ -0,0 +1,102 @@ +#!/usr/bin/env python +""" +Debug script to diagnose why field boundary masking produces no data +""" + +import json +import rasterio +from rasterio.mask import mask +from pathlib import Path +import numpy as np +import shapely.geometry as shgeom + +# Load a sample field boundary +geojson_path = Path("laravel_app/storage/app/angata/Data/pivot.geojson") +field_id = "79" # A field that had issues + +print(f"Loading field {field_id} from GeoJSON...") +with open(geojson_path) as f: + geojson_data = json.load(f) + +field_boundary = None +for feature in geojson_data.get('features', []): + props = feature.get('properties', {}) + if 
str(props.get('field', '')) == str(field_id): + geometry = feature.get('geometry') + if geometry: + geom_type = geometry.get('type', '') + coordinates = geometry.get('coordinates', []) + + if geom_type == 'MultiPolygon': + if coordinates and len(coordinates) > 0: + coords = coordinates[0][0] + field_boundary = shgeom.Polygon(coords) + elif geom_type == 'Polygon': + if coordinates and len(coordinates) > 0: + coords = coordinates[0] + field_boundary = shgeom.Polygon(coords) + break + +if field_boundary is None: + print(f"Field {field_id} not found") + exit(1) + +print(f"Field boundary bounds: {field_boundary.bounds}") +print(f"Field boundary area: {field_boundary.area}") + +# Load a sample TIFF tile +tiff_dir = Path("laravel_app/storage/app/angata/merged_final_tif/5x5") +tile_file = None +for date_dir in sorted(tiff_dir.iterdir()): + if date_dir.is_dir(): + for tif in date_dir.glob("*.tif"): + if tif.stat().st_size > 12e6: + tile_file = tif + break + if tile_file: + break + +if not tile_file: + print("No suitable TIFF found") + exit(1) + +print(f"\nTesting with TIFF: {tile_file.name}") + +with rasterio.open(tile_file) as src: + print(f"TIFF Bounds: {src.bounds}") + print(f"TIFF CRS: {src.crs}") + + # Check if field boundary is within tile bounds + tile_box = shgeom.box(*src.bounds) + intersects = field_boundary.intersects(tile_box) + print(f"Field boundary intersects tile: {intersects}") + + if intersects: + intersection = field_boundary.intersection(tile_box) + print(f"Intersection area: {intersection.area}") + print(f"Intersection bounds: {intersection.bounds}") + + # Try to mask and see what we get + print("\nAttempting to mask...") + geom = shgeom.mapping(field_boundary) + try: + masked_data, _ = mask(src, [geom], crop=True, indexes=[1, 2, 3]) + print(f"Masked data shape: {masked_data.shape}") + print(f"Masked data dtype: {masked_data.dtype}") + + # Check the data + for i, band_idx in enumerate([1, 2, 3]): + band_data = masked_data[i] + print(f"\nBand 
{band_idx}:") + print(f" min: {np.nanmin(band_data):.6f}") + print(f" max: {np.nanmax(band_data):.6f}") + print(f" mean: {np.nanmean(band_data):.6f}") + print(f" % valid (non-zero): {(band_data != 0).sum() / band_data.size * 100:.2f}%") + print(f" % NaN: {np.isnan(band_data).sum() / band_data.size * 100:.2f}%") + + # Show sample values + valid_pixels = band_data[band_data != 0] + if len(valid_pixels) > 0: + print(f" Sample valid values: {valid_pixels[:10]}") + except ValueError as e: + print(f"Error during masking: {e}") diff --git a/python_app/debug_tiff_inspect.py b/python_app/debug_tiff_inspect.py new file mode 100644 index 0000000..be51e5e --- /dev/null +++ b/python_app/debug_tiff_inspect.py @@ -0,0 +1,47 @@ +#!/usr/bin/env python +""" +Debug script to inspect TIFF file structure and data +""" + +import rasterio +from pathlib import Path +import numpy as np + +# Pick a tile file to inspect +tiff_dir = Path("laravel_app/storage/app/angata/merged_final_tif/5x5") + +# Find first available tile +tile_file = None +for date_dir in sorted(tiff_dir.iterdir()): + if date_dir.is_dir(): + for tif in date_dir.glob("*.tif"): + if tif.stat().st_size > 12e6: # Skip empty files + tile_file = tif + break + if tile_file: + break + +if not tile_file: + print("No suitable TIFF files found") + exit(1) + +print(f"Inspecting: {tile_file.name}") +print("=" * 80) + +with rasterio.open(tile_file) as src: + print(f"Band count: {src.count}") + print(f"Data type: {src.dtypes[0]}") + print(f"Shape: {src.height} x {src.width}") + print(f"CRS: {src.crs}") + print(f"Bounds: {src.bounds}") + print() + + # Read each band + for band_idx in range(1, min(6, src.count + 1)): + data = src.read(band_idx) + print(f"Band {band_idx}:") + print(f" dtype: {data.dtype}") + print(f" range: {data.min():.6f} - {data.max():.6f}") + print(f" mean: {data.mean():.6f}") + print(f" % valid (non-zero): {(data != 0).sum() / data.size * 100:.1f}%") + print() diff --git 
a/python_app/harvest_detection_experiments/experiment_framework/04_production_export/batch_plot_fields_rgb.py b/python_app/harvest_detection_experiments/experiment_framework/04_production_export/batch_plot_fields_rgb.py new file mode 100644 index 0000000..412c40b --- /dev/null +++ b/python_app/harvest_detection_experiments/experiment_framework/04_production_export/batch_plot_fields_rgb.py @@ -0,0 +1,299 @@ +""" +Batch Field Visualization Tool - RGB Imagery Around Harvest Date +Purpose: Generate visual validation using RGB satellite imagery samples around +predicted harvest date to verify predictions (bare soil = harvested, green = not harvested) + +Shows 12-15 RGB images in a grid, centered around the predicted harvest date + +Usage: + python batch_plot_fields_rgb.py field1,field2,field3 + python batch_plot_fields_rgb.py 10125,88,97 + + Or read from CSV: + python batch_plot_fields_rgb.py --file fields_to_check.csv +""" + +import pandas as pd +import numpy as np +import torch +import matplotlib.pyplot as plt +import matplotlib.dates as mdates +from pathlib import Path +from datetime import datetime, timedelta +import sys +import rasterio +from rasterio.mask import mask +import geopandas as gpd +from harvest_date_pred_utils import load_model_and_config, extract_features + + +def get_field_centroid(field_id, geojson_path="pivot.geojson"): + """Get centroid of field from GeoJSON for cropping RGB images.""" + try: + gdf = gpd.read_file(geojson_path) + field_geom = gdf[gdf['field'] == str(field_id)] + if len(field_geom) > 0: + centroid = field_geom.geometry.iloc[0].centroid + return (centroid.x, centroid.y) + except Exception as e: + print(f" Warning: Could not get field centroid - {e}") + return None + + +def load_rgb_image(tif_path, field_id=None, geojson_path="pivot.geojson"): + """ + Load RGB bands from 8-band GeoTIFF + Bands: 0=coastal, 1=blue, 2=green, 3=green_i, 4=yellow, 5=red, 6=rededge, 7=nir + RGB = bands 5,3,1 (Red, Green, Blue) + """ + try: + with 
rasterio.open(tif_path) as src: + # Read RGB bands (bands are 1-indexed in rasterio) + red = src.read(6) # Band 6 = red (0-indexed band 5) + green = src.read(3) # Band 3 = green (0-indexed band 2) + blue = src.read(2) # Band 2 = blue (0-indexed band 1) + + # Stack into RGB image + rgb = np.stack([red, green, blue], axis=2) + + # Normalize to 0-1 range (8-band data is typically 0-10000) + rgb = np.clip(rgb / 5000.0, 0, 1) + + return rgb + except Exception as e: + print(f" Error loading RGB from {tif_path}: {e}") + return None + + +def plot_field_rgb_validation(field_id, ci_data, model, config, scalers, device, + tif_folder="../../../laravel_app/storage/app/angata/merged_tif_8b", + output_dir="validation_plots_rgb"): + """ + Create validation plot for a single field: + - Top: Harvest probability over time with peak marked + - Bottom: 12-15 RGB images in grid around predicted harvest date + """ + # Create output directory + Path(output_dir).mkdir(parents=True, exist_ok=True) + + # Filter field data + field_data = ci_data[ci_data['field'] == field_id].copy() + if len(field_data) == 0: + print(f" ✗ Field {field_id}: No CI data found") + return False + + field_data = field_data.sort_values('Date') + print(f" ✓ Field {field_id}: {len(field_data)} days of data") + + try: + # Extract features and run inference + ci_column = config['data']['ci_column'] + feature_names = config['features'] + + feat_array = extract_features(field_data, feature_names, ci_column=ci_column) + if feat_array is None: + print(f" ✗ Field {field_id}: Feature extraction failed") + return False + + # Apply scalers + if isinstance(scalers, dict) and 'features' in scalers: + feat_array = scalers['features'].transform(feat_array) + + # Run inference + with torch.no_grad(): + x_tensor = torch.tensor(feat_array, dtype=torch.float32).unsqueeze(0).to(device) + out_imm, out_det = model(x_tensor) + imm_probs = out_imm.squeeze(0).cpu().numpy() + + # Find peak probability (predicted harvest date) + peak_idx = 
np.argmax(imm_probs) + peak_date = field_data['Date'].iloc[peak_idx] + peak_prob = imm_probs[peak_idx] + + print(f" Peak probability: {peak_prob:.3f} on {peak_date.strftime('%Y-%m-%d')}") + + # Get date range: ±6 days around peak (12-13 images total) + date_range = field_data['Date'].dt.date + peak_date_only = peak_date.date() if hasattr(peak_date, 'date') else peak_date + + days_before = 6 + days_after = 6 + start_date = peak_date_only - timedelta(days=days_before) + end_date = peak_date_only + timedelta(days=days_after) + + # Find available TIF files in date range + tif_folder_path = Path(tif_folder) + available_dates = [] + for tif_file in sorted(tif_folder_path.glob("*.tif")): + date_str = tif_file.stem # YYYY-MM-DD + try: + tif_date = datetime.strptime(date_str, "%Y-%m-%d").date() + if start_date <= tif_date <= end_date: + available_dates.append((tif_date, tif_file)) + except ValueError: + pass + + if len(available_dates) == 0: + print(f" Warning: No TIF files found in {start_date} to {end_date}") + return False + + print(f" Found {len(available_dates)} RGB images in date range") + + # Load RGB images + rgb_images = [] + rgb_dates = [] + for tif_date, tif_file in available_dates: + rgb = load_rgb_image(str(tif_file), field_id) + if rgb is not None: + rgb_images.append(rgb) + rgb_dates.append(tif_date) + + if len(rgb_images) == 0: + print(f" ✗ No RGB images loaded") + return False + + print(f" Loaded {len(rgb_images)} RGB images") + + # Create figure with probability plot + RGB grid + n_images = len(rgb_images) + n_cols = min(5, n_images) # Max 5 columns + n_rows = (n_images + n_cols - 1) // n_cols # Calculate rows needed + + fig = plt.figure(figsize=(18, 12)) + + # Probability plot (top, spanning all columns) + ax_prob = plt.subplot(n_rows + 1, n_cols, (1, n_cols)) + dates_arr = field_data['Date'].values + ax_prob.plot(dates_arr, imm_probs, '-', color='orange', linewidth=2.5, label='Imminent Probability', alpha=0.8) + ax_prob.axhline(y=0.5, color='red', 
linestyle='--', linewidth=1.5, alpha=0.5, label='Threshold (0.5)') + ax_prob.axvline(x=peak_date, color='darkred', linestyle=':', linewidth=2, alpha=0.7, label='Peak') + ax_prob.fill_between(dates_arr, 0.5, 1.0, alpha=0.08, color='red') + ax_prob.set_ylim(-0.05, 1.05) + ax_prob.set_ylabel('Probability', fontsize=11, fontweight='bold') + ax_prob.set_xlabel('Date', fontsize=11, fontweight='bold') + ax_prob.set_title(f'Field {field_id} - Model 307 Harvest Probability', fontsize=12, fontweight='bold') + ax_prob.grid(True, alpha=0.3) + ax_prob.legend(loc='upper right', fontsize=9) + ax_prob.xaxis.set_major_locator(mdates.MonthLocator(interval=1)) + ax_prob.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m-%d')) + plt.setp(ax_prob.xaxis.get_majorticklabels(), rotation=45, ha='right') + + # Annotate peak + ax_prob.annotate(f'{peak_prob:.2f}\n{peak_date_only}', + xy=(peak_date, peak_prob), + xytext=(20, 20), textcoords='offset points', + bbox=dict(boxstyle='round,pad=0.5', fc='yellow', alpha=0.8), + arrowprops=dict(arrowstyle='->', lw=1.5, color='darkred')) + + # RGB images in grid (below probability plot) + for i, (rgb, date) in enumerate(zip(rgb_images, rgb_dates)): + ax = plt.subplot(n_rows + 1, n_cols, n_cols + i + 1) + ax.imshow(rgb, extent=[0, 100, 0, 100]) + + # Highlight peak date + date_label = date.strftime('%m-%d') + is_peak = date == peak_date_only + color = 'darkred' if is_peak else 'black' + weight = 'bold' if is_peak else 'normal' + size = 11 if is_peak else 9 + + ax.set_title(date_label, fontsize=size, fontweight=weight, color=color) + ax.set_xticks([]) + ax.set_yticks([]) + + plt.suptitle(f'Field {field_id} RGB Imagery: {len(rgb_images)} Days Around Peak Harvest Probability\nPeak: {peak_prob:.2f} on {peak_date_only} | Green = Growing | Brown/Bare = Harvested', + fontsize=13, fontweight='bold', y=0.995) + plt.tight_layout() + + # Save + output_file = Path(output_dir) / f"field_{field_id}_rgb_validation.png" + plt.savefig(output_file, dpi=100, 
bbox_inches='tight') + print(f" ✓ Saved: {output_file}") + plt.close() + + return True + + except Exception as e: + print(f" ✗ Field {field_id}: Error - {e}") + import traceback + traceback.print_exc() + return False + + +def main(): + print("="*80) + print("BATCH RGB VISUALIZATION TOOL") + print("Visual check: RGB imagery around predicted harvest date") + print("="*80) + + # Parse arguments + fields_to_plot = [] + + if len(sys.argv) < 2: + print("\nUsage:") + print(" python batch_plot_fields_rgb.py field1,field2,field3") + print(" python batch_plot_fields_rgb.py --file fields.csv") + print("\nExample:") + print(" python batch_plot_fields_rgb.py 10125,88,97,440") + return + + if sys.argv[1] == "--file": + if len(sys.argv) < 3: + print("ERROR: --file requires a CSV filename") + return + csv_file = sys.argv[2] + print(f"\n[1/4] Loading fields from CSV: {csv_file}") + try: + df = pd.read_csv(csv_file) + fields_to_plot = df['field'].astype(str).str.strip().tolist() + print(f" ✓ Loaded {len(fields_to_plot)} fields") + except Exception as e: + print(f" ✗ Error reading CSV: {e}") + return + else: + # Parse comma-separated list + fields_to_plot = [f.strip() for f in sys.argv[1].split(',')] + print(f"\n[1/4] Processing {len(fields_to_plot)} field(s): {', '.join(fields_to_plot)}") + + # Load CI data + print("\n[2/4] Loading CI data...") + try: + ci_data = pd.read_csv("ci_data_for_python.csv") + ci_data['Date'] = pd.to_datetime(ci_data['Date']) + ci_data['field'] = ci_data['field'].astype(str).str.strip() + print(f" ✓ Loaded {len(ci_data)} observations for {ci_data['field'].nunique()} fields") + except Exception as e: + print(f" ✗ Error loading CI data: {e}") + return + + # Load model + print("\n[3/4] Loading model...") + try: + model, config, scalers = load_model_and_config(Path(".")) + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + model.eval() + print(f" ✓ Model loaded on {device}") + except Exception as e: + print(f" ✗ Error loading model: {e}") + 
return + + # Process each field + print("\n[4/4] Generating RGB validation plots...") + success_count = 0 + for field_id in fields_to_plot: + if plot_field_rgb_validation(field_id, ci_data, model, config, scalers, device): + success_count += 1 + + # Summary + print("\n" + "="*80) + print(f"SUMMARY: {success_count}/{len(fields_to_plot)} fields processed successfully") + print(f"Output directory: validation_plots_rgb/") + print("="*80) + print("\nInspect the PNG files to verify predictions:") + print(" ✓ Green imagery BEFORE peak date (field growing)") + print(" ✓ Brown/Bare imagery AT/AFTER peak date (harvested)") + print(" ✓ Peak date marked with red title") + + +if __name__ == "__main__": + main() diff --git a/python_app/harvest_detection_experiments/experiment_framework/experiments/rgb_visualization.py b/python_app/rgb_visualization.py similarity index 68% rename from python_app/harvest_detection_experiments/experiment_framework/experiments/rgb_visualization.py rename to python_app/rgb_visualization.py index dff8e34..c1fd2e0 100644 --- a/python_app/harvest_detection_experiments/experiment_framework/experiments/rgb_visualization.py +++ b/python_app/rgb_visualization.py @@ -22,6 +22,8 @@ import numpy as np import pandas as pd from pathlib import Path from datetime import datetime, timedelta +import matplotlib +matplotlib.use('Agg') # Use non-interactive backend to avoid display hangs import matplotlib.pyplot as plt import matplotlib.patches as patches from matplotlib.colors import Normalize @@ -86,55 +88,95 @@ def load_field_boundaries(geojson_path, field_id): return None, None -def find_closest_tiff(target_date, tiff_dir, days_window=60, field_boundary=None): +def find_overlapping_tiles(target_date, tiff_dir, field_boundary, days_window=60): """ - Find available TIFF file closest to target date. - Skips obviously empty files (< 12 MB) without reading data. + Find all tile files for target_date (or closest date) that overlap with field_boundary. 
- TIFF files are named: YYYY-MM-DD.tif + Tile files are organized in subdirectories by date: 5x5/YYYY-MM-DD_HH/*.tif Args: - target_date (pd.Timestamp): Target date to find TIFF near - tiff_dir (Path): Directory containing TIFF files + target_date (pd.Timestamp): Target date to find tiles near + tiff_dir (Path): Directory containing 5x5 date subdirectories + field_boundary (shapely.Polygon): Field boundary for overlap detection days_window (int): Max days to search before/after target - field_boundary (shapely.Polygon): Unused, kept for compatibility Returns: - Path: Path to closest TIFF file or None if not found - int: Days difference from target (negative=before, positive=after) + tuple: (list of tile paths, actual_date, days_diff) + list: tile paths that overlap field + pd.Timestamp: actual date of tiles found + int: days difference from target to actual date found """ target_date = pd.Timestamp(target_date) tiff_dir = Path(tiff_dir) - available_tiffs = list(tiff_dir.glob('*.tif')) - if not available_tiffs: - return None, None + if not tiff_dir.exists(): + return [], None, None - # Parse dates from filenames, skip obviously empty files by size - tiff_dates = [] - min_size_mb = 12.0 # Empty files are ~11.56 MB, valid files are ~12.6-13.0 MB + # Find all date subdirectories + available_dates = {} # {date: ([tile file paths], actual_dir_name)} + min_size_mb = 12.0 # Empty files are ~11.56 MB - for tiff_path in available_tiffs: + for date_dir in tiff_dir.iterdir(): + if not date_dir.is_dir(): + continue + try: - # Quick size check to skip obviously empty files - file_size_mb = tiff_path.stat().st_size / (1024 * 1024) - if file_size_mb < min_size_mb: + # Parse date from directory name (YYYY-MM-DD or YYYY-MM-DD_HH) + dir_name = date_dir.name + # Extract just the date part before underscore if it exists + date_str = dir_name.split('_')[0] + tile_date = pd.Timestamp(date_str) + days_diff = (tile_date - target_date).days + + if abs(days_diff) > days_window: continue - 
date_str = tiff_path.stem # Remove .tif extension - tiff_date = pd.Timestamp(date_str) - days_diff = (tiff_date - target_date).days - if abs(days_diff) <= days_window: - tiff_dates.append((tiff_path, tiff_date, days_diff)) + # Find all .tif files in this date directory + tile_files = [] + for tile_file in date_dir.glob('*.tif'): + # Skip obviously empty files + file_size_mb = tile_file.stat().st_size / (1024 * 1024) + if file_size_mb >= min_size_mb: + tile_files.append(tile_file) + + if tile_files: + available_dates[tile_date] = (tile_files, dir_name) except: pass - if not tiff_dates: - return None, None + if not available_dates: + return [], None, None - # Return closest by distance - closest = min(tiff_dates, key=lambda x: abs(x[2])) - return closest[0], closest[2] + # Find closest date + closest_date = min(available_dates.keys(), key=lambda d: abs((d - target_date).days)) + days_diff = (closest_date - target_date).days + tiles, _ = available_dates[closest_date] + + # Filter tiles to only those that overlap field boundary + if rasterio is None or field_boundary is None: + # If rasterio not available, use all tiles (conservative approach) + return tiles, closest_date, days_diff + + overlapping_tiles = [] + + for tile_path in tiles: + try: + with rasterio.open(tile_path) as src: + # Get tile bounds + tile_bounds = src.bounds # (left, bottom, right, top) + tile_geom = shgeom.box(*tile_bounds) + + # Check if tile overlaps field + if tile_geom.intersects(field_boundary): + overlapping_tiles.append(tile_path) + except: + pass + + if not overlapping_tiles: + # No overlapping tiles found, return all tiles for the closest date + return tiles, closest_date, days_diff + + return overlapping_tiles, closest_date, days_diff def load_and_clip_tiff_rgb(tiff_path, field_boundary, rgb_bands=(1, 2, 3)): @@ -148,12 +190,6 @@ def load_and_clip_tiff_rgb(tiff_path, field_boundary, rgb_bands=(1, 2, 3)): - Band 4: NIR - Band 5: CI - For older merged_tif_8b files (raw Planet Scope 
8-band): - - Bands 1=Coastal Blue, 2=Blue, 3=Green, 4=Red, 5=Red Edge, 6=NIR, 7=SWIR1, 8=SWIR2 - - RGB would be bands 4,3,2 = Red, Green, Blue - - This function auto-detects the format based on band count and descriptions. - Args: tiff_path (Path): Path to TIFF file field_boundary (shapely.Polygon): Field boundary for clipping @@ -168,58 +204,106 @@ def load_and_clip_tiff_rgb(tiff_path, field_boundary, rgb_bands=(1, 2, 3)): try: with rasterio.open(tiff_path) as src: - # Check CRS compatibility + # Check band count if src.count < 3: - print(f" ⚠ TIFF has only {src.count} bands (need at least 3 for RGB)") return None - # Auto-detect format based on band count and descriptions - if src.count == 5 and hasattr(src, 'descriptions') and src.descriptions: - # merged_final_tif format: Red, Green, Blue, NIR, CI - bands_to_read = (1, 2, 3) - elif src.count == 9: - # merged_tif_8b format: use bands 4, 3, 2 for Red, Green, Blue - bands_to_read = (4, 3, 2) - else: - # Default: try to use first 3 bands or specified bands - bands_to_read = rgb_bands - print(f" ℹ Unknown TIFF format ({src.count} bands), using bands {bands_to_read}") + # For merged_final_tif: bands 1,2,3 are R,G,B + bands_to_read = (1, 2, 3) - # Mask and read bands (crop=True reads only the clipped window, not full resolution) + # Mask and read bands geom = shgeom.mapping(field_boundary) try: - # Read RGB bands at once, then mask (more efficient than masking 3 times) masked_data, _ = mask(src, [geom], crop=True, indexes=list(bands_to_read)) - # masked_data shape is (3, height, width) - one layer per band - rgb_data = [] - for i, band_idx in enumerate(bands_to_read): - band_data = masked_data[i] # Extract the i-th band from masked stack - - # Better debug output that handles NaN values - valid_data = band_data[~np.isnan(band_data)] - if len(valid_data) > 0: - print(f" DEBUG: Band {band_idx} valid data range: {valid_data.min():.4f} - {valid_data.max():.4f} ({len(valid_data)} valid pixels)") - else: - print(f" DEBUG: 
Band {band_idx} - all NaN!") - rgb_data.append(band_data) - # Stack RGB - rgb = np.stack(rgb_data, axis=-1) + rgb = np.stack([masked_data[i] for i in range(3)], axis=-1) - # Data is already normalized 0-1 from the merged_final_tif files - # Just ensure it's float32 and clipped + # Convert to float32 if not already rgb = rgb.astype(np.float32) + + # Normalize to 0-1 range + # Data appears to be 8-bit (0-255 range) stored as float32 + # Check actual max value to determine normalization + max_val = np.nanmax(rgb) + if max_val > 0: + # If max is around 255 or less, assume 8-bit + if max_val <= 255: + rgb = rgb / 255.0 + # If max is around 65535, assume 16-bit + elif max_val <= 65535: + rgb = rgb / 65535.0 + # Otherwise divide by max to normalize + else: + rgb = rgb / max_val + rgb = np.clip(rgb, 0, 1) - print(f" DEBUG: Final RGB range: {rgb.min():.4f} - {rgb.max():.4f}") + # Check if result is all NaN + if np.all(np.isnan(rgb)): + return None + + # Replace any remaining NaN with 0 (cloud/invalid pixels) + rgb = np.nan_to_num(rgb, nan=0.0) return rgb - except ValueError as e: - print(f" ⚠ Error clipping to field boundary: {e}") + except ValueError: return None except Exception as e: - print(f" ✗ Error loading TIFF {tiff_path.name}: {e}") + return None + + +def load_and_composite_tiles_rgb(tile_paths, field_boundary): + """ + Load RGB from multiple overlapping tiles and composite them into a single image. 
+ + Args: + tile_paths (list[Path]): List of tile file paths + field_boundary (shapely.Polygon): Field boundary for clipping + + Returns: + np.ndarray: Composited RGB data (height, width, 3) with values 0-1 + or None if error occurs + """ + if rasterio is None or field_boundary is None or not tile_paths: + return None + + try: + # Load and composite all tiles + rgb_arrays = [] + + for tile_path in tile_paths: + rgb = load_and_clip_tiff_rgb(tile_path, field_boundary) + if rgb is not None: + rgb_arrays.append(rgb) + + if not rgb_arrays: + return None + + # If single tile, return it + if len(rgb_arrays) == 1: + composited = rgb_arrays[0] + else: + # If multiple tiles, use max composite + stacked = np.stack(rgb_arrays, axis=0) + composited = np.max(stacked, axis=0) + + composited = composited.astype(np.float32) + + # Stretch contrast: normalize to 0-1 range based on actual min/max in the data + # This makes dim images visible + valid_data = composited[composited > 0] + if len(valid_data) > 0: + data_min = np.percentile(valid_data, 2) # 2nd percentile to handle outliers + data_max = np.percentile(valid_data, 98) # 98th percentile + + if data_max > data_min: + composited = (composited - data_min) / (data_max - data_min) + composited = np.clip(composited, 0, 1) + + return composited.astype(np.float32) + + except Exception as e: return None @@ -276,27 +360,24 @@ def create_temporal_rgb_grid(harvest_date, field_data, field_id, tiff_dir, field actual_dates = [] # Store actual dates of TIFFs found for target in target_dates: - tiff_path, days_diff = find_closest_tiff(target, tiff_dir, days_window=60, field_boundary=field_boundary) + tile_paths, actual_date, days_diff = find_overlapping_tiles(target, tiff_dir, field_boundary, days_window=60) - if tiff_path is None: + if not tile_paths or actual_date is None: rgb_images.append(None) days_offsets.append(None) actual_dates.append(None) - print(f" ⚠ No TIFF found within 60 days of {target.strftime('%Y-%m-%d')} with sufficient 
data") + print(f" ⚠ No tiles found within 60 days of {target.strftime('%Y-%m-%d')} with sufficient data") continue - # Extract date from filename: YYYY-MM-DD.tif - tiff_date = pd.to_datetime(tiff_path.stem) - - rgb = load_and_clip_tiff_rgb(tiff_path, field_boundary) + rgb = load_and_composite_tiles_rgb(tile_paths, field_boundary) rgb_images.append(rgb) days_offsets.append(days_diff) - actual_dates.append(tiff_date) + actual_dates.append(actual_date) if rgb is not None: - print(f" ✓ Loaded {tiff_path.name} ({days_diff:+d}d from target)") + print(f" ✓ Loaded {len(tile_paths)} tile(s) for {actual_date.strftime('%Y-%m-%d')} ({days_diff:+d}d from target)") else: - print(f" ⚠ Loaded {tiff_path.name} but RGB data is None") + print(f" ⚠ Loaded {len(tile_paths)} tile(s) but RGB data is None") # Create 5x3 grid plot (15 images) fig, axes = plt.subplots(3, 5, figsize=(25, 15)) @@ -360,11 +441,16 @@ def create_temporal_rgb_grid(harvest_date, field_data, field_id, tiff_dir, field filename = f'field_{field_id}_{harvest_date_str}_{model_name}_harvest_rgb.png' output_path = Path(output_dir) / filename - plt.savefig(output_path, dpi=100, bbox_inches='tight') - plt.close() - - print(f" ✓ Saved: {filename}") - return output_path + try: + plt.savefig(output_path, dpi=100, format='png') + plt.close() + + print(f" ✓ Saved: {filename}") + return output_path + except Exception as e: + plt.close() + print(f" ✗ Error saving PNG: {e}") + return None def generate_rgb_grids(field_data, field_id, registered_harvest_dates, predicted_harvest_dates, diff --git a/r_app/10_create_master_grid_and_split_tiffs.R b/r_app/10_create_master_grid_and_split_tiffs.R index 4f78409..6279fb2 100644 --- a/r_app/10_create_master_grid_and_split_tiffs.R +++ b/r_app/10_create_master_grid_and_split_tiffs.R @@ -11,11 +11,28 @@ library(terra) library(sf) # ============================================================================ -# CONFIGURATION +# CONFIGURATION & COMMAND-LINE ARGUMENTS # 
============================================================================ +# Parse command-line arguments for date filtering +args <- commandArgs(trailingOnly = TRUE) + +# Example: Rscript 10_create_master_grid_and_split_tiffs.R 2026-01-13 2026-01-17 +start_date <- NULL +end_date <- NULL + +if (length(args) >= 1) { + start_date <- as.Date(args[1]) + cat("Filtering: start date =", as.character(start_date), "\n") +} + +if (length(args) >= 2) { + end_date <- as.Date(args[2]) + cat("Filtering: end date =", as.character(end_date), "\n") +} + PROJECT <- "angata" -TIFF_FOLDER <- file.path("laravel_app", "storage", "app", PROJECT, "merged_tif_8b") +TIFF_FOLDER <- file.path("..", "laravel_app", "storage", "app", PROJECT, "merged_tif_8b") # GRID SIZE CONFIGURATION - Change this to use different grid sizes # Options: 5x5 (25 tiles), 10x10 (100 tiles), etc. @@ -25,10 +42,10 @@ GRID_NCOLS <- 5 # Construct grid-specific subfolder path GRID_SIZE_LABEL <- paste0(GRID_NCOLS, "x", GRID_NROWS) -OUTPUT_FOLDER <- file.path("laravel_app", "storage", "app", PROJECT, "daily_tiles_split", GRID_SIZE_LABEL) +OUTPUT_FOLDER <- file.path("..", "laravel_app", "storage", "app", PROJECT, "daily_tiles_split", GRID_SIZE_LABEL) # Load field boundaries for overlap checking -GEOJSON_PATH <- file.path("laravel_app", "storage", "app", PROJECT, "Data", "pivot.geojson") +GEOJSON_PATH <- file.path("..", "laravel_app", "storage", "app", PROJECT, "Data", "pivot.geojson") cat("Combined: Create Master Grid (", GRID_SIZE_LABEL, ") and Split TIFFs into Tiles\n", sep = "") cat("Grid subfolder: daily_tiles_split/", GRID_SIZE_LABEL, "/\n", sep = "") @@ -40,31 +57,50 @@ cat("Grid subfolder: daily_tiles_split/", GRID_SIZE_LABEL, "/\n", sep = "") cat("\n[PART 1] Creating Master Grid\n") # Load field boundaries for overlap checking -cat("\n[1] Loading field boundaries from GeoJSON...\n") +cat("\n[1] Checking for existing master grid...\n") -if (!file.exists(GEOJSON_PATH)) { - stop("GeoJSON file not found at: ", 
GEOJSON_PATH, "\n", - "Please ensure ", PROJECT, " has a pivot.geojson file.") -} +# Check if master grid already exists +MASTER_GRID_PATH <- file.path(OUTPUT_FOLDER, paste0("master_grid_", GRID_SIZE_LABEL, ".geojson")) -field_boundaries_sf <- st_read(GEOJSON_PATH, quiet = TRUE) -field_boundaries_vect <- terra::vect(GEOJSON_PATH) - -cat(" ✓ Loaded ", nrow(field_boundaries_sf), " field(s)\n", sep = "") - -# Try to find a name column (could be 'name', 'field', 'field_name', etc.) -field_names <- NA -if ("name" %in% names(field_boundaries_sf)) { - field_names <- field_boundaries_sf$name -} else if ("field" %in% names(field_boundaries_sf)) { - field_names <- field_boundaries_sf$field -} else if ("field_name" %in% names(field_boundaries_sf)) { - field_names <- field_boundaries_sf$field_name +if (file.exists(MASTER_GRID_PATH)) { + cat(" ✓ Found existing master grid at:\n ", MASTER_GRID_PATH, "\n", sep = "") + master_grid_sf <- st_read(MASTER_GRID_PATH, quiet = TRUE) + field_boundaries_sf <- NULL # No need to load pivot.geojson + field_boundaries_vect <- NULL + + cat(" ✓ Loaded grid with ", nrow(master_grid_sf), " tiles\n", sep = "") + } else { - field_names <- 1:nrow(field_boundaries_sf) # Fall back to indices + # No existing grid - need to create one from pivot.geojson + cat(" No existing grid found. 
Creating new one from pivot.geojson...\n") + + if (!file.exists(GEOJSON_PATH)) { + stop("GeoJSON file not found at: ", GEOJSON_PATH, "\n", + "Please ensure ", PROJECT, " has a pivot.geojson file, or run this script ", + "from the same directory as a previous successful run (grid already exists).") + } + + field_boundaries_sf <- st_read(GEOJSON_PATH, quiet = TRUE) + field_boundaries_vect <- terra::vect(GEOJSON_PATH) + + cat(" ✓ Loaded ", nrow(field_boundaries_sf), " field(s) from GeoJSON\n", sep = "") } -cat(" Fields: ", paste(field_names, collapse = ", "), "\n", sep = "") +# Try to find a name column (only if field_boundaries_sf exists) +if (!is.null(field_boundaries_sf)) { + field_names <- NA + if ("name" %in% names(field_boundaries_sf)) { + field_names <- field_boundaries_sf$name + } else if ("field" %in% names(field_boundaries_sf)) { + field_names <- field_boundaries_sf$field + } else if ("field_name" %in% names(field_boundaries_sf)) { + field_names <- field_boundaries_sf$field_name + } else { + field_names <- 1:nrow(field_boundaries_sf) # Fall back to indices + } + + cat(" Fields: ", paste(field_names, collapse = ", "), "\n", sep = "") +} # Helper function: Check if a tile overlaps with any field (simple bbox overlap) tile_overlaps_fields <- function(tile_extent, field_geoms) { @@ -105,6 +141,27 @@ cat("\n[2] Checking TIFF extents...\n") tiff_files <- list.files(TIFF_FOLDER, pattern = "\\.tif$", full.names = FALSE) tiff_files <- sort(tiff_files) +# Filter by date range if specified +if (!is.null(start_date) || !is.null(end_date)) { + cat("\nApplying date filter...\n") + + file_dates <- as.Date(sub("\\.tif$", "", tiff_files)) + + if (!is.null(start_date) && !is.null(end_date)) { + keep_idx <- file_dates >= start_date & file_dates <= end_date + cat(" Date range: ", as.character(start_date), " to ", as.character(end_date), "\n", sep = "") + } else if (!is.null(start_date)) { + keep_idx <- file_dates >= start_date + cat(" From: ", as.character(start_date), "\n", 
sep = "") + } else { + keep_idx <- file_dates <= end_date + cat(" Until: ", as.character(end_date), "\n", sep = "") + } + + tiff_files <- tiff_files[keep_idx] + cat(" ✓ Filtered to ", length(tiff_files), " file(s)\n", sep = "") +} + if (length(tiff_files) == 0) { stop("No TIFF files found in ", TIFF_FOLDER) } @@ -242,26 +299,33 @@ if (file.exists(master_grid_file)) { cat("\n[PART 2] Creating Filtered Grid (only overlapping tiles)\n") -cat("\n[7] Filtering master grid to only overlapping tiles...\n") - -# Check which tiles overlap with any field -overlapping_tile_indices <- c() -for (tile_idx in 1:nrow(master_grid_sf)) { - tile_geom <- master_grid_sf[tile_idx, ] +# If grid was loaded from file, it's already filtered. Skip filtering. +if (!file.exists(MASTER_GRID_PATH)) { + cat("\n[7] Filtering master grid to only overlapping tiles...\n") - # Check overlap with any field - if (tile_overlaps_fields(st_bbox(tile_geom$geometry), field_boundaries_sf$geometry)) { - overlapping_tile_indices <- c(overlapping_tile_indices, tile_idx) + # Check which tiles overlap with any field + overlapping_tile_indices <- c() + for (tile_idx in 1:nrow(master_grid_sf)) { + tile_geom <- master_grid_sf[tile_idx, ] + + # Check overlap with any field + if (tile_overlaps_fields(st_bbox(tile_geom$geometry), field_boundaries_sf$geometry)) { + overlapping_tile_indices <- c(overlapping_tile_indices, tile_idx) + } } + + cat(" Found ", length(overlapping_tile_indices), " overlapping tiles out of ", N_TILES, "\n", sep = "") + cat(" Reduction: ", N_TILES - length(overlapping_tile_indices), " empty tiles will NOT be created\n", sep = "") + + # Create filtered grid with only overlapping tiles + filtered_grid_sf <- master_grid_sf[overlapping_tile_indices, ] + filtered_grid_sf$tile_id <- sprintf("%02d", overlapping_tile_indices) +} else { + cat("\n[7] Using pre-filtered grid (already loaded from file)...\n") + # Grid was already loaded - it's already filtered + filtered_grid_sf <- master_grid_sf } -cat(" 
Found ", length(overlapping_tile_indices), " overlapping tiles out of ", N_TILES, "\n", sep = "") -cat(" Reduction: ", N_TILES - length(overlapping_tile_indices), " empty tiles will NOT be created\n", sep = "") - -# Create filtered grid with only overlapping tiles -filtered_grid_sf <- master_grid_sf[overlapping_tile_indices, ] -filtered_grid_sf$tile_id <- sprintf("%02d", overlapping_tile_indices) - # Convert to SpatVector for makeTiles filtered_grid_vect <- terra::vect(filtered_grid_sf) @@ -314,7 +378,7 @@ for (file_idx in seq_along(tiff_files)) { dir.create(date_output_folder, recursive = TRUE, showWarnings = FALSE) } - cat(" Creating ", length(overlapping_tile_indices), " tiles...\n", sep = "") + cat(" Creating ", nrow(filtered_grid_sf), " tiles...\n", sep = "") # Use makeTiles with FILTERED grid (only overlapping tiles) tiles_list <- terra::makeTiles( diff --git a/r_app/40_mosaic_creation.R b/r_app/40_mosaic_creation.R index f1d6b91..7efb281 100644 --- a/r_app/40_mosaic_creation.R +++ b/r_app/40_mosaic_creation.R @@ -54,14 +54,12 @@ main <- function() { if (is.na(end_date)) { message("Invalid end_date provided. Using current date.") end_date <- Sys.Date() - end_date <- "2026-01-01" # Default date for testing } } else if (exists("end_date_str", envir = .GlobalEnv)) { end_date <- as.Date(get("end_date_str", envir = .GlobalEnv)) } else { # Default to current date if no argument is provided end_date <- Sys.Date() - end_date <- "2026-01-01" # Default date for testing message("No end_date provided. 
Using current date: ", format(end_date)) } From 4143bdf4d77cb0df88fba8e4705151ddb372201d Mon Sep 17 00:00:00 2001 From: Timon Date: Tue, 20 Jan 2026 12:06:45 +0100 Subject: [PATCH 14/15] updateing csv file --- r_app/80_calculate_kpis.R | 150 +++++++++++++++++-------------- r_app/80_report_building_utils.R | 16 +++- r_app/80_weekly_stats_utils.R | 53 +++++------ 3 files changed, 117 insertions(+), 102 deletions(-) diff --git a/r_app/80_calculate_kpis.R b/r_app/80_calculate_kpis.R index 410ed40..bef0764 100644 --- a/r_app/80_calculate_kpis.R +++ b/r_app/80_calculate_kpis.R @@ -66,17 +66,15 @@ CV_TREND_THRESHOLD_SIGNIFICANT <- 0.05 # Negative slope = CV decreasing = field becoming MORE uniform = GOOD # Positive slope = CV increasing = field becoming MORE patchy = BAD # Near zero = Homogenous growth (all crops progressing equally) -CV_SLOPE_STRONG_IMPROVEMENT_MIN <- -0.05 # CV decreasing rapidly -CV_SLOPE_IMPROVEMENT_MIN <- -0.02 # Gradual synchronization -CV_SLOPE_IMPROVEMENT_MAX <- -0.005 # Becoming uniform -CV_SLOPE_HOMOGENOUS_MIN <- -0.005 # Stable, uniform growth -CV_SLOPE_HOMOGENOUS_MAX <- 0.005 # No change in uniformity -CV_SLOPE_PATCHINESS_MIN <- 0.005 # Minor divergence -CV_SLOPE_PATCHINESS_MAX <- 0.02 # Growing patchiness -CV_SLOPE_SEVERE_MIN <- 0.02 # Field fragmentation beginning +CV_SLOPE_STRONG_IMPROVEMENT_MIN <- -0.03 # CV decreasing rapidly (>3% drop over 8 weeks) +CV_SLOPE_IMPROVEMENT_MIN <- -0.02 # CV decreasing (2-3% drop over 8 weeks) +CV_SLOPE_IMPROVEMENT_MAX <- -0.01 # Gradual improvement (1-2% drop over 8 weeks) +CV_SLOPE_HOMOGENOUS_MIN <- -0.01 # Essentially stable (small natural variation) +CV_SLOPE_HOMOGENOUS_MAX <- 0.01 # No change in uniformity (within ±1% over 8 weeks) +CV_SLOPE_PATCHINESS_MIN <- 0.01 # Minor divergence (1-2% increase over 8 weeks) +CV_SLOPE_PATCHINESS_MAX <- 0.02 # Growing patchiness (2-3% increase over 8 weeks) +CV_SLOPE_SEVERE_MIN <- 0.02 # Severe fragmentation (>3% increase over 8 weeks) -# CLOUD COVER ROUNDING 
INTERVALS -CLOUD_INTERVALS <- c(0, 50, 60, 70, 80, 90, 100) # PERCENTILE CALCULATIONS CI_PERCENTILE_LOW <- 0.10 @@ -391,7 +389,7 @@ main <- function() { current_week = current_week, year = year) - message(paste(" ✓ Added Weekly_ci_change, CV_Trend_Short_Term, Four_week_trend, CV_Trend_Long_Term, nmr_weeks_in_this_phase")) + message(paste(" ✓ Added Weekly_ci_change, CV_Trend_Short_Term, Four_week_trend, CV_Trend_Long_Term, nmr_of_weeks_analysed")) # Load weekly harvest probabilities from script 31 (if available) message("\n4. Loading harvest probabilities from script 31...") @@ -479,7 +477,7 @@ main <- function() { if (is.na(planting_dt)) { return(NA_real_) } - round(as.numeric(difftime(end_date, planting_dt, units = "weeks")), 1) + round(as.numeric(difftime(end_date, planting_dt, units = "weeks")), 0) }) }, # Column 10: Phase (recalculate based on updated Age_week) @@ -493,31 +491,23 @@ main <- function() { NA_character_ }) }, - # Column 11: nmr_weeks_in_this_phase (already in current_stats from calculate_kpi_trends) + # Column 11: nmr_of_weeks_analysed (already in current_stats from calculate_kpi_trends) # Column 12: Germination_progress (calculated here from CI values) - Germination_progress = { - sapply(seq_len(nrow(current_stats)), function(idx) { - age_w <- Age_week[idx] - mean_ci_val <- Mean_CI[idx] - - # Only relevant for germination phase (0-4 weeks) - if (is.na(age_w) || age_w < 0 || age_w >= 4) { - return(NA_character_) - } - - # Estimate % of field with CI >= germination threshold - # Based on mean CI, estimate germination percentage - if (mean_ci_val >= 0.4) { - return(">80%") - } else if (mean_ci_val >= 0.25) { - return("50-80%") - } else if (mean_ci_val >= 0.1) { - return("20-50%") - } else { - return("<20%") - } - }) - }, + # Bin Pct_pixels_CI_gte_2 into 10% intervals: 0-10%, 10-20%, ..., 80-90%, 90-95%, 95-100% + Germination_progress = sapply(Pct_pixels_CI_gte_2, function(pct) { + if (is.na(pct)) return(NA_character_) + if (pct >= 95) 
return("95-100%") + else if (pct >= 90) return("90-95%") + else if (pct >= 80) return("80-90%") + else if (pct >= 70) return("70-80%") + else if (pct >= 60) return("60-70%") + else if (pct >= 50) return("50-60%") + else if (pct >= 40) return("40-50%") + else if (pct >= 30) return("30-40%") + else if (pct >= 20) return("20-30%") + else if (pct >= 10) return("10-20%") + else return("0-10%") + }), # Column 13: Imminent_prob (from script 31 or NA if not available) Imminent_prob = { if (!is.null(imminent_prob_data)) { @@ -526,59 +516,84 @@ main <- function() { rep(NA_real_, nrow(current_stats)) } }, - # Column 14: Status_trigger (based on harvest probability + growth status) - Status_trigger = { - triggers <- sapply(seq_len(nrow(current_stats)), function(idx) { + # Column 14: Status_Alert (based on harvest probability + crop health status) + # Priority order: Ready for harvest-check → Strong decline → Harvested/bare → NA + Status_Alert = { + sapply(seq_len(nrow(current_stats)), function(idx) { imminent_prob <- Imminent_prob[idx] age_w <- Age_week[idx] - ci_change <- Weekly_ci_change[idx] - phase <- Phase[idx] + weekly_ci_chg <- Weekly_ci_change[idx] + mean_ci_val <- Mean_CI[idx] - # Priority 1: Harvest imminent (high probability) - if (!is.na(imminent_prob) && imminent_prob > 0.5) { - return("harvest_imminent") + # Priority 1: Ready for harvest-check (imminent + mature cane ≥12 months) + if (!is.na(imminent_prob) && imminent_prob > 0.5 && !is.na(age_w) && age_w >= 52) { + return("Ready for harvest-check") } - # Priority 2: Age-based triggers - if (!is.na(age_w)) { - if (age_w >= 45) return("harvest_ready") - if (age_w >= 39) return("maturation_progressing") - if (age_w >= 4 & age_w < 39) return("growth_on_track") - if (age_w < 4) return("germination_started") + # Priority 2: Strong decline in crop health (drop ≥2 points but still >1.5) + if (!is.na(weekly_ci_chg) && weekly_ci_chg <= -2.0 && !is.na(mean_ci_val) && mean_ci_val > 1.5) { + return("Strong decline in crop 
health") } - # Fallback + # Priority 3: Harvested/bare (Mean CI < 1.5) + if (!is.na(mean_ci_val) && mean_ci_val < 1.5) { + return("Harvested/bare") + } + + # Fallback: no alert NA_character_ }) - triggers }, # Columns 15-16: CI-based columns already in current_stats (CI_range, CI_Percentiles) # Column 17: Already in current_stats (CV) # Column 18: Already in current_stats (CV_Trend_Short_Term) # Column 19: CV_Trend_Long_Term (from current_stats - raw slope value) # Column 19b: CV_Trend_Long_Term_Category (categorical interpretation of slope) + # 3 classes: More uniform (slope < -0.01), Stable uniformity (-0.01 to 0.01), Less uniform (slope > 0.01) CV_Trend_Long_Term_Category = { - sapply(current_stats$CV_Trend_Long_Term, categorize_cv_slope) + sapply(current_stats$CV_Trend_Long_Term, function(slope) { + if (is.na(slope)) { + return(NA_character_) + } else if (slope < -0.01) { + return("More uniform") + } else if (slope > 0.01) { + return("Less uniform") + } else { + return("Stable uniformity") + } + }) }, # Columns 20-21: Already in current_stats (Cloud_pct_clear, Cloud_category) - .keep = "all" # Keep all existing columns + # Bin Cloud_pct_clear into 10% intervals: 0-10%, 10-20%, ..., 80-90%, 90-95%, 95-100% + Cloud_pct_clear = sapply(Cloud_pct_clear, function(pct) { + if (is.na(pct)) return(NA_character_) + if (pct >= 95) return("95-100%") + else if (pct >= 90) return("90-95%") + else if (pct >= 80) return("80-90%") + else if (pct >= 70) return("70-80%") + else if (pct >= 60) return("60-70%") + else if (pct >= 50) return("50-60%") + else if (pct >= 40) return("40-50%") + else if (pct >= 30) return("30-40%") + else if (pct >= 20) return("20-30%") + else if (pct >= 10) return("10-20%") + else return("0-10%") + }), ) %>% select( - all_of(c("Field_id", "Farm_Section", "Field_name", "Acreage", "Mean_CI", "Weekly_ci_change", - "Four_week_trend", "Last_harvest_or_planting_date", "Age_week", "Phase", - "nmr_weeks_in_this_phase", "Germination_progress", "Imminent_prob", 
"Status_trigger", + all_of(c("Field_id", "Farm_Section", "Field_name", "Acreage", "Status_Alert", + "Last_harvest_or_planting_date", "Age_week", "Phase", + "Germination_progress", + "Mean_CI", "Weekly_ci_change", "Four_week_trend", "CI_range", "CI_Percentiles", "CV", "CV_Trend_Short_Term", "CV_Trend_Long_Term", "CV_Trend_Long_Term_Category", - "Cloud_pct_clear", "Cloud_category")), - any_of(c("CI_range", "CI_Percentiles")) + "Imminent_prob", "Cloud_pct_clear", "Cloud_category")) ) message(paste("✓ Built final output with", nrow(field_analysis_df), "fields and 21 columns")) - summary_statistics_df <- generate_field_analysis_summary(field_analysis_df) - export_paths <- export_field_analysis_excel( field_analysis_df, - summary_statistics_df, + NULL, project_dir, current_week, year, @@ -586,15 +601,12 @@ main <- function() { ) cat("\n--- Per-field Results (first 10) ---\n") - available_cols <- c("Field_id", "Acreage", "Age_week", "Mean_CI", "Four_week_trend", "Status_trigger", "Cloud_category") + available_cols <- c("Field_id", "Acreage", "Age_week", "Mean_CI", "Four_week_trend", "Status_Alert", "Cloud_category") available_cols <- available_cols[available_cols %in% names(field_analysis_df)] if (length(available_cols) > 0) { print(head(field_analysis_df[, available_cols], 10)) } - cat("\n--- Summary Statistics ---\n") - print(summary_statistics_df) - # ========== FARM-LEVEL KPI AGGREGATION ========== # Aggregate the per-field analysis into farm-level summary statistics @@ -623,15 +635,15 @@ main <- function() { farm_summary$phase_distribution <- phase_dist - # 2. STATUS TRIGGER DISTRIBUTION + # 2. 
STATUS ALERT DISTRIBUTION status_dist <- field_data %>% - group_by(Status_trigger) %>% + group_by(Status_Alert) %>% summarise( num_fields = n(), acreage = sum(Acreage, na.rm = TRUE), .groups = 'drop' ) %>% - rename(Category = Status_trigger) + rename(Category = Status_Alert) farm_summary$status_distribution <- status_dist diff --git a/r_app/80_report_building_utils.R b/r_app/80_report_building_utils.R index 7b7f4e9..0c5db3c 100644 --- a/r_app/80_report_building_utils.R +++ b/r_app/80_report_building_utils.R @@ -112,8 +112,13 @@ export_field_analysis_excel <- function(field_df, summary_df, project_dir, curre field_df_rounded <- field_df %>% mutate(across(where(is.numeric), ~ round(., 2))) - summary_df_rounded <- summary_df %>% - mutate(across(where(is.numeric), ~ round(., 2))) + # Handle NULL summary_df + summary_df_rounded <- if (!is.null(summary_df)) { + summary_df %>% + mutate(across(where(is.numeric), ~ round(., 2))) + } else { + NULL + } output_subdir <- file.path(reports_dir, "kpis", "field_analysis") if (!dir.exists(output_subdir)) { @@ -124,10 +129,13 @@ export_field_analysis_excel <- function(field_df, summary_df, project_dir, curre excel_path <- file.path(output_subdir, excel_filename) excel_path <- normalizePath(excel_path, winslash = "\\", mustWork = FALSE) + # Build sheets list dynamically sheets <- list( - "Field Data" = field_df_rounded, - "Summary" = summary_df_rounded + "Field Data" = field_df_rounded ) + if (!is.null(summary_df_rounded)) { + sheets[["Summary"]] <- summary_df_rounded + } write_xlsx(sheets, excel_path) message(paste("✓ Field analysis Excel exported to:", excel_path)) diff --git a/r_app/80_weekly_stats_utils.R b/r_app/80_weekly_stats_utils.R index fb5dc8b..a4b460e 100644 --- a/r_app/80_weekly_stats_utils.R +++ b/r_app/80_weekly_stats_utils.R @@ -184,12 +184,17 @@ round_cloud_to_intervals <- function(cloud_pct_clear) { return(NA_character_) } - if (cloud_pct_clear < 50) return("<50%") + if (cloud_pct_clear < 10) return("0-10%") + if 
(cloud_pct_clear < 20) return("10-20%") + if (cloud_pct_clear < 30) return("20-30%") + if (cloud_pct_clear < 40) return("30-40%") + if (cloud_pct_clear < 50) return("40-50%") if (cloud_pct_clear < 60) return("50-60%") if (cloud_pct_clear < 70) return("60-70%") if (cloud_pct_clear < 80) return("70-80%") if (cloud_pct_clear < 90) return("80-90%") - return(">90%") + if (cloud_pct_clear < 95) return("90-95%") + return("95-100%") } get_ci_percentiles <- function(ci_values) { @@ -420,12 +425,18 @@ calculate_field_statistics <- function(field_boundaries_sf, week_num, year, range_str <- sprintf("%.1f-%.1f", range_min, range_max) ci_percentiles_str <- get_ci_percentiles(ci_vals) + # Count pixels with CI >= 2 (germination threshold) + GERMINATION_CI_THRESHOLD <- 2.0 + num_pixels_gte_2 <- sum(ci_vals >= GERMINATION_CI_THRESHOLD, na.rm = TRUE) + num_pixels_total <- length(ci_vals) + pct_pixels_gte_2 <- if (num_pixels_total > 0) round((num_pixels_gte_2 / num_pixels_total) * 100, 1) else 0 + field_rows <- extracted[extracted$ID == field_poly_idx, ] num_total <- nrow(field_rows) num_data <- sum(!is.na(field_rows$CI)) pct_clear <- if (num_total > 0) round((num_data / num_total) * 100, 1) else 0 cloud_cat <- if (num_data == 0) "No image available" - else if (pct_clear >= 99.5) "Clear view" + else if (pct_clear >= 95) "Clear view" else "Partial coverage" # Age_week and Phase are now calculated in main script using actual planting dates @@ -440,9 +451,10 @@ calculate_field_statistics <- function(field_boundaries_sf, week_num, year, results_list[[length(results_list) + 1]] <- data.frame( Field_id = field_id, Mean_CI = round(mean_ci, 2), - CV = round(cv, 4), + CV = round(cv * 100, 2), CI_range = range_str, CI_Percentiles = ci_percentiles_str, + Pct_pixels_CI_gte_2 = pct_pixels_gte_2, Cloud_pct_clear = pct_clear, Cloud_category = cloud_cat, stringsAsFactors = FALSE @@ -482,7 +494,7 @@ calculate_kpi_trends <- function(current_stats, prev_stats = NULL, current_stats$CV_Trend_Short_Term <- 
NA_real_ current_stats$Four_week_trend <- NA_real_ current_stats$CV_Trend_Long_Term <- NA_real_ - current_stats$nmr_weeks_in_this_phase <- 1L + current_stats$nmr_of_weeks_analysed <- 1L if (is.null(prev_stats) || nrow(prev_stats) == 0) { message(" No previous week data available - using defaults") @@ -502,7 +514,7 @@ calculate_kpi_trends <- function(current_stats, prev_stats = NULL, if (length(analysis_files) > 0) { recent_file <- analysis_files[which.max(file.info(analysis_files)$mtime)] prev_field_analysis <- readr::read_csv(recent_file, show_col_types = FALSE, - col_select = c(Field_id, nmr_weeks_in_this_phase, Phase)) + col_select = c(Field_id, nmr_of_weeks_analysed, Phase)) } } }, error = function(e) { @@ -510,7 +522,7 @@ calculate_kpi_trends <- function(current_stats, prev_stats = NULL, }) if (!is.null(prev_field_analysis) && nrow(prev_field_analysis) > 0) { - message(paste(" Using previous field_analysis to track nmr_weeks_in_this_phase")) + message(paste(" Using previous field_analysis to track nmr_of_weeks_analysed")) } historical_4weeks <- list() @@ -643,30 +655,13 @@ calculate_kpi_trends <- function(current_stats, prev_stats = NULL, dplyr::filter(Field_id == field_id) if (nrow(prev_analysis_row) > 0) { - prev_phase_analysis <- prev_analysis_row$Phase[1] - prev_nmr_weeks_analysis <- prev_analysis_row$nmr_weeks_in_this_phase[1] + prev_nmr_weeks_analysis <- prev_analysis_row$nmr_of_weeks_analysed[1] - if (!is.na(current_stats$Phase[i]) && !is.na(prev_phase_analysis)) { - if (current_stats$Phase[i] == prev_phase_analysis) { - current_stats$nmr_weeks_in_this_phase[i] <- - if (!is.na(prev_nmr_weeks_analysis)) prev_nmr_weeks_analysis + 1L else 2L - } else { - current_stats$nmr_weeks_in_this_phase[i] <- 1L - } - } - } else if (!is.na(current_stats$Phase[i]) && !is.na(prev_row$Phase[1])) { - if (current_stats$Phase[i] == prev_row$Phase[1]) { - current_stats$nmr_weeks_in_this_phase[i] <- 2L + # Only increment nmr_of_weeks_analysed if we have previous data + if 
(!is.na(prev_nmr_weeks_analysis)) { + current_stats$nmr_of_weeks_analysed[i] <- prev_nmr_weeks_analysis + 1L } else { - current_stats$nmr_weeks_in_this_phase[i] <- 1L - } - } - } else { - if (!is.na(current_stats$Phase[i]) && !is.na(prev_row$Phase[1])) { - if (current_stats$Phase[i] == prev_row$Phase[1]) { - current_stats$nmr_weeks_in_this_phase[i] <- 2L - } else { - current_stats$nmr_weeks_in_this_phase[i] <- 1L + current_stats$nmr_of_weeks_analysed[i] <- 1L } } } From 9a55d2fcf80946dfb873b8fa601391d665e99f5f Mon Sep 17 00:00:00 2001 From: Timon Date: Tue, 27 Jan 2026 08:58:06 +0100 Subject: [PATCH 15/15] Refactor full pipeline script to include intelligent checking of existing outputs and dynamic execution of scripts. Added new Python scripts for RGB validation and evaluation template creation. Enhanced error handling and logging throughout the pipeline. --- python_app/31_harvest_imminent_weekly.py | 12 +- ... => batch_rgb_validation_top_fields_v3.py} | 112 +++- python_app/create_rgb_evaluation_template.py | 193 +++++++ python_app/debug_all_tiles_for_date.py | 107 ---- python_app/debug_field_mask.py | 102 ---- python_app/debug_tiff_inspect.py | 47 -- python_app/download_planet_missing_dates.py | 2 +- python_app/merge_ci_data.R | 29 - python_app/rgb_visualization.py | 504 +++++++++++++----- r_app/10_create_master_grid_and_split_tiffs.R | 8 +- r_app/20_ci_extraction.R | 6 +- r_app/30_interpolate_growth_model.R | 2 +- r_app/80_calculate_kpis.R | 24 +- r_app/run_full_pipeline.R | 455 ++++++++++++---- 14 files changed, 1035 insertions(+), 568 deletions(-) rename python_app/{batch_rgb_validation_top_fields.py => batch_rgb_validation_top_fields_v3.py} (69%) create mode 100644 python_app/create_rgb_evaluation_template.py delete mode 100644 python_app/debug_all_tiles_for_date.py delete mode 100644 python_app/debug_field_mask.py delete mode 100644 python_app/debug_tiff_inspect.py delete mode 100644 python_app/merge_ci_data.R diff --git 
a/python_app/31_harvest_imminent_weekly.py b/python_app/31_harvest_imminent_weekly.py index 8722eda..39d3eb9 100644 --- a/python_app/31_harvest_imminent_weekly.py +++ b/python_app/31_harvest_imminent_weekly.py @@ -1,4 +1,4 @@ -""" +r""" Script: 02_harvest_imminent_weekly.py Purpose: WEEKLY MONITORING - Run WEEKLY/DAILY to get real-time harvest status for all fields @@ -38,12 +38,12 @@ Use Cases: - Feed into 09b script for weekly dashboard reports Usage: - python 02_harvest_imminent_weekly.py [project_name] + python python_app/31_harvest_imminent_weekly.py angata Examples: - python 02_harvest_imminent_weekly.py angata - python 02_harvest_imminent_weekly.py esa - python 02_harvest_imminent_weekly.py chemba + python python_app/31_harvest_imminent_weekly.py angata + python python_app/31_harvest_imminent_weekly.py esa + python python_app/31_harvest_imminent_weekly.py chemba If no project specified, defaults to 'angata' """ @@ -264,7 +264,7 @@ def main(): # [3] Load model (from python_app directory) print("\n[3/5] Loading Model 307...") - model_dir = Path(".") # Current directory is python_app/, contains model.pt, config.json, scalers.pkl + model_dir = Path("python_app") # Model files located in python_app/ directory model, config, scalers = load_model_and_config(model_dir) device = torch.device("cuda" if torch.cuda.is_available() else "cpu") print(f" Device: {device}") diff --git a/python_app/batch_rgb_validation_top_fields.py b/python_app/batch_rgb_validation_top_fields_v3.py similarity index 69% rename from python_app/batch_rgb_validation_top_fields.py rename to python_app/batch_rgb_validation_top_fields_v3.py index cccb9ae..259a081 100644 --- a/python_app/batch_rgb_validation_top_fields.py +++ b/python_app/batch_rgb_validation_top_fields_v3.py @@ -1,23 +1,25 @@ #!/usr/bin/env python """ -Batch RGB Validation for Top 50 Largest Fields +Batch RGB Validation for Top 100 Largest Fields - V3 -Generates 5x3 RGB temporal grids for the latest complete harvest season of the 
50 largest fields. +Same as v1 but with dynamic image selection (checks for actual data, skips empty/black images). + +Generates 5x3 RGB temporal grids for the latest complete harvest season of the 100 largest fields. Uses actual season_end dates from harvest.xlsx for visual validation of field conditions at harvest. Configuration: - GeoJSON: pivot.geojson (defines field boundaries and sizes) - Harvest data: harvest.xlsx (season_end dates for completed harvests) -- CI data: ci_data_for_python.csv - Output: RGB directory with field_name_YYYYMMDD_harvest_rgb.png Usage: - python batch_rgb_validation_top_fields.py + python batch_rgb_validation_top_fields_v3.py --field 1 + python batch_rgb_validation_top_fields_v3.py Output: - Saves 5x3 RGB grids to: laravel_app/storage/app/angata/RGB/ - Filenames: field___harvest_rgb.png - - Each grid shows 15 images at 7-day intervals around the season_end date + - Each grid shows 15 images around the harvest date (dynamic date selection, skips empty images) """ import json @@ -26,6 +28,7 @@ import pandas as pd from pathlib import Path from datetime import datetime, timedelta import sys +import argparse # Add parent directory to path for imports sys.path.insert(0, str(Path(__file__).parent)) @@ -189,16 +192,26 @@ def load_harvest_dates_from_xlsx(harvest_xlsx_path, top_50_fields_df): def main(): + parser = argparse.ArgumentParser(description='RGB validation of harvest dates using satellite imagery (v3 - dynamic)') + parser.add_argument('--field', type=str, default=None, help='Specific field ID to validate (e.g., "1" or "10022")') + parser.add_argument('--project', type=str, default='angata', help='Project name (default: angata)') + + args = parser.parse_args() + print("="*90) - print("BATCH RGB VALIDATION - TOP 50 LARGEST FIELDS") - print("Visual inspection of latest harvest dates from harvest.xlsx using RGB imagery") + if args.field: + print(f"RGB VALIDATION V3 - SINGLE FIELD: {args.field}") + else: + print("RGB VALIDATION V3 - TOP 50 
LARGEST FIELDS") + print("Visual inspection of harvest dates from harvest.xlsx using RGB imagery (dynamic selection)") print("="*90) # Configuration - geojson_path = Path("laravel_app/storage/app/angata/Data/pivot.geojson") - harvest_xlsx = Path("laravel_app/storage/app/angata/Data/harvest.xlsx") - output_dir = Path("laravel_app/storage/app/angata/RGB") - tiff_dir = Path("laravel_app/storage/app/angata/merged_final_tif/5x5") + project = args.project + geojson_path = Path(f"laravel_app/storage/app/{project}/Data/pivot.geojson") + harvest_xlsx = Path(f"laravel_app/storage/app/{project}/Data/harvest.xlsx") + output_dir = Path(f"laravel_app/storage/app/{project}/RGB") + tiff_dir = Path(f"laravel_app/storage/app/{project}/merged_final_tif/5x5") # Verify paths if not geojson_path.exists(): @@ -213,18 +226,83 @@ def main(): output_dir.mkdir(parents=True, exist_ok=True) - # Step 1: Load GeoJSON and get top 50 largest fields - print("\n[1/4] Loading GeoJSON and identifying top 50 largest fields...") + # Handle single field mode + if args.field: + print(f"\n[1/3] Loading harvest data for field {args.field}...") + + harvest_df = pd.read_excel(harvest_xlsx) + harvest_df['season_end'] = pd.to_datetime(harvest_df['season_end'], errors='coerce') + harvest_df['field'] = harvest_df['field'].astype(str).str.strip() + + field_records = harvest_df[harvest_df['field'] == args.field] + field_records = field_records[field_records['season_end'].notna()] + + if len(field_records) == 0: + print(f"✗ No harvest data found for field {args.field}") + return + + # Get latest harvest for this field + latest_idx = field_records['season_end'].idxmax() + latest_row = field_records.loc[latest_idx] + harvest_date = latest_row['season_end'] + + print(f" ✓ Found harvest: {harvest_date.strftime('%Y-%m-%d')}") + + # Load field name from GeoJSON + print(f"\n[2/3] Loading field name from GeoJSON...") + with open(geojson_path) as f: + geojson_data = json.load(f) + + field_name = f"field_{args.field}" + for 
feature in geojson_data.get('features', []): + props = feature.get('properties', {}) + if str(props.get('field', '')) == args.field: + field_name = props.get('name', field_name) + break + + print(f" ✓ Field name: {field_name}") + + # Generate RGB grid + print(f"\n[3/3] Generating RGB validation grid (v3 dynamic)...") + results = generate_rgb_grids( + field_data=None, + field_id=args.field, + registered_harvest_dates=[], + predicted_harvest_dates=[ + { + 'harvest_date': harvest_date, + 'model_name': 'harvest_xlsx' + } + ], + output_dir=str(output_dir), + tiff_dir=str(tiff_dir), + geojson_path=str(geojson_path) + ) + + print("\n" + "="*90) + if results['predicted']: + print(f"✓ RGB grid generated successfully!") + print(f" Field: {field_name} (ID: {args.field})") + print(f" Harvest date: {harvest_date.strftime('%Y-%m-%d')}") + print(f" Output: {output_dir}") + else: + print(f"⚠ No RGB grid generated (no imagery available)") + print("="*90) + return + + # Batch mode for top 100 fields + print(f"\n[1/4] Loading GeoJSON and identifying top 100 largest fields...") + fields_df = load_geojson_and_calculate_areas(geojson_path) if fields_df is None: return - top_50_fields = fields_df.head(50) - print(f" ✓ Selected {len(top_50_fields)} largest fields for processing") + top_100_fields = fields_df.head(100) + print(f" ✓ Selected {len(top_100_fields)} largest fields for processing") # Step 2: Load harvest dates from Excel print("\n[2/4] Loading harvest dates from Excel (latest complete seasons)...") - harvest_dates = load_harvest_dates_from_xlsx(harvest_xlsx, top_50_fields) + harvest_dates = load_harvest_dates_from_xlsx(harvest_xlsx, top_100_fields) if len(harvest_dates) == 0: print("✗ No harvest dates found in Excel file") @@ -237,7 +315,7 @@ def main(): print(f" ... 
and {len(harvest_dates) - 5} more") # Step 3: Generate RGB grids for each field - print("\n[3/4] Generating RGB validation grids...") + print("\n[3/4] Generating RGB validation grids (v3 dynamic)...") rgb_count = 0 for idx, (field_id, harvest_info) in enumerate(harvest_dates.items(), 1): diff --git a/python_app/create_rgb_evaluation_template.py b/python_app/create_rgb_evaluation_template.py new file mode 100644 index 0000000..dea094c --- /dev/null +++ b/python_app/create_rgb_evaluation_template.py @@ -0,0 +1,193 @@ +""" +Create an Excel evaluation template for RGB harvest date predictions. +Parses field names and dates directly from RGB image filenames. +""" +import os +import glob +import pandas as pd +from openpyxl import Workbook +from openpyxl.styles import Font, PatternFill, Alignment, Border, Side +from openpyxl.utils import get_column_letter +import re +from datetime import datetime + +# Configuration +BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) +RGB_DIR = os.path.join(BASE_DIR, "laravel_app", "storage", "app", "angata", "RGB") +OUTPUT_PATH = os.path.join(BASE_DIR, "laravel_app", "storage", "app", "angata", "RGB_Evaluation_Template.xlsx") + +# Evaluators +EVALUATORS = ["Joey", "Daniel", "Nik", "Dimitra", "Timon"] + +def parse_rgb_filenames(): + """ + Parse field names and harvest dates from RGB image filenames. 
+ Expected format: field_{field_id or name}_{YYYYMMDD}_harvest_xlsx_harvest_rgb.png + """ + fields_data = [] + + # Find all RGB images + rgb_files = glob.glob(os.path.join(RGB_DIR, "field_*.png")) + + for filepath in sorted(rgb_files): + filename = os.path.basename(filepath) + # Pattern: field_{field_id}_{YYYYMMDD}_harvest_xlsx_harvest_rgb.png + match = re.match(r"field_(.+?)_(\d{8})_harvest_xlsx_harvest_rgb\.png", filename) + + if match: + field_id = match.group(1) # e.g., "1000" or "91&92" + date_str = match.group(2) # e.g., "20250814" + + # Format date as YYYY-MM-DD + try: + harvest_date = datetime.strptime(date_str, "%Y%m%d").strftime("%Y-%m-%d") + except ValueError: + harvest_date = date_str + + fields_data.append({ + "field_id": field_id, + "harvest_date": harvest_date, + "filename": filename + }) + + # Natural sort by field_id: re.split puts digit runs at odd indices, compared as integers ("91&92" < "1000") + fields_data = sorted(fields_data, key=lambda x: [int(tok) if i % 2 else tok for i, tok in enumerate(re.split(r"([0-9]+)", x["field_id"]))]) + + return fields_data + +def create_evaluation_template(): + """Create the Excel evaluation template.""" + print("Loading field data from RGB images...") + fields_data = parse_rgb_filenames() + + if not fields_data: + print("ERROR: No RGB images found in", RGB_DIR) + return + + print(f"Found {len(fields_data)} RGB images") + + # Create workbook + wb = Workbook() + + # === SHEET 1: Evaluation Form === + ws_eval = wb.active + ws_eval.title = "Evaluation" + + # Define styles + header_fill = PatternFill(start_color="4472C4", end_color="4472C4", fill_type="solid") + header_font = Font(bold=True, color="FFFFFF", size=11) + border = Border( + left=Side(style='thin'), + right=Side(style='thin'), + top=Side(style='thin'), + bottom=Side(style='thin') + ) + center_align = Alignment(horizontal="center", vertical="center", wrap_text=True) + left_align = Alignment(horizontal="left", vertical="center", wrap_text=True) + + # Column headers + headers = ["Field ID", "Predicted Harvest Date"] + EVALUATORS + for col_idx,
header in enumerate(headers, start=1): + cell = ws_eval.cell(row=1, column=col_idx, value=header) + cell.fill = header_fill + cell.font = header_font + cell.alignment = center_align + cell.border = border + + # Set column widths + ws_eval.column_dimensions['A'].width = 15 + ws_eval.column_dimensions['B'].width = 20 + for col_idx in range(3, 3 + len(EVALUATORS)): + ws_eval.column_dimensions[get_column_letter(col_idx)].width = 12 + + # Add data rows + for row_idx, field in enumerate(fields_data, start=2): + ws_eval.cell(row=row_idx, column=1, value=field["field_id"]) + ws_eval.cell(row=row_idx, column=2, value=field["harvest_date"]) + + # Add empty cells for evaluator responses + for col_idx in range(3, 3 + len(EVALUATORS)): + cell = ws_eval.cell(row=row_idx, column=col_idx) + cell.alignment = center_align + cell.border = border + + # Light alternating row color + if row_idx % 2 == 0: + for col_idx in range(1, 3 + len(EVALUATORS)): + ws_eval.cell(row=row_idx, column=col_idx).fill = PatternFill( + start_color="D9E8F5", end_color="D9E8F5", fill_type="solid" + ) + + # Apply borders to all data cells + for col_idx in range(1, 3 + len(EVALUATORS)): + ws_eval.cell(row=row_idx, column=col_idx).border = border + if col_idx == 1 or col_idx == 2: + ws_eval.cell(row=row_idx, column=col_idx).alignment = left_align + + # Freeze panes + ws_eval.freeze_panes = "C2" + + # === SHEET 2: Instructions === + ws_instr = wb.create_sheet("Instructions") + + instr_content = [ + ["RGB Evaluation Instructions", ""], + ["", ""], + ["Overview:", ""], + ["The generated RGB images visualize the predicted harvest dates for each field.", ""], + ["The images are 3x3 grids showing satellite imagery from different dates", ""], + ["centered on the predicted harvest date (the center/red-box image).", ""], + ["", ""], + ["What to Evaluate:", ""], + ["For each field, determine if the predicted harvest date is CORRECT:", ""], + ["", ""], + ["Instructions for Reviewing:", ""], + ["1. 
Look at the CENTER image (red box) - this is the predicted harvest date", ""], + ["2. Compare to surrounding dates (before and after)", ""], + ["3. Look for change in field color/status:", ""], + [" • BEFORE: Field appears GREEN (growing/healthy crop)", ""], + [" • AT PREDICTED DATE: Field shows BROWN/YELLOW (soil visible, ripe for harvest)", ""], + [" • AFTER: Field continues BROWN (post-harvest or dormant)", ""], + ["", ""], + ["How to Enter Your Assessment:", ""], + ["Enter one of the following in your evaluator column for each field:", ""], + [" • YES = Predicted date is CORRECT (brown/harvest-ready at center date)", ""], + [" • NO = Predicted date is INCORRECT (not ready or already harvested)", ""], + [" • ? or MAYBE = Uncertain (cloudy images, unclear field status)", ""], + ["", ""], + ["Workflow Options:", ""], + ["Option A (Divide Work): Assign 2-3 fields per evaluator (rows divided by column)", ""], + ["Option B (Full Review): Each evaluator reviews all fields (everyone fills all rows)", ""], + ["Option C (Spot Check): Each evaluator samples 5-10 random fields", ""], + ["", ""], + ["Image Location:", ""], + ["All RGB images are in: /laravel_app/storage/app/angata/RGB/", ""], + ["Format: field_{FIELD_ID}_{YYYYMMDD}_harvest_xlsx_harvest_rgb.png", ""], + ["", ""], + ["Notes:", ""], + ["• Cloud cover may obscure ground truth - use best judgment", ""], + ["• Fields with multiple bands or irregular shapes: focus on dominant area", ""], + ["• Use context from previous/next dates to validate your assessment", ""], + ] + + # Add instructions text + for row_idx, row_data in enumerate(instr_content, start=1): + for col_idx, value in enumerate(row_data, start=1): + cell = ws_instr.cell(row=row_idx, column=col_idx, value=value) + if row_idx == 1: + cell.font = Font(bold=True, size=14) + elif any(keyword in str(value) for keyword in ["Overview:", "Instructions", "Workflow", "Image Location", "Notes"]): + cell.font = Font(bold=True, size=11) + cell.alignment =
Alignment(horizontal="left", vertical="top", wrap_text=True) + + ws_instr.column_dimensions['A'].width = 50 + ws_instr.column_dimensions['B'].width = 80 + + # Save workbook + wb.save(OUTPUT_PATH) + print(f"✓ Evaluation template created: {OUTPUT_PATH}") + print(f"✓ {len(fields_data)} fields added to evaluation form") + print(f"✓ Evaluators: {', '.join(EVALUATORS)}") + +if __name__ == "__main__": + create_evaluation_template() diff --git a/python_app/debug_all_tiles_for_date.py b/python_app/debug_all_tiles_for_date.py deleted file mode 100644 index ba07fe7..0000000 --- a/python_app/debug_all_tiles_for_date.py +++ /dev/null @@ -1,107 +0,0 @@ -#!/usr/bin/env python -""" -Debug script to find all tiles for a date and check which overlap with field boundary -""" - -import json -import rasterio -from rasterio.mask import mask -from pathlib import Path -import numpy as np -import shapely.geometry as shgeom -import pandas as pd - -# Load field 79 boundary -geojson_path = Path("laravel_app/storage/app/angata/Data/pivot.geojson") -field_id = "79" - -print(f"Loading field {field_id} from GeoJSON...") -with open(geojson_path) as f: - geojson_data = json.load(f) - -field_boundary = None -for feature in geojson_data.get('features', []): - props = feature.get('properties', {}) - if str(props.get('field', '')) == str(field_id): - geometry = feature.get('geometry') - if geometry: - geom_type = geometry.get('type', '') - coordinates = geometry.get('coordinates', []) - - if geom_type == 'MultiPolygon': - if coordinates and len(coordinates) > 0: - coords = coordinates[0][0] - field_boundary = shgeom.Polygon(coords) - elif geom_type == 'Polygon': - if coordinates and len(coordinates) > 0: - coords = coordinates[0] - field_boundary = shgeom.Polygon(coords) - break - -if field_boundary is None: - print(f"Field {field_id} not found") - exit(1) - -print(f"Field boundary bounds: {field_boundary.bounds}") -print(f"Field boundary area: {field_boundary.area}") - -# Find a specific date 
directory -tiff_dir = Path("laravel_app/storage/app/angata/merged_final_tif/5x5") -target_date = pd.Timestamp("2026-01-15") # Use a recent date that exists - -# Find tiles for that date -date_dirs = [] -for date_dir in tiff_dir.iterdir(): - if date_dir.is_dir(): - try: - dir_name = date_dir.name - date_str = dir_name.split('_')[0] - tile_date = pd.Timestamp(date_str) - if tile_date == target_date: - date_dirs.append(date_dir) - except: - pass - -if not date_dirs: - print(f"No tiles found for {target_date}") - exit(1) - -print(f"\nFound {len(date_dirs)} date directory(ies) for {target_date}") - -for date_dir in date_dirs: - print(f"\n=== Checking date directory: {date_dir.name} ===") - - tiles = list(date_dir.glob("*.tif")) - print(f"Found {len(tiles)} tiles in this directory") - - for tile_path in sorted(tiles): - try: - with rasterio.open(tile_path) as src: - tile_bounds = src.bounds - tile_geom = shgeom.box(*tile_bounds) - - intersects = field_boundary.intersects(tile_geom) - intersection = field_boundary.intersection(tile_geom) if intersects else None - intersection_area = intersection.area if intersection else 0 - - print(f"\n{tile_path.name}") - print(f" Tile bounds: {tile_bounds}") - print(f" Intersects field: {intersects}") - if intersects: - print(f" Intersection area: {intersection_area:.8f}") - - # Try to mask this tile - geom = shgeom.mapping(field_boundary) - try: - masked_data, _ = mask(src, [geom], crop=True, indexes=[1, 2, 3]) - print(f" ✓ Successfully masked! 
Shape: {masked_data.shape}") - - # Check the data in each band - for i, band_idx in enumerate([1, 2, 3]): - band_data = masked_data[i] - non_zero = (band_data != 0).sum() - print(f" Band {band_idx}: {non_zero} non-zero pixels out of {band_data.size}") - except ValueError as e: - print(f" ✗ Masking failed: {e}") - except Exception as e: - print(f" Error reading tile: {e}") diff --git a/python_app/debug_field_mask.py b/python_app/debug_field_mask.py deleted file mode 100644 index ce96700..0000000 --- a/python_app/debug_field_mask.py +++ /dev/null @@ -1,102 +0,0 @@ -#!/usr/bin/env python -""" -Debug script to diagnose why field boundary masking produces no data -""" - -import json -import rasterio -from rasterio.mask import mask -from pathlib import Path -import numpy as np -import shapely.geometry as shgeom - -# Load a sample field boundary -geojson_path = Path("laravel_app/storage/app/angata/Data/pivot.geojson") -field_id = "79" # A field that had issues - -print(f"Loading field {field_id} from GeoJSON...") -with open(geojson_path) as f: - geojson_data = json.load(f) - -field_boundary = None -for feature in geojson_data.get('features', []): - props = feature.get('properties', {}) - if str(props.get('field', '')) == str(field_id): - geometry = feature.get('geometry') - if geometry: - geom_type = geometry.get('type', '') - coordinates = geometry.get('coordinates', []) - - if geom_type == 'MultiPolygon': - if coordinates and len(coordinates) > 0: - coords = coordinates[0][0] - field_boundary = shgeom.Polygon(coords) - elif geom_type == 'Polygon': - if coordinates and len(coordinates) > 0: - coords = coordinates[0] - field_boundary = shgeom.Polygon(coords) - break - -if field_boundary is None: - print(f"Field {field_id} not found") - exit(1) - -print(f"Field boundary bounds: {field_boundary.bounds}") -print(f"Field boundary area: {field_boundary.area}") - -# Load a sample TIFF tile -tiff_dir = Path("laravel_app/storage/app/angata/merged_final_tif/5x5") -tile_file = None 
-for date_dir in sorted(tiff_dir.iterdir()): - if date_dir.is_dir(): - for tif in date_dir.glob("*.tif"): - if tif.stat().st_size > 12e6: - tile_file = tif - break - if tile_file: - break - -if not tile_file: - print("No suitable TIFF found") - exit(1) - -print(f"\nTesting with TIFF: {tile_file.name}") - -with rasterio.open(tile_file) as src: - print(f"TIFF Bounds: {src.bounds}") - print(f"TIFF CRS: {src.crs}") - - # Check if field boundary is within tile bounds - tile_box = shgeom.box(*src.bounds) - intersects = field_boundary.intersects(tile_box) - print(f"Field boundary intersects tile: {intersects}") - - if intersects: - intersection = field_boundary.intersection(tile_box) - print(f"Intersection area: {intersection.area}") - print(f"Intersection bounds: {intersection.bounds}") - - # Try to mask and see what we get - print("\nAttempting to mask...") - geom = shgeom.mapping(field_boundary) - try: - masked_data, _ = mask(src, [geom], crop=True, indexes=[1, 2, 3]) - print(f"Masked data shape: {masked_data.shape}") - print(f"Masked data dtype: {masked_data.dtype}") - - # Check the data - for i, band_idx in enumerate([1, 2, 3]): - band_data = masked_data[i] - print(f"\nBand {band_idx}:") - print(f" min: {np.nanmin(band_data):.6f}") - print(f" max: {np.nanmax(band_data):.6f}") - print(f" mean: {np.nanmean(band_data):.6f}") - print(f" % valid (non-zero): {(band_data != 0).sum() / band_data.size * 100:.2f}%") - print(f" % NaN: {np.isnan(band_data).sum() / band_data.size * 100:.2f}%") - - # Show sample values - valid_pixels = band_data[band_data != 0] - if len(valid_pixels) > 0: - print(f" Sample valid values: {valid_pixels[:10]}") - except ValueError as e: - print(f"Error during masking: {e}") diff --git a/python_app/debug_tiff_inspect.py b/python_app/debug_tiff_inspect.py deleted file mode 100644 index be51e5e..0000000 --- a/python_app/debug_tiff_inspect.py +++ /dev/null @@ -1,47 +0,0 @@ -#!/usr/bin/env python -""" -Debug script to inspect TIFF file structure and data 
-""" - -import rasterio -from pathlib import Path -import numpy as np - -# Pick a tile file to inspect -tiff_dir = Path("laravel_app/storage/app/angata/merged_final_tif/5x5") - -# Find first available tile -tile_file = None -for date_dir in sorted(tiff_dir.iterdir()): - if date_dir.is_dir(): - for tif in date_dir.glob("*.tif"): - if tif.stat().st_size > 12e6: # Skip empty files - tile_file = tif - break - if tile_file: - break - -if not tile_file: - print("No suitable TIFF files found") - exit(1) - -print(f"Inspecting: {tile_file.name}") -print("=" * 80) - -with rasterio.open(tile_file) as src: - print(f"Band count: {src.count}") - print(f"Data type: {src.dtypes[0]}") - print(f"Shape: {src.height} x {src.width}") - print(f"CRS: {src.crs}") - print(f"Bounds: {src.bounds}") - print() - - # Read each band - for band_idx in range(1, min(6, src.count + 1)): - data = src.read(band_idx) - print(f"Band {band_idx}:") - print(f" dtype: {data.dtype}") - print(f" range: {data.min():.6f} - {data.max():.6f}") - print(f" mean: {data.mean():.6f}") - print(f" % valid (non-zero): {(data != 0).sum() / data.size * 100:.1f}%") - print() diff --git a/python_app/download_planet_missing_dates.py b/python_app/download_planet_missing_dates.py index b6723b7..9e15d25 100644 --- a/python_app/download_planet_missing_dates.py +++ b/python_app/download_planet_missing_dates.py @@ -4,7 +4,7 @@ Purpose: Download Planet satellite data for missing dates only (skip existing fi Can be called from batch scripts or other Python scripts. 
Usage: - python download_planet_missing_dates.py --start 2022-01-01 --end 2025-12-15 --project angata + python download_planet_missing_dates.py --start 2026-01-17 --end 2026-12-20 --project angata python download_planet_missing_dates.py --start 2023-06-01 --end 2023-06-30 --project angata --dry-run Environment variables (alternative to CLI args): diff --git a/python_app/merge_ci_data.R b/python_app/merge_ci_data.R deleted file mode 100644 index 8c179bb..0000000 --- a/python_app/merge_ci_data.R +++ /dev/null @@ -1,29 +0,0 @@ -# Merge all CI RDS files into a single CSV -library(tidyverse) - -# Paths -ci_data_dir <- "r_app/experiments/ci_graph_exploration/CI_data" -output_csv <- "python_app/lstm_ci_data_combined.csv" - -# Find all RDS files -rds_files <- list.files(ci_data_dir, pattern = "\\.rds$", full.names = TRUE) -print(paste("Found", length(rds_files), "RDS files")) - -# Load and combine all files -combined_data <- tibble() - -for (file in rds_files) { - filename <- basename(file) - client_name <- sub("\\.rds$", "", filename) # Extract client name from filename - print(paste("Loading:", filename, "- Client:", client_name)) - data <- readRDS(file) - data$client <- client_name - combined_data <- bind_rows(combined_data, data) -} - -print(paste("Total rows:", nrow(combined_data))) -print(paste("Columns:", paste(names(combined_data), collapse = ", "))) - -# Write to CSV -write.csv(combined_data, output_csv, row.names = FALSE) -print(paste("✓ Saved to:", output_csv)) diff --git a/python_app/rgb_visualization.py b/python_app/rgb_visualization.py index c1fd2e0..ccb83f6 100644 --- a/python_app/rgb_visualization.py +++ b/python_app/rgb_visualization.py @@ -88,95 +88,116 @@ def load_field_boundaries(geojson_path, field_id): return None, None -def find_overlapping_tiles(target_date, tiff_dir, field_boundary, days_window=60): +def find_overlapping_tiles(target_date, tiff_dir, field_boundary, days_window=60, exclude_dates=None, debug=False): """ - Find all tile files for 
target_date (or closest date) that overlap with field_boundary. + Find tile files with actual data (not cloud-masked) for target_date or nearest date. - Tile files are organized in subdirectories by date: 5x5/YYYY-MM-DD_HH/*.tif + Searches by increasing distance from target date until finding tiles with data. + Avoids reusing dates in exclude_dates to ensure temporal diversity in grids. Args: target_date (pd.Timestamp): Target date to find tiles near tiff_dir (Path): Directory containing 5x5 date subdirectories field_boundary (shapely.Polygon): Field boundary for overlap detection days_window (int): Max days to search before/after target + exclude_dates (list): List of dates to skip (avoid repetition) + debug (bool): Enable detailed debugging output Returns: tuple: (list of tile paths, actual_date, days_diff) - list: tile paths that overlap field - pd.Timestamp: actual date of tiles found - int: days difference from target to actual date found """ target_date = pd.Timestamp(target_date) tiff_dir = Path(tiff_dir) + exclude_dates = exclude_dates or [] + exclude_dates = [pd.Timestamp(d) for d in exclude_dates] if not tiff_dir.exists(): + if debug: + print(f" [DEBUG] TIFF dir does not exist: {tiff_dir}") return [], None, None - # Find all date subdirectories - available_dates = {} # {date: ([tile file paths], actual_dir_name)} - min_size_mb = 12.0 # Empty files are ~11.56 MB + # Build map of all available dates + available_dates = {} + date_parse_errors = 0 for date_dir in tiff_dir.iterdir(): if not date_dir.is_dir(): continue - try: - # Parse date from directory name (YYYY-MM-DD or YYYY-MM-DD_HH) dir_name = date_dir.name - # Extract just the date part before underscore if it exists date_str = dir_name.split('_')[0] tile_date = pd.Timestamp(date_str) - days_diff = (tile_date - target_date).days - if abs(days_diff) > days_window: - continue - - # Find all .tif files in this date directory tile_files = [] for tile_file in date_dir.glob('*.tif'): - # Skip obviously empty 
files - file_size_mb = tile_file.stat().st_size / (1024 * 1024) - if file_size_mb >= min_size_mb: - tile_files.append(tile_file) + # Include ALL tiles, regardless of size + # Some tiles may be small but still contain valid data for specific fields + tile_files.append(tile_file) if tile_files: available_dates[tile_date] = (tile_files, dir_name) - except: - pass + except Exception as e: + date_parse_errors += 1 + if debug: + print(f" [DEBUG] Failed to parse date from {date_dir.name}: {e}") + + if debug: + print(f" [DEBUG] Found {len(available_dates)} dates with tile files, {date_parse_errors} parse errors") + print(f" [DEBUG] Date range: {min(available_dates.keys()).strftime('%Y-%m-%d') if available_dates else 'N/A'} to {max(available_dates.keys()).strftime('%Y-%m-%d') if available_dates else 'N/A'}") if not available_dates: return [], None, None - # Find closest date - closest_date = min(available_dates.keys(), key=lambda d: abs((d - target_date).days)) - days_diff = (closest_date - target_date).days - tiles, _ = available_dates[closest_date] + # Search dates by increasing distance from target, looking for tiles with actual data + sorted_dates = sorted(available_dates.keys(), key=lambda d: abs((d - target_date).days)) - # Filter tiles to only those that overlap field boundary - if rasterio is None or field_boundary is None: - # If rasterio not available, use all tiles (conservative approach) - return tiles, closest_date, days_diff + for search_date in sorted_dates: + # Skip dates already used (avoid temporal repetition) or outside the documented days_window + if search_date in exclude_dates or abs((search_date - target_date).days) > days_window: + continue + + tiles, dir_name = available_dates[search_date] + days_diff = (search_date - target_date).days + + # Try to find overlapping tiles at this date + overlapping_tiles = [] + tile_check_errors = 0 + + for tile_path in tiles: + try: + with rasterio.open(tile_path) as src: + tile_bounds = src.bounds + tile_geom = shgeom.box(*tile_bounds) + + # Debug first tile + if debug and
len(overlapping_tiles) == 0 and tile_check_errors == 0: + print(f" [DEBUG] First tile check for {tile_path.name}:") + print(f" Tile bounds: {tile_bounds}") + print(f" Tile CRS: {src.crs}") + print(f" Field bounds: {field_boundary.bounds}") + print(f" Field geom type: {field_boundary.geom_type}") + print(f" Intersects: {tile_geom.intersects(field_boundary)}") + + if tile_geom.intersects(field_boundary): + overlapping_tiles.append(tile_path) + except Exception as e: + tile_check_errors += 1 + if debug: + print(f" [DEBUG] Error checking tile {tile_path.name}: {e}") + + if debug: + print(f" [DEBUG] Date {search_date.strftime('%Y-%m-%d')}: {len(tiles)} tiles, {len(overlapping_tiles)} overlap field, {tile_check_errors} errors") + + if overlapping_tiles: + # Found overlapping tiles, return them + print(f" [FIND_TILES] Target: {target_date.strftime('%Y-%m-%d')}, Using: {search_date.strftime('%Y-%m-%d')} ({days_diff:+d}d), Tiles: {[Path(t).name for t in overlapping_tiles]}") + return overlapping_tiles, search_date, days_diff - overlapping_tiles = [] + # No overlapping tiles found at all + if debug: + print(f" [DEBUG] No overlapping tiles found for {target_date.strftime('%Y-%m-%d')} within {len(sorted_dates)} searched dates") - for tile_path in tiles: - try: - with rasterio.open(tile_path) as src: - # Get tile bounds - tile_bounds = src.bounds # (left, bottom, right, top) - tile_geom = shgeom.box(*tile_bounds) - - # Check if tile overlaps field - if tile_geom.intersects(field_boundary): - overlapping_tiles.append(tile_path) - except: - pass - - if not overlapping_tiles: - # No overlapping tiles found, return all tiles for the closest date - return tiles, closest_date, days_diff - - return overlapping_tiles, closest_date, days_diff + return [], None, None def load_and_clip_tiff_rgb(tiff_path, field_boundary, rgb_bands=(1, 2, 3)): @@ -211,44 +232,44 @@ def load_and_clip_tiff_rgb(tiff_path, field_boundary, rgb_bands=(1, 2, 3)): # For merged_final_tif: bands 1,2,3 are R,G,B 
bands_to_read = (1, 2, 3) - # Mask and read bands + # Mask and read bands - extract ONLY the specific field polygon geom = shgeom.mapping(field_boundary) try: masked_data, _ = mask(src, [geom], crop=True, indexes=list(bands_to_read)) - - # Stack RGB rgb = np.stack([masked_data[i] for i in range(3)], axis=-1) - - # Convert to float32 if not already - rgb = rgb.astype(np.float32) - - # Normalize to 0-1 range - # Data appears to be 8-bit (0-255 range) stored as float32 - # Check actual max value to determine normalization - max_val = np.nanmax(rgb) - if max_val > 0: - # If max is around 255 or less, assume 8-bit - if max_val <= 255: - rgb = rgb / 255.0 - # If max is around 65535, assume 16-bit - elif max_val <= 65535: - rgb = rgb / 65535.0 - # Otherwise divide by max to normalize - else: - rgb = rgb / max_val - - rgb = np.clip(rgb, 0, 1) - - # Check if result is all NaN - if np.all(np.isnan(rgb)): - return None - - # Replace any remaining NaN with 0 (cloud/invalid pixels) - rgb = np.nan_to_num(rgb, nan=0.0) - - return rgb - except ValueError: + except (ValueError, RuntimeError) as e: + # Mask failed - field doesn't overlap this tile or geometry issue + print(f" MASK ERROR on {Path(tiff_path).name}: {str(e)[:50]}") return None + + # Convert to float32 if not already + rgb = rgb.astype(np.float32) + + # Normalize to 0-1 range + # Data appears to be 8-bit (0-255 range) stored as float32 + # Check actual max value to determine normalization + max_val = np.nanmax(rgb) + if max_val > 0: + # If max is around 255 or less, assume 8-bit + if max_val <= 255: + rgb = rgb / 255.0 + # If max is around 65535, assume 16-bit + elif max_val <= 65535: + rgb = rgb / 65535.0 + # Otherwise divide by max to normalize + else: + rgb = rgb / max_val + + rgb = np.clip(rgb, 0, 1) + + # Check if result is all NaN + if np.all(np.isnan(rgb)): + return None + + # Replace any remaining NaN with 0 (cloud/invalid pixels) + rgb = np.nan_to_num(rgb, nan=0.0) + + return rgb except Exception as e: return 
None @@ -284,9 +305,16 @@ def load_and_composite_tiles_rgb(tile_paths, field_boundary): if len(rgb_arrays) == 1: composited = rgb_arrays[0] else: - # If multiple tiles, use max composite - stacked = np.stack(rgb_arrays, axis=0) - composited = np.max(stacked, axis=0) + # If multiple tiles, need to handle different shapes + # Find common shape or use max/min approach that handles variable sizes + try: + # Try to stack if same shape + stacked = np.stack(rgb_arrays, axis=0) + composited = np.max(stacked, axis=0) + except ValueError: + # Different shapes - use the largest (most complete) tile + # This happens when tiles are masked to different field areas + composited = max(rgb_arrays, key=lambda x: x.size) composited = composited.astype(np.float32) @@ -307,6 +335,26 @@ def load_and_composite_tiles_rgb(tile_paths, field_boundary): return None +def has_valid_rgb_data(rgb_data, threshold=0.05): + """ + Check if RGB image has actual data (not black/empty). + Returns True if max value > threshold (not all zeros/black). 
+ """ + if rgb_data is None: + return False + + try: + # Check if there's any variation in the RGB data + data_max = np.nanmax(rgb_data) + data_min = np.nanmin(rgb_data) + + # Image is valid if max > threshold AND there's variation + has_data = data_max > threshold and (data_max - data_min) > 0.01 + return has_data + except: + return False + + def create_temporal_rgb_grid(harvest_date, field_data, field_id, tiff_dir, field_boundary, title, output_dir, harvest_type='registered', model_name=None, harvest_index=None): """ @@ -334,91 +382,263 @@ def create_temporal_rgb_grid(harvest_date, field_data, field_id, tiff_dir, field """ harvest_date = pd.Timestamp(harvest_date) - # Target dates: 15 images at 7-day intervals (8 pre, 1 near, 6 post) - target_dates = [ - harvest_date - timedelta(days=56), # T-56d - harvest_date - timedelta(days=49), # T-49d - harvest_date - timedelta(days=42), # T-42d - harvest_date - timedelta(days=35), # T-35d - harvest_date - timedelta(days=28), # T-28d - harvest_date - timedelta(days=21), # T-21d - harvest_date - timedelta(days=14), # T-14d - harvest_date - timedelta(days=7), # T-7d - harvest_date, # T~0d (near harvest) - harvest_date + timedelta(days=7), # T+7d - harvest_date + timedelta(days=14), # T+14d - harvest_date + timedelta(days=21), # T+21d - harvest_date + timedelta(days=28), # T+28d - harvest_date + timedelta(days=35), # T+35d - harvest_date + timedelta(days=42), # T+42d - harvest_date + timedelta(days=56), # T+56d (Note: non-standard to fill 5th col in row 3) - ] + # Pre-allocate 15 image slots + rgb_images = [None] * 15 + days_offsets = [None] * 15 + actual_dates = [None] * 15 + used_dates = set() # Use set for efficient lookups - # Find TIFFs for each date - rgb_images = [] - days_offsets = [] - actual_dates = [] # Store actual dates of TIFFs found + # STEP 0: Debug - List all available dates + print(f" [STEP 0] Checking available TIFF dates in {tiff_dir}...") + available_dates = [] + if tiff_dir.exists(): + for date_folder in 
sorted(tiff_dir.iterdir()): + if date_folder.is_dir(): + try: + date_obj = datetime.strptime(date_folder.name, '%Y-%m-%d').date() + available_dates.append(date_obj) + except: + pass + print(f" Found {len(available_dates)} dates with data: {available_dates[:5]}... (showing first 5)") - for target in target_dates: - tile_paths, actual_date, days_diff = find_overlapping_tiles(target, tiff_dir, field_boundary, days_window=60) - - if not tile_paths or actual_date is None: - rgb_images.append(None) - days_offsets.append(None) - actual_dates.append(None) - print(f" ⚠ No tiles found within 60 days of {target.strftime('%Y-%m-%d')} with sufficient data") - continue - - rgb = load_and_composite_tiles_rgb(tile_paths, field_boundary) - rgb_images.append(rgb) - days_offsets.append(days_diff) - actual_dates.append(actual_date) - - if rgb is not None: - print(f" ✓ Loaded {len(tile_paths)} tile(s) for {actual_date.strftime('%Y-%m-%d')} ({days_diff:+d}d from target)") + # STEP 1: Find anchor image (closest to predicted harvest date) FIRST + # Search within ±14 days of predicted harvest date first, then expand if needed + print(f" [STEP 1] Finding anchor (closest to harvest {harvest_date.strftime('%Y-%m-%d')}, searching ±14 days)...") + anchor_tile_paths, anchor_date, anchor_days_diff = find_overlapping_tiles( + harvest_date, tiff_dir, field_boundary, days_window=14, exclude_dates=[], debug=False + ) + + anchor_rgb = None + anchor_idx = 8 # Position 8 is the center (T~0d / harvest date position) + failed_anchor_dates = [] # Track dates that failed validation + + if anchor_tile_paths and anchor_date: + anchor_rgb = load_and_composite_tiles_rgb(anchor_tile_paths, field_boundary) + if anchor_rgb is not None and has_valid_rgb_data(anchor_rgb): + rgb_images[anchor_idx] = anchor_rgb + days_offsets[anchor_idx] = 0 # Anchor is reference point + actual_dates[anchor_idx] = anchor_date + used_dates.add(anchor_date) + print(f" ✓ ANCHOR FOUND (±14d): {anchor_date.strftime('%Y-%m-%d')} 
({anchor_days_diff:+d}d from predicted harvest)") else: - print(f" ⚠ Loaded {len(tile_paths)} tile(s) but RGB data is None") + failed_anchor_dates.append(anchor_date) + print(f" ⚠ Found date {anchor_date.strftime('%Y-%m-%d')} within ±14d, but image has no valid data") + print(f" [RETRY] Expanding anchor search to ±60 days (excluding failed dates)...") + anchor_tile_paths, anchor_date, anchor_days_diff = find_overlapping_tiles( + harvest_date, tiff_dir, field_boundary, days_window=60, exclude_dates=set(failed_anchor_dates), debug=False + ) + if anchor_tile_paths and anchor_date: + anchor_rgb = load_and_composite_tiles_rgb(anchor_tile_paths, field_boundary) + if anchor_rgb is not None and has_valid_rgb_data(anchor_rgb): + rgb_images[anchor_idx] = anchor_rgb + days_offsets[anchor_idx] = 0 # Anchor is reference point + actual_dates[anchor_idx] = anchor_date + used_dates.add(anchor_date) + print(f" ✓ ANCHOR FOUND (±60d): {anchor_date.strftime('%Y-%m-%d')} ({anchor_days_diff:+d}d from predicted harvest)") + else: + failed_anchor_dates.append(anchor_date) + print(f" ✗ No valid anchor found even within ±60 days") + else: + print(f" ✗ No tiles found for any date within ±60 days") + else: + print(f" ⚠ No tiles found within ±14 days, expanding search...") + anchor_tile_paths, anchor_date, anchor_days_diff = find_overlapping_tiles( + harvest_date, tiff_dir, field_boundary, days_window=60, exclude_dates=[], debug=False + ) + if anchor_tile_paths and anchor_date: + anchor_rgb = load_and_composite_tiles_rgb(anchor_tile_paths, field_boundary) + if anchor_rgb is not None and has_valid_rgb_data(anchor_rgb): + rgb_images[anchor_idx] = anchor_rgb + days_offsets[anchor_idx] = 0 # Anchor is reference point + actual_dates[anchor_idx] = anchor_date + used_dates.add(anchor_date) + print(f" ✓ ANCHOR FOUND (±60d): {anchor_date.strftime('%Y-%m-%d')} ({anchor_days_diff:+d}d from predicted harvest)") + else: + print(f" ✗ No valid anchor found even within ±60 days") + else: + print(f" ✗ No tiles 
found for any date within ±60 days") + + # STEP 2: Dynamically collect images BEFORE anchor date + # Strategy: Go backwards from anchor with progressively larger search windows + # Start at 7 days, then try 10, 15, 20, 30+ days apart + print(f" [STEP 2] Collecting images BEFORE anchor (going backwards, flexible spacing)...") + before_positions = [7, 6, 5, 4, 3, 2, 1, 0] # Will fill in reverse order (7→0) + before_images = [] # (position, date, rgb, offset) + + pos_idx = 0 # Index into before_positions + last_found_date = anchor_date + + # Progressive search offsets: try these day offsets in order + search_offsets = [7, 10, 15, 20, 30, 40, 60, 90, 120] # Days before last found image + + while pos_idx < len(before_positions) and last_found_date.year >= 2024: + found_this_iteration = False + + # Try each offset until we find a valid image + for days_offset in search_offsets: + search_target_date = last_found_date - timedelta(days=days_offset) + + tile_paths, actual_date, days_diff = find_overlapping_tiles( + search_target_date, tiff_dir, field_boundary, days_window=60, exclude_dates=used_dates, debug=False + ) + + if tile_paths and actual_date: + rgb = load_and_composite_tiles_rgb(tile_paths, field_boundary) + if rgb is not None and has_valid_rgb_data(rgb): + # Found valid image! 
+ overall_max = np.nanmax(rgb) + overall_min = np.nanmin(rgb) + + offset_from_anchor = (actual_date - anchor_date).days + before_images.append((before_positions[pos_idx], actual_date, rgb, offset_from_anchor)) + used_dates.add(actual_date) + last_found_date = actual_date # Move backwards from this date + + print(f" ✓ Before[{pos_idx}]: {actual_date.strftime('%Y-%m-%d')} ({offset_from_anchor:+d}d from anchor) - RGB: {overall_min:.4f}-{overall_max:.4f}") + + pos_idx += 1 + found_this_iteration = True + break # Found one, stop trying larger offsets + + # If nothing found with any offset, we're done collecting before images + if not found_this_iteration: + break + + # Store collected before images + for pos, actual_date, rgb, offset in before_images: + rgb_images[pos] = rgb + actual_dates[pos] = actual_date + days_offsets[pos] = offset + + # STEP 3: Dynamically collect images AFTER anchor date + # Strategy: Go forwards from anchor with progressively larger search windows + # Start at 7 days, then try 10, 15, 20, 30+ days apart + print(f" [STEP 3] Collecting images AFTER anchor (going forwards, flexible spacing)...") + after_positions = [9, 10, 11, 12, 13, 14] # Will fill in order (9→14) + after_images = [] # (position, date, rgb, offset) + + pos_idx = 0 # Index into after_positions + last_found_date = anchor_date + max_search_date = anchor_date + timedelta(days=200) # Don't search beyond 200 days forward + + # Progressive search offsets: try these day offsets in order + search_offsets = [7, 10, 15, 20, 30, 40, 60, 90, 120] # Days after last found image + + while pos_idx < len(after_positions) and last_found_date < max_search_date: + found_this_iteration = False + + # Try each offset until we find a valid image + for days_offset in search_offsets: + search_target_date = last_found_date + timedelta(days=days_offset) + + # Don't search beyond max date + if search_target_date > max_search_date: + break + + tile_paths, actual_date, days_diff = find_overlapping_tiles( + 
search_target_date, tiff_dir, field_boundary, days_window=60, exclude_dates=used_dates, debug=False + ) + + if tile_paths and actual_date: + rgb = load_and_composite_tiles_rgb(tile_paths, field_boundary) + if rgb is not None and has_valid_rgb_data(rgb): + # Found valid image! + overall_max = np.nanmax(rgb) + overall_min = np.nanmin(rgb) + + offset_from_anchor = (actual_date - anchor_date).days + after_images.append((after_positions[pos_idx], actual_date, rgb, offset_from_anchor)) + used_dates.add(actual_date) + last_found_date = actual_date # Move forwards from this date + + print(f" ✓ After[{pos_idx}]: {actual_date.strftime('%Y-%m-%d')} ({offset_from_anchor:+d}d from anchor) - RGB: {overall_min:.4f}-{overall_max:.4f}") + + pos_idx += 1 + found_this_iteration = True + break # Found one, stop trying larger offsets + + # If nothing found with any offset, we're done collecting after images + if not found_this_iteration: + break + + # Store collected after images + for pos, actual_date, rgb, offset in after_images: + rgb_images[pos] = rgb + actual_dates[pos] = actual_date + days_offsets[pos] = offset # Create 5x3 grid plot (15 images) fig, axes = plt.subplots(3, 5, figsize=(25, 15)) - fig.suptitle(f'{title}\nField {field_id} - {harvest_type.upper()} Harvest: {harvest_date.strftime("%Y-%m-%d")}', + + # Build title with anchor offset information + anchor_offset_from_harvest = (actual_dates[8] - harvest_date).days if actual_dates[8] is not None else None + if anchor_offset_from_harvest is not None and anchor_offset_from_harvest != 0: + anchor_info = f"(Anchor: {actual_dates[8].strftime('%Y-%m-%d')}, {anchor_offset_from_harvest:+d}d from predicted harvest)" + else: + anchor_info = f"(Exact match with anchor: {actual_dates[8].strftime('%Y-%m-%d')})" if actual_dates[8] is not None else "" + + fig.suptitle(f'{title}\nField {field_id} - {harvest_type.upper()} Harvest: {harvest_date.strftime("%Y-%m-%d")} {anchor_info}', fontsize=16, fontweight='bold') # Grid positions (5 
columns, 3 rows = 15 images) positions = [ ('T-56d', 0, 0), ('T-49d', 0, 1), ('T-42d', 0, 2), ('T-35d', 0, 3), ('T-28d', 0, 4), - ('T-21d', 1, 0), ('T-14d', 1, 1), ('T-7d', 1, 2), ('T~0d', 1, 3), ('T+7d', 1, 4), + ('T-21d', 1, 0), ('T-14d', 1, 1), ('T-7d', 1, 2), ('HARVEST', 1, 3), ('T+7d', 1, 4), ('T+14d', 2, 0), ('T+21d', 2, 1), ('T+28d', 2, 2), ('T+35d', 2, 3), ('T+42d', 2, 4), ] - for idx, (label, row, col) in enumerate(positions): # All 15 images + for idx, (label, row, col) in enumerate(positions): ax = axes[row, col] if idx < len(rgb_images) and rgb_images[idx] is not None: rgb_data = rgb_images[idx] - # Debug: check data range - data_min, data_max = np.nanmin(rgb_data), np.nanmax(rgb_data) - print(f" DEBUG: {label} RGB range: {data_min:.4f} - {data_max:.4f}, shape: {rgb_data.shape}") + # Debug: check data range for ALL bands + data_min = np.nanmin(rgb_data) + data_max = np.nanmax(rgb_data) + data_mean = np.nanmean(rgb_data) + data_std = np.nanstd(rgb_data) + + # Check per-band stats + r_min, r_max, r_mean = np.nanmin(rgb_data[:,:,0]), np.nanmax(rgb_data[:,:,0]), np.nanmean(rgb_data[:,:,0]) + g_min, g_max, g_mean = np.nanmin(rgb_data[:,:,1]), np.nanmax(rgb_data[:,:,1]), np.nanmean(rgb_data[:,:,1]) + b_min, b_max, b_mean = np.nanmin(rgb_data[:,:,2]), np.nanmax(rgb_data[:,:,2]), np.nanmean(rgb_data[:,:,2]) + + print(f" DEBUG VALID {label} ({actual_dates[idx].strftime('%Y-%m-%d')}): RGB overall {data_min:.4f}-{data_max:.4f} (mean={data_mean:.4f}, std={data_std:.4f})") + print(f" R: {r_min:.4f}-{r_max:.4f} (μ={r_mean:.4f}), G: {g_min:.4f}-{g_max:.4f} (μ={g_mean:.4f}), B: {b_min:.4f}-{b_max:.4f} (μ={b_mean:.4f})") # Display with explicit vmin/vmax to handle normalized 0-1 data ax.imshow(rgb_data, vmin=0, vmax=1) - # Build title: label + offset + actual date - offset_str = f"{days_offsets[idx]:+d}d" if days_offsets[idx] is not None else "?" 
- date_str = actual_dates[idx].strftime('%Y-%m-%d') if actual_dates[idx] is not None else "No Date" - ax.set_title(f'{label}\n{offset_str}\n{date_str}', fontsize=10, fontweight='bold') + # Build title: show BOTH anchor offset AND harvest offset + if days_offsets[idx] is not None: + offset_from_anchor = days_offsets[idx] + offset_from_harvest = (actual_dates[idx] - harvest_date).days + + if idx == 8: # ANCHOR/HARVEST position + if offset_from_harvest == 0: + offset_str = f"HARVEST\n(Image: {actual_dates[idx].strftime('%Y-%m-%d')})" + else: + offset_str = f"HARVEST\n(Image: {actual_dates[idx].strftime('%Y-%m-%d')}, {offset_from_harvest:+d}d from predicted)" + else: + # Show both offsets: from anchor and from harvest + offset_str = f"{offset_from_anchor:+d}d from anchor\n{offset_from_harvest:+d}d from harvest\n{actual_dates[idx].strftime('%Y-%m-%d')}" + else: + offset_str = "No Data" - # Add red box around harvest date (T~0d at row=1, col=3) - if label == 'T~0d': + ax.set_title(offset_str, fontsize=9, fontweight='bold') + + # Add red box around the ANCHOR IMAGE (position 8 is harvest/anchor) + if idx == 8: # Position 8 is the anchor for spine in ax.spines.values(): spine.set_edgecolor('red') spine.set_linewidth(4) else: ax.text(0.5, 0.5, 'No Data', ha='center', va='center', fontsize=12, color='gray') - ax.set_title(label, fontsize=10) + ax.set_title('No Data', fontsize=10) + print(f" DEBUG EMPTY {label}: No image data collected") - # Add red box for T~0d even if no data - if label == 'T~0d': + # Add red box for anchor position even if no data + if idx == 8: # Position 8 is the anchor for spine in ax.spines.values(): spine.set_edgecolor('red') spine.set_linewidth(4) diff --git a/r_app/10_create_master_grid_and_split_tiffs.R b/r_app/10_create_master_grid_and_split_tiffs.R index 6279fb2..fdab65c 100644 --- a/r_app/10_create_master_grid_and_split_tiffs.R +++ b/r_app/10_create_master_grid_and_split_tiffs.R @@ -6,6 +6,8 @@ #' 2. 
Create master 5×5 grid covering all TIFFs #' 3. Split each daily TIFF into 25 tiles using the master grid #' 4. Save tiles in date-specific folders: daily_tiles/[DATE]/[DATE]_[TILE_ID].tif +#' & "C:\Program Files\R\R-4.4.3\bin\x64\Rscript.exe" r_app/10_create_master_grid_and_split_tiffs.R 2026-01-13 2026-01-18 + library(terra) library(sf) @@ -32,7 +34,7 @@ if (length(args) >= 2) { } PROJECT <- "angata" -TIFF_FOLDER <- file.path("..", "laravel_app", "storage", "app", PROJECT, "merged_tif_8b") +TIFF_FOLDER <- file.path("laravel_app", "storage", "app", PROJECT, "merged_tif_8b") # GRID SIZE CONFIGURATION - Change this to use different grid sizes # Options: 5x5 (25 tiles), 10x10 (100 tiles), etc. @@ -42,10 +44,10 @@ GRID_NCOLS <- 5 # Construct grid-specific subfolder path GRID_SIZE_LABEL <- paste0(GRID_NCOLS, "x", GRID_NROWS) -OUTPUT_FOLDER <- file.path("..", "laravel_app", "storage", "app", PROJECT, "daily_tiles_split", GRID_SIZE_LABEL) +OUTPUT_FOLDER <- file.path("laravel_app", "storage", "app", PROJECT, "daily_tiles_split", GRID_SIZE_LABEL) # Load field boundaries for overlap checking -GEOJSON_PATH <- file.path("..", "laravel_app", "storage", "app", PROJECT, "Data", "pivot.geojson") +GEOJSON_PATH <- file.path("laravel_app", "storage", "app", PROJECT, "Data", "pivot.geojson") cat("Combined: Create Master Grid (", GRID_SIZE_LABEL, ") and Split TIFFs into Tiles\n", sep = "") cat("Grid subfolder: daily_tiles_split/", GRID_SIZE_LABEL, "/\n", sep = "") diff --git a/r_app/20_ci_extraction.R b/r_app/20_ci_extraction.R index d9c3d37..ab82188 100644 --- a/r_app/20_ci_extraction.R +++ b/r_app/20_ci_extraction.R @@ -13,13 +13,13 @@ # # Examples: # # Angata 8-band data (with UDM cloud masking) -# & 'C:\Program Files\R\R-4.4.3\bin\x64\Rscript' r_app/02_ci_extraction.R 2026-01-02 7 angata merged_tif_8b +# & 'C:\Program Files\R\R-4.4.3\bin\x64\Rscript' r_app/20_ci_extraction.R 2026-01-02 7 angata merged_tif_8b # # # Aura 4-band data -# Rscript 02_ci_extraction.R 2025-11-26 7 aura 
merged_tif +# Rscript 20_ci_extraction.R 2025-11-26 7 aura merged_tif # # # Auto-detects and uses tiles if available: -# Rscript 02_ci_extraction.R 2026-01-02 7 angata (uses tiles if daily_tiles_split/ exists) +# Rscript 20_ci_extraction.R 2026-01-02 7 angata (uses tiles if daily_tiles_split/ exists) # 1. Load required packages # ----------------------- diff --git a/r_app/30_interpolate_growth_model.R b/r_app/30_interpolate_growth_model.R index 6707190..ed310e5 100644 --- a/r_app/30_interpolate_growth_model.R +++ b/r_app/30_interpolate_growth_model.R @@ -8,7 +8,7 @@ # # Usage: Rscript interpolate_growth_model.R [project_dir] # - project_dir: Project directory name (e.g., "chemba") -# & 'C:\Program Files\R\R-4.4.3\bin\x64\Rscript' r_app/03_interpolate_growth_model.R angata +# & 'C:\Program Files\R\R-4.4.3\bin\x64\Rscript' r_app/30_interpolate_growth_model.R angata # 1. Load required packages # ----------------------- diff --git a/r_app/80_calculate_kpis.R b/r_app/80_calculate_kpis.R index bef0764..ed2c330 100644 --- a/r_app/80_calculate_kpis.R +++ b/r_app/80_calculate_kpis.R @@ -186,8 +186,15 @@ main <- function() { end_date <- if (length(args) >= 1 && !is.na(args[1])) { as.Date(args[1]) } else if (exists("end_date", envir = .GlobalEnv)) { - # For recursive calls, use the end_date that was set in the global environment - get("end_date", envir = .GlobalEnv) + global_date <- get("end_date", envir = .GlobalEnv) + # Check if it's a valid Date with length > 0 + if (is.Date(global_date) && length(global_date) > 0 && !is.na(global_date)) { + global_date + } else if (exists("end_date_str", envir = .GlobalEnv)) { + as.Date(get("end_date_str", envir = .GlobalEnv)) + } else { + Sys.Date() + } } else if (exists("end_date_str", envir = .GlobalEnv)) { as.Date(get("end_date_str", envir = .GlobalEnv)) } else { @@ -210,10 +217,15 @@ main <- function() { 7 } + # Validate end_date is a proper Date object + if (is.null(end_date) || length(end_date) == 0 || !inherits(end_date, "Date")) 
{ + stop("ERROR: end_date is not valid. Got: ", class(end_date), " with length ", length(end_date)) + } + assign("project_dir", project_dir, envir = .GlobalEnv) assign("end_date_str", format(end_date, "%Y-%m-%d"), envir = .GlobalEnv) - message("\n" %+% strrep("=", 70)) + message("\n", strrep("=", 70)) message("80_CALCULATE_KPIs.R - CONSOLIDATED KPI CALCULATION") message(strrep("=", 70)) message("Date:", format(end_date, "%Y-%m-%d")) @@ -238,7 +250,7 @@ main <- function() { # ========== PER-FIELD ANALYSIS (SC-64) ========== - message("\n" %+% strrep("-", 70)) + message("\n", strrep("-", 70)) message("PHASE 1: PER-FIELD WEEKLY ANALYSIS (SC-64 ENHANCEMENTS)") message(strrep("-", 70)) @@ -694,9 +706,9 @@ main <- function() { # ========== FINAL SUMMARY ========== - cat("\n" %+% strrep("=", 70) %+% "\n") + cat("\n", strrep("=", 70), "\n") cat("80_CALCULATE_KPIs.R - COMPLETION SUMMARY\n") - cat(strrep("=", 70) %+% "\n") + cat(strrep("=", 70), "\n") cat("Per-field analysis fields analyzed:", nrow(field_analysis_df), "\n") cat("Excel export:", export_paths$excel, "\n") cat("RDS export:", export_paths$rds, "\n") diff --git a/r_app/run_full_pipeline.R b/r_app/run_full_pipeline.R index 657fec1..ae6ff14 100644 --- a/r_app/run_full_pipeline.R +++ b/r_app/run_full_pipeline.R @@ -1,31 +1,40 @@ # ============================================================================== # FULL PIPELINE RUNNER # ============================================================================== -# Runs scripts 02, 03, 04, 09 (KPIs), 09 (Weekly), and 10 (CI Report Simple) +# Mixed Python/R pipeline: +# 1. Python: Download Planet images +# 2. R 10: Create master grid and split TIFFs +# 3. R 20: CI Extraction +# 4. R 21: Convert CI RDS to CSV +# 5. R 30: Interpolate growth model +# 6. Python 31: Harvest imminent weekly +# 7. R 40: Mosaic creation +# 8. 
R 80: Calculate KPIs # # ============================================================================== # HOW TO RUN THIS SCRIPT # ============================================================================== # -# In PowerShell or Command Prompt: +# Run from the smartcane/ directory: # # Option 1 (Recommended - shows real-time output): -# Rscript run_full_pipeline.R +# Rscript r_app/run_full_pipeline.R # # Option 2 (Full path to Rscript - use & in PowerShell for paths with spaces): -# & "C:\Program Files\R\R-4.4.3\bin\x64\Rscript.exe" run_full_pipeline.R +# & "C:\Program Files\R\R-4.4.3\bin\x64\Rscript.exe" r_app/run_full_pipeline.R # # Option 3 (Batch mode - output saved to .Rout file): -# R CMD BATCH --vanilla run_full_pipeline.R +# R CMD BATCH --vanilla r_app/run_full_pipeline.R # # ============================================================================== # ============================================================================== # *** EDIT THESE VARIABLES *** -end_date <- "2025-12-24" # or specify: "2025-12-02", Sys.Date() +end_date <- as.Date("2026-01-27") # or specify: as.Date("2026-01-27") , Sys.Date() offset <- 7 # days to look back project_dir <- "angata" # project name: "esa", "aura", "angata", "chemba" data_source <- if (project_dir == "angata") "merged_tif_8b" else "merged_tif" +force_rerun <- FALSE # Set to TRUE to force all scripts to run even if outputs exist # *************************** # Format dates @@ -35,117 +44,350 @@ end_date_str <- format(as.Date(end_date), "%Y-%m-%d") pipeline_success <- TRUE # ============================================================================== -# SCRIPT 02: CI EXTRACTION +# INTELLIGENT CHECKING: What has already been completed? 
# ============================================================================== -cat("\n========== RUNNING SCRIPT 02: CI EXTRACTION ==========\n") -tryCatch({ - source("r_app/02_ci_extraction.R") - main() # Call the main function - cat("✓ Script 02 completed\n") -}, error = function(e) { - cat("✗ Error in Script 02:", e$message, "\n") - pipeline_success <<- FALSE -}) +cat("\n========== CHECKING EXISTING OUTPUTS ==========\n") -# ============================================================================== -# SCRIPT 03: INTERPOLATE GROWTH MODEL -# ============================================================================== -cat("\n========== RUNNING SCRIPT 03: INTERPOLATE GROWTH MODEL ==========\n") -tryCatch({ - source("r_app/03_interpolate_growth_model.R") - main() # Call the main function - cat("✓ Script 03 completed\n") -}, error = function(e) { - cat("✗ Error in Script 03:", e$message, "\n") - pipeline_success <<- FALSE -}) - -# ============================================================================== -# SCRIPT 04: MOSAIC CREATION -# ============================================================================== -cat("\n========== RUNNING SCRIPT 04: MOSAIC CREATION ==========\n") -tryCatch({ - source("r_app/04_mosaic_creation.R") - main() # Call the main function - cat("✓ Script 04 completed\n") -}, error = function(e) { - cat("✗ Error in Script 04:", e$message, "\n") - pipeline_success <<- FALSE -}) - -# ============================================================================== -# SCRIPT 09: CALCULATE KPIs -# ============================================================================== -cat("\n========== RUNNING SCRIPT 09: CALCULATE KPIs ==========\n") -tryCatch({ - source("r_app/09_calculate_kpis.R") - main() # Call the main function - cat("✓ Script 09 (KPIs) completed\n") -}, error = function(e) { - cat("✗ Error in Script 09 (KPIs):", e$message, "\n") - pipeline_success <<- FALSE -}) - -# 
============================================================================== -# SCRIPT 09: FIELD ANALYSIS WEEKLY -# ============================================================================== -# Only run field analysis weekly for angata project -if (project_dir == "angata") { - cat("\n========== RUNNING SCRIPT 09: FIELD ANALYSIS WEEKLY ==========\n") - tryCatch({ - source("r_app/09_field_analysis_weekly.R") - main() # Call the main function - cat("✓ Script 09 (Weekly) completed\n") - }, error = function(e) { - cat("✗ Error in Script 09 (Weekly):", e$message, "\n") - pipeline_success <<- FALSE - }) +# Check Script 10 outputs (tiled splits) +tiles_dir <- file.path("laravel_app", "storage", "app", project_dir, "daily_tiles_split", "5x5") +tiles_dates <- if (dir.exists(tiles_dir)) { + list.dirs(tiles_dir, full.names = FALSE, recursive = FALSE) } else { - cat("\n========== SKIPPING SCRIPT 09: FIELD ANALYSIS WEEKLY (only runs for angata) ==========\n") + c() } +cat(sprintf("Script 10: %d dates already tiled\n", length(tiles_dates))) + +# Check Script 20 outputs (CI extraction) - daily RDS files +ci_daily_dir <- file.path("laravel_app", "storage", "app", project_dir, "Data", "extracted_ci", "daily_vals") +ci_files <- if (dir.exists(ci_daily_dir)) { + list.files(ci_daily_dir, pattern = "\\.rds$") +} else { + c() +} +cat(sprintf("Script 20: %d CI daily RDS files exist\n", length(ci_files))) + +# Check Script 21 outputs (CSV conversion) - note: this gets overwritten each time, so we don't skip based on this +# Instead, check if CI RDS files exist - if they do, 21 should also run +# For now, just note that CSV is time-dependent, not a good skip indicator +cat("Script 21: CSV file exists but gets overwritten - will run if Script 20 runs\n") + +# Check Script 40 outputs (mosaics in weekly_tile_max/5x5) +mosaic_dir <- file.path("laravel_app", "storage", "app", project_dir, "weekly_tile_max", "5x5") +mosaic_files <- if (dir.exists(mosaic_dir)) { + list.files(mosaic_dir, 
pattern = "\\.tif$") +} else { + c() +} +cat(sprintf("Script 40: %d mosaic files exist\n", length(mosaic_files))) + +# Check Script 80 outputs (KPIs in reports/kpis/field_stats) +kpi_dir <- file.path("laravel_app", "storage", "app", project_dir, "reports", "kpis", "field_stats") +kpi_files <- if (dir.exists(kpi_dir)) { + list.files(kpi_dir, pattern = "\\.csv$|\\.json$") +} else { + c() +} +cat(sprintf("Script 80: %d KPI files exist\n", length(kpi_files))) + +# Determine if scripts should run based on outputs +skip_10 <- length(tiles_dates) > 0 && !force_rerun +skip_20 <- length(ci_files) > 0 && !force_rerun +skip_21 <- length(ci_files) > 0 && !force_rerun # Skip 21 if 20 is skipped +skip_40 <- length(mosaic_files) > 0 && !force_rerun +skip_80 <- FALSE # Always run Script 80 - it calculates KPIs for the current week (end_date), not historical weeks + +cat("\nSkipping decisions:\n") +cat(sprintf(" Script 10: %s\n", if(skip_10) "SKIP (tiles exist)" else "RUN")) +cat(sprintf(" Script 20: %s\n", if(skip_20) "SKIP (CI exists)" else "RUN")) +cat(sprintf(" Script 21: %s\n", if(skip_21) "SKIP (CI exists)" else "RUN")) +cat(sprintf(" Script 40: %s\n", if(skip_40) "SKIP (mosaics exist)" else "RUN")) +cat(sprintf(" Script 80: %s\n", if(skip_80) "SKIP (KPIs exist)" else "RUN")) # ============================================================================== -# SCRIPT 91: CI REPORT ANGATA (only for angata) +# PYTHON: DOWNLOAD PLANET IMAGES (MISSING DATES ONLY) # ============================================================================== -if (project_dir == "angata") { - cat("\n========== RUNNING SCRIPT 91: CI REPORT ANGATA ==========\n") - if (pipeline_success) { - tryCatch({ - rmarkdown::render("r_app/91_CI_report_with_kpis_Angata.Rmd", - output_format = "word_document", - params = list(data_dir = project_dir, report_date = end_date_str)) - cat("✓ Script 91 (Report) completed\n") - }, error = function(e) { - cat("✗ Error in Script 91 (Report):", e$message, "\n") - }) - } 
else { - cat("✗ Skipping Script 91: Previous pipeline scripts failed\n") +cat("\n========== DOWNLOADING PLANET IMAGES (MISSING DATES ONLY) ==========\n") +tryCatch({ + # Setup paths + base_path <- file.path("laravel_app", "storage", "app", project_dir) + merged_tifs_dir <- file.path(base_path, data_source) + + # Get existing dates from raw TIFFs + existing_tiff_files <- list.files(merged_tifs_dir, pattern = "^\\d{4}-\\d{2}-\\d{2}\\.tif$") + existing_tiff_dates <- sub("\\.tif$", "", existing_tiff_files) + + # Get existing dates from tiles (better indicator of completion) + existing_tile_dates <- tiles_dates + + # Find missing dates in the window + start_date <- end_date - offset + date_seq <- seq(start_date, end_date, by = "day") + target_dates <- format(date_seq, "%Y-%m-%d") + + # Only download if tiles don't exist yet (more reliable than checking raw TIFFs) + missing_dates <- target_dates[!(target_dates %in% existing_tile_dates)] + + cat(sprintf(" Existing tiled dates: %d\n", length(existing_tile_dates))) + cat(sprintf(" Missing dates in window: %d\n", length(missing_dates))) + + # Download each missing date + download_count <- 0 + download_failed <- 0 + + if (length(missing_dates) > 0) { + # Save current directory + original_dir <- getwd() + + # Change to python_app directory so relative paths work correctly + setwd("python_app") + + for (date_str in missing_dates) { + cmd <- sprintf('python 00_download_8band_pu_optimized.py "%s" --date "%s" --resolution 3 --cleanup', project_dir, date_str) + result <- system(cmd, ignore.stdout = FALSE, ignore.stderr = FALSE) + if (result == 0) { + download_count <- download_count + 1 + } else { + download_failed <- download_failed + 1 + } } + + # Change back to original directory + setwd(original_dir) + } + + cat(sprintf("✓ Downloaded %d dates, %d failed\n", download_count, download_failed)) + if (download_failed > 0) { + cat("⚠ Some downloads failed, but continuing pipeline\n") + } + + # Force Script 10 to run ONLY if downloads 
actually succeeded (not just attempted) + if (download_count > 0) { + skip_10 <- FALSE + } + +}, error = function(e) { + cat("✗ Error in planet download:", e$message, "\n") + pipeline_success <<- FALSE +}) + +# ============================================================================== +# SCRIPT 10: CREATE MASTER GRID AND SPLIT TIFFs +# ============================================================================== +if (pipeline_success && !skip_10) { + cat("\n========== RUNNING SCRIPT 10: CREATE MASTER GRID AND SPLIT TIFFs ==========\n") + tryCatch({ + # Set environment variables for the script (Script 10 uses these for filtering) + assign("PROJECT", project_dir, envir = .GlobalEnv) + + # Suppress verbose per-date output, show only summary + sink(nullfile()) + source("r_app/10_create_master_grid_and_split_tiffs.R") + sink() + + # Verify output + tiles_dir <- file.path("laravel_app", "storage", "app", project_dir, "daily_tiles_split", "5x5") + if (dir.exists(tiles_dir)) { + subdirs <- list.dirs(tiles_dir, full.names = FALSE, recursive = FALSE) + cat(sprintf("✓ Script 10 completed - created tiles for %d dates\n", length(subdirs))) + } else { + cat("✓ Script 10 completed\n") + } + }, error = function(e) { + sink() + cat("✗ Error in Script 10:", e$message, "\n") + pipeline_success <<- FALSE + }) +} else if (skip_10) { + cat("\n========== SKIPPING SCRIPT 10 (tiles already exist) ==========\n") } # ============================================================================== -# SCRIPT 10: CI REPORT (SIMPLE) +# SCRIPT 20: CI EXTRACTION # ============================================================================== -# Only run CI report for non-angata projects +if (pipeline_success && !skip_20) { + cat("\n========== RUNNING SCRIPT 20: CI EXTRACTION ==========\n") + tryCatch({ + # Set environment variables for the script + assign("end_date", end_date, envir = .GlobalEnv) + assign("offset", offset, envir = .GlobalEnv) + assign("project_dir", project_dir, envir = 
.GlobalEnv) + assign("data_source", data_source, envir = .GlobalEnv) + + source("r_app/20_ci_extraction.R") + main() # Call main() to execute the script with the environment variables + + # Verify CI output was created + ci_daily_dir <- file.path("laravel_app", "storage", "app", project_dir, "Data", "extracted_ci", "daily_vals") + if (dir.exists(ci_daily_dir)) { + files <- list.files(ci_daily_dir, pattern = "\\.rds$") + cat(sprintf("✓ Script 20 completed - generated %d CI files\n", length(files))) + } else { + cat("✓ Script 20 completed\n") + } + }, error = function(e) { + cat("✗ Error in Script 20:", e$message, "\n") + pipeline_success <<- FALSE + }) +} else if (skip_20) { + cat("\n========== SKIPPING SCRIPT 20 (CI already extracted) ==========\n") +} +# ============================================================================== +# SCRIPT 21: CONVERT CI RDS TO CSV +# ============================================================================== +if (pipeline_success && !skip_21) { + cat("\n========== RUNNING SCRIPT 21: CONVERT CI RDS TO CSV ==========\n") + tryCatch({ + # Set environment variables for the script + assign("end_date", end_date, envir = .GlobalEnv) + assign("offset", offset, envir = .GlobalEnv) + assign("project_dir", project_dir, envir = .GlobalEnv) + + source("r_app/21_convert_ci_rds_to_csv.R") + main() # Call main() to execute the script with the environment variables + + # Verify CSV output was created + ci_csv_path <- file.path("laravel_app", "storage", "app", project_dir, "ci_extracted") + if (dir.exists(ci_csv_path)) { + csv_files <- list.files(ci_csv_path, pattern = "\\.csv$") + cat(sprintf("✓ Script 21 completed - converted to %d CSV files\n", length(csv_files))) + } else { + cat("✓ Script 21 completed\n") + } + }, error = function(e) { + cat("✗ Error in Script 21:", e$message, "\n") + pipeline_success <<- FALSE + }) +} else if (skip_21) { + cat("\n========== SKIPPING SCRIPT 21 (CSV already created) ==========\n") +} -if (project_dir != 
"angata") { - cat("\n========== RUNNING SCRIPT 10: CI REPORT SIMPLE ==========\n") - if (pipeline_success) { - tryCatch({ - rmarkdown::render("r_app/10_CI_report_with_kpis_simple.Rmd", - output_format = "word_document", - params = list(data_dir = project_dir, report_date = end_date_str)) - cat("✓ Script 10 (Report) completed\n") - }, error = function(e) { - cat("✗ Error in Script 10 (Report):", e$message, "\n") - }) - } else { - cat("✗ Skipping Script 10: Previous pipeline scripts failed\n") - } - } else { - cat("\n========== SKIPPING SCRIPT 10: CI REPORT SIMPLE (not applicable for angata) ==========\n") - } +# ============================================================================== +# SCRIPT 30: INTERPOLATE GROWTH MODEL +# ============================================================================== +if (pipeline_success) { + cat("\n========== RUNNING SCRIPT 30: INTERPOLATE GROWTH MODEL ==========\n") + tryCatch({ + # Set environment variables for the script + assign("end_date", end_date, envir = .GlobalEnv) + assign("offset", offset, envir = .GlobalEnv) + assign("project_dir", project_dir, envir = .GlobalEnv) + assign("data_source", data_source, envir = .GlobalEnv) + + source("r_app/30_interpolate_growth_model.R") + main() # Call main() to execute the script with the environment variables + + # Verify interpolated output + growth_dir <- file.path("laravel_app", "storage", "app", project_dir, "growth_model_interpolated") + if (dir.exists(growth_dir)) { + files <- list.files(growth_dir, pattern = "\\.rds$|\\.csv$") + cat(sprintf("✓ Script 30 completed - generated %d growth model files\n", length(files))) + } else { + cat("✓ Script 30 completed\n") + } + }, error = function(e) { + cat("✗ Error in Script 30:", e$message, "\n") + pipeline_success <<- FALSE + }) +} + +# ============================================================================== +# PYTHON 31: HARVEST IMMINENT WEEKLY +# 
==============================================================================
+if (pipeline_success) {
+  # Python 31 runs as an external process. A failure in this step is reported
+  # as a warning only: pipeline_success is left untouched, so the R steps
+  # below still run.
+  cat("\n========== RUNNING PYTHON 31: HARVEST IMMINENT WEEKLY ==========\n")
+  tryCatch({
+    # Run Python script in pytorch_gpu conda environment
+    # Script expects positional project name (not --project flag)
+    # Run from smartcane root so conda can find the environment
+    # shQuote() keeps a project name that contains spaces or shell
+    # metacharacters as one inert argument instead of splicing it raw into
+    # the command line.
+    cmd <- sprintf('conda run -n pytorch_gpu python python_app/31_harvest_imminent_weekly.py %s',
+                   shQuote(project_dir))
+    cat("DEBUG: Running command:", cmd, "\n")
+    result <- system(cmd)
+
+    if (result == 0) {
+      # Verify harvest output - check for THIS WEEK's specific file
+      # NOTE(review): "%V" is the ISO 8601 week number while "%Y" is the
+      # calendar year; around New Year the two can disagree ("%G" is the
+      # matching ISO week-based year). Left unchanged because the file naming
+      # used by the Python script is not visible here - confirm which
+      # convention it follows before changing this.
+      current_week <- as.numeric(format(end_date, "%V"))
+      current_year <- as.numeric(format(end_date, "%Y"))
+      expected_file <- file.path("laravel_app", "storage", "app", project_dir, "reports", "kpis", "field_stats",
+                                sprintf("%s_harvest_imminent_week_%02d_%d.csv", project_dir, current_week, current_year))
+
+      if (file.exists(expected_file)) {
+        cat(sprintf("✓ Script 31 completed - generated harvest imminent file for week %02d\n", current_week))
+      } else {
+        # Exit code was 0 but this week's file is missing; treated as a soft
+        # outcome rather than a failure.
+        cat("✓ Script 31 completed (check if harvest.xlsx is available)\n")
+      }
+    } else {
+      cat("⚠ Script 31 completed with errors (check harvest.xlsx availability)\n")
+    }
+  }, error = function(e) {
+    # Restore the working directory only when a saved one exists. Without the
+    # guard, an undefined original_dir would raise inside this handler and
+    # hide the original error message.
+    if (exists("original_dir")) {
+      setwd(original_dir)
+    }
+    cat("⚠ Script 31 error:", e$message, "\n")
+  })
+}
+
+# ==============================================================================
+# SCRIPT 40: MOSAIC CREATION
+# ==============================================================================
+if (pipeline_success && !skip_40) {
+  cat("\n========== RUNNING SCRIPT 40: MOSAIC CREATION ==========\n")
+  tryCatch({
+    # Publish the run parameters as global variables before sourcing the script
+    assign("end_date", end_date, envir = .GlobalEnv)
+    assign("offset", offset, envir = .GlobalEnv)
+    assign("project_dir", project_dir, envir = .GlobalEnv)
+    assign("data_source", data_source, envir = .GlobalEnv)
+
+    source("r_app/40_mosaic_creation.R")
+    main()  # Call main() to execute the script with the environment variables
+
+    # Verify mosaic output
+    # The count covers every .tif present in the directory, not only files
+    # written by this run.
+    mosaic_dir <- file.path("laravel_app", "storage", "app", project_dir, "weekly_tile_max", "5x5")
+    if (dir.exists(mosaic_dir)) {
+      files <- list.files(mosaic_dir, pattern = "\\.tif$")
+      cat(sprintf("✓ Script 40 completed - generated %d mosaic files\n", length(files)))
+    } else {
+      cat("✓ Script 40 completed\n")
+    }
+  }, error = function(e) {
+    # A Script 40 failure is fatal for the pipeline: later steps are skipped.
+    cat("✗ Error in Script 40:", e$message, "\n")
+    pipeline_success <<- FALSE
+  })
+} else if (skip_40) {
+  cat("\n========== SKIPPING SCRIPT 40 (mosaics already created) ==========\n")
+}
+
+# ==============================================================================
+# SCRIPT 80: CALCULATE KPIs
+# ==============================================================================
+if (pipeline_success) {  # Always run Script 80 - it calculates KPIs for the current week
+  cat("\n========== RUNNING SCRIPT 80: CALCULATE KPIs ==========\n")
+  tryCatch({
+    # Set environment variables for the script (Script 80's main() uses these as fallbacks)
+    # NOTE: end_date is already a Date, just assign directly without as.Date()
+    assign("end_date", end_date, envir = .GlobalEnv)
+    assign("end_date_str", end_date_str, envir = .GlobalEnv)
+    assign("offset", offset, envir = .GlobalEnv)
+    assign("project_dir", project_dir, envir = .GlobalEnv)
+    assign("data_source", data_source, envir = .GlobalEnv)
+
+    source("r_app/80_calculate_kpis.R")
+    main()  # Call main() to execute the script with the environment variables
+
+    # Verify KPI output
+    # The count covers every .csv/.json present in the directory, not only
+    # files written by this run.
+    kpi_dir <- file.path("laravel_app", "storage", "app", project_dir, "reports", "kpis", "field_stats")
+    if (dir.exists(kpi_dir)) {
+      files <- list.files(kpi_dir, pattern = "\\.csv$|\\.json$")
+      cat(sprintf("✓ Script 80 completed - generated %d KPI files\n", length(files)))
+    } else {
+      cat("✓ Script 80 completed\n")
+    }
+  }, error = function(e) {
+    # Print the full condition object as well as the message to aid debugging.
+    cat("✗ Error in Script 80:", e$message, "\n")
+    cat("Full error:\n")
+    print(e)
+    pipeline_success <<- FALSE
+  })
+}
 
 # ==============================================================================
 # SUMMARY
@@ -154,4 +396,9 @@ cat("\n========== PIPELINE COMPLETE ==========\n")
 cat(sprintf("Project: %s\n", project_dir))
 cat(sprintf("End Date: %s\n", end_date_str))
 cat(sprintf("Offset: %d days\n", offset))
-cat("Scripts executed: 02, 03, 04, 09 (KPIs), 09 (Weekly), 10 (CI Report)\n")
+if (pipeline_success) {
+  cat("Status: ✓ All scripts completed successfully\n")
+} else {
+  cat("Status: ✗ Pipeline failed - check errors above\n")
+}
+cat("Pipeline sequence: Python Download → R 10 → R 20 → R 21 → R 30 → Python 31 → R 40 → R 80\n")