From 073f1567d96fae5f998780805bb63c87a17ee234 Mon Sep 17 00:00:00 2001 From: Timon Date: Mon, 16 Mar 2026 16:16:16 +0100 Subject: [PATCH] wip --- python_app/weather_api_comparison.py | 648 ++++++++++++++++++++++----- 1 file changed, 527 insertions(+), 121 deletions(-) diff --git a/python_app/weather_api_comparison.py b/python_app/weather_api_comparison.py index d8cb238..821de8e 100644 --- a/python_app/weather_api_comparison.py +++ b/python_app/weather_api_comparison.py @@ -12,34 +12,42 @@ ARCHIVE providers (no API key required): 4. NASA POWER — completely independent source (0.50°, global) FORECAST providers (no API key required): - 5. Open-Meteo Forecast — ensemble of NWP models (global) - 6. YR.no LocationForecast — Norwegian Met Institute (~10 days, global) + 5. Open-Meteo Forecast — deterministic NWP (global) + 6. Open-Meteo Ensemble — ECMWF IFS 51-member ensemble; gives probability bands + 7. YR.no LocationForecast — Norwegian Met Institute (~10 days, global) FORECAST providers (API key required — set in CONFIG below, leave "" to skip): - 7. OpenWeatherMap — free tier, 1000 calls/day - 8. WeatherAPI.com — free tier + 8. OpenWeatherMap — free tier, 1000 calls/day + 9. WeatherAPI.com — free tier OUTPUT: Plots saved to: weather_comparison_plots/ - Summary stats printed to console. + archive_.png — daily lines + multi-source spread band + agreement signal + archive_rolling_.png — 30-day rolling mean comparison (original style) + cumulative_.png — cumulative annual precipitation + vs_era5_.png — each provider vs ERA5 scatter (note: ERA5 is not ground truth) + pairwise_.png — pairwise Pearson r and RMSE heatmaps (unbiased) + wetdry_agreement_.png — % of days providers agree on wet vs dry + forecast_ensemble_.png — ensemble uncertainty bands + exceedance probability + forecast_.png — deterministic forecast bars (original style) Usage: python weather_api_comparison.py """ -import os -import json -import time import datetime -import requests -import pandas as pd +import time + import matplotlib matplotlib.use("Agg") -import matplotlib.pyplot as plt import matplotlib.dates as mdates +import matplotlib.pyplot as plt import numpy as np +import pandas as pd +import requests from pathlib import Path + # ============================================================ # CONFIG # ============================================================ @@ -74,7 +82,6 @@ def fetch_openmeteo_archive(lat, lon, start, end, model="era5"): """Open-Meteo ERA5 / ERA5-Land / CERRA archive. ERA5 is the default (no models param needed). ERA5-Land and CERRA use lowercase names. """ - # ERA5 is the default model — adding models param with wrong casing causes 400 model_suffix = "" if model == "era5" else f"&models={model}" url = ( f"https://archive-api.open-meteo.com/v1/archive" @@ -128,7 +135,6 @@ def fetch_nasa_power(lat, lon, start, end): def fetch_openmeteo_forecast(lat, lon, days=8): """Open-Meteo NWP forecast — default best_match model.""" - end = datetime.date.today() + datetime.timedelta(days=days) url = ( f"https://api.open-meteo.com/v1/forecast" f"?latitude={lat}&longitude={lon}" @@ -147,8 +153,57 @@ def fetch_openmeteo_forecast(lat, lon, days=8): return df +def fetch_openmeteo_ensemble_forecast(lat, lon, days=8): + """Open-Meteo ECMWF IFS ensemble forecast — 51 members at 0.4° resolution. + + Returns a DataFrame with daily percentile bands and exceedance probabilities: + p10, p25, p50 (median), p75, p90 — mm/day + prob_gt_1mm, prob_gt_5mm — % of members exceeding threshold + n_members — number of members in response + """ + url = ( + f"https://ensemble-api.open-meteo.com/v1/ensemble" + f"?latitude={lat}&longitude={lon}" + f"&daily=precipitation_sum" + f"&models=ecmwf_ifs04" + f"&forecast_days={days}" + f"&timezone=UTC" + ) + r = requests.get(url, timeout=60) + r.raise_for_status() + body = r.json() + daily = body["daily"] + dates = pd.to_datetime(daily["time"]) + + # Member columns are named like "precipitation_sum_member01", etc. + member_keys = [k for k in daily.keys() if k.startswith("precipitation_sum")] + if not member_keys: + print(" ⚠ No member columns found in ensemble response") + return None + + # Shape: (n_members, n_days) + members = np.array([daily[k] for k in member_keys], dtype=float) + members = np.where(members < 0, 0, members) # clip negatives + + df = pd.DataFrame({ + "date": dates, + "p10": np.nanpercentile(members, 10, axis=0), + "p25": np.nanpercentile(members, 25, axis=0), + "p50": np.nanpercentile(members, 50, axis=0), + "p75": np.nanpercentile(members, 75, axis=0), + "p90": np.nanpercentile(members, 90, axis=0), + "mean": np.nanmean(members, axis=0), + "prob_gt_1mm": np.mean(members > 1.0, axis=0) * 100, + "prob_gt_5mm": np.mean(members > 5.0, axis=0) * 100, + "n_members": members.shape[0], + }) + return df + + def fetch_yr_forecast(lat, lon): - """YR.no LocationForecast 2.0 — hourly precip aggregated to daily.""" + """YR.no LocationForecast 2.0 — hourly precip aggregated to daily. + Note: forecast-only service; no historical archive is available. + """ url = f"https://api.met.no/weatherapi/locationforecast/2.0/compact?lat={lat}&lon={lon}" headers = {"User-Agent": USER_AGENT} r = requests.get(url, headers=headers, timeout=30) @@ -234,88 +289,209 @@ def compare_stats(df, ref_col, other_col): "Bias": round(bias,2), "r": round(r,3)} +def compute_pairwise_stats(data_dict): + """Compute pairwise Pearson r and RMSE for all archive provider pairs. + + Returns: + names — list of provider names (same order as matrices) + r_matrix — (n x n) Pearson r array, NaN where not computable + rmse_matrix — (n x n) RMSE array (mm/day), 0 on diagonal + """ + providers = [(name, df) for name, df in data_dict.items() + if df is not None and len(df) > 5] + names = [p[0] for p in providers] + n = len(names) + + r_matrix = np.full((n, n), np.nan) + rmse_matrix = np.full((n, n), np.nan) + + for i in range(n): + r_matrix[i, i] = 1.0 + rmse_matrix[i, i] = 0.0 + _, df_i = providers[i] + for j in range(i + 1, n): + _, df_j = providers[j] + merged = df_i.merge(df_j, on="date", suffixes=("_i", "_j")) + valid = merged[["rain_mm_i", "rain_mm_j"]].dropna() + if len(valid) < 5: + continue + r = valid["rain_mm_i"].corr(valid["rain_mm_j"]) + rmse = ((valid["rain_mm_i"] - valid["rain_mm_j"]) ** 2).mean() ** 0.5 + r_matrix[i, j] = r_matrix[j, i] = round(r, 3) + rmse_matrix[i, j] = rmse_matrix[j, i] = round(rmse, 2) + + return names, r_matrix, rmse_matrix + + +def wetdry_agreement(data_dict, threshold=1.0): + """For each provider pair, compute % of days both-dry / both-wet / disagree. + + A day is 'dry' if rain_mm < threshold (default 1 mm). + Returns a list of dicts: pair, both_dry, both_wet, disagree, n. + """ + providers = [(name, df) for name, df in data_dict.items() + if df is not None and len(df) > 5] + results = [] + for i, (name_i, df_i) in enumerate(providers): + for j in range(i + 1, len(providers)): + name_j, df_j = providers[j] + merged = df_i.merge(df_j, on="date", suffixes=("_i", "_j")) + valid = merged[["rain_mm_i", "rain_mm_j"]].dropna() + if len(valid) < 5: + continue + dry_i = valid["rain_mm_i"] < threshold + dry_j = valid["rain_mm_j"] < threshold + both_dry = (dry_i & dry_j).mean() * 100 + both_wet = (~dry_i & ~dry_j).mean() * 100 + disagree = 100 - both_dry - both_wet + results.append({ + "pair": f"{name_i}\nvs\n{name_j}", + "both_dry": round(both_dry, 1), + "both_wet": round(both_wet, 1), + "disagree": round(disagree, 1), + "n": len(valid), + }) + return results + + # ============================================================ -# PLOTTING +# PLOTTING — ARCHIVE # ============================================================ -def plot_archive(data_dict, location_name, start, end, output_dir): - """Line plot of all archive providers for one location.""" - fig, axes = plt.subplots(2, 1, figsize=(14, 8), sharex=True) +ARCHIVE_COLORS = { + "ERA5 (Open-Meteo)": "#1f77b4", + "ERA5-Land (Open-Meteo)": "#ff7f0e", + "CERRA (Open-Meteo)": "#2ca02c", + "NASA POWER": "#d62728", +} - colors = { - "ERA5 (Open-Meteo)": "#1f77b4", - "ERA5-Land (Open-Meteo)": "#ff7f0e", - "CERRA (Open-Meteo)": "#2ca02c", - "NASA POWER": "#d62728", - } - # Top: daily raw +def _build_spread_frame(data_dict): + """Merge all valid archive providers onto a common date axis. + Returns a DataFrame with one column per provider + _mean and _std columns. + """ + valid = {name: df for name, df in data_dict.items() + if df is not None and len(df) > 0} + if not valid: + return None, [] + + merged = None + for name, df in valid.items(): + tmp = df[["date", "rain_mm"]].rename(columns={"rain_mm": name}) + merged = tmp if merged is None else merged.merge(tmp, on="date", how="outer") + + cols = list(valid.keys()) + merged[cols] = merged[cols].clip(lower=0) + merged["_mean"] = merged[cols].mean(axis=1) + merged["_std"] = merged[cols].std(axis=1) + return merged.sort_values("date").reset_index(drop=True), cols + + +def plot_archive_with_spread(data_dict, location_name, start, end, output_dir): + """Three-panel archive plot: + Top: Individual provider lines + multi-source mean±std shading + Middle: 30-day rolling mean + Bottom: Inter-source std (agreement signal — used to flag uncertain periods) + """ + spread, _ = _build_spread_frame(data_dict) + if spread is None: + return + + valid_providers = {name: df for name, df in data_dict.items() + if df is not None and len(df) > 0} + + fig, axes = plt.subplots(3, 1, figsize=(14, 12), sharex=True) + + # --- Top: daily raw + spread band --- ax1 = axes[0] - for name, df in data_dict.items(): - if df is not None and len(df) > 0: - ax1.plot(df["date"], df["rain_mm"], label=name, - color=colors.get(name), linewidth=0.8, alpha=0.85) + ax1.fill_between( + spread["date"], + (spread["_mean"] - spread["_std"]).clip(lower=0), + spread["_mean"] + spread["_std"], + alpha=0.15, color="gray", label="Multi-source ±1 std" + ) + ax1.plot(spread["date"], spread["_mean"], color="black", linewidth=1.0, + linestyle="--", alpha=0.6, label="Multi-source mean", zorder=5) + for name, df in valid_providers.items(): + ax1.plot(df["date"], df["rain_mm"], + label=name, color=ARCHIVE_COLORS.get(name, "gray"), + linewidth=0.7, alpha=0.85) ax1.set_ylabel("Precipitation (mm/day)") - ax1.set_title(f"{location_name} — Daily Precipitation (archive {start} → {end})") - ax1.legend(fontsize=8) + ax1.set_title( + f"{location_name} — Daily Precipitation with Multi-Source Spread\n" + f"{start} → {end} | Grey band = ±1 std across all providers" + ) + ax1.legend(fontsize=8, ncol=2) ax1.grid(True, alpha=0.3) - # Bottom: 30-day rolling mean + # --- Middle: 30-day rolling mean --- ax2 = axes[1] - for name, df in data_dict.items(): - if df is not None and len(df) > 0: - rolled = df.set_index("date")["rain_mm"].rolling(30, min_periods=15).mean() - ax2.plot(rolled.index, rolled.values, label=name, - color=colors.get(name), linewidth=1.5) + roll_mean = spread.set_index("date")["_mean"].rolling(30, min_periods=15).mean() + roll_std = spread.set_index("date")["_std"].rolling(30, min_periods=15).mean() + ax2.fill_between( + roll_mean.index, + (roll_mean - roll_std).clip(lower=0), + roll_mean + roll_std, + alpha=0.15, color="gray" + ) + for name, df in valid_providers.items(): + rolled = df.set_index("date")["rain_mm"].rolling(30, min_periods=15).mean() + ax2.plot(rolled.index, rolled.values, + label=name, color=ARCHIVE_COLORS.get(name, "gray"), linewidth=1.5) ax2.set_ylabel("30-day rolling mean (mm/day)") - ax2.set_xlabel("Date") ax2.legend(fontsize=8) ax2.grid(True, alpha=0.3) - ax2.xaxis.set_major_formatter(mdates.DateFormatter("%b %Y")) - fig.autofmt_xdate() + # --- Bottom: inter-source std (agreement signal) --- + ax3 = axes[2] + ax3.fill_between( + spread["date"], 0, spread["_std"], + color="purple", alpha=0.35, + label="Std across providers (higher = less agreement)" + ) + median_std = spread["_std"].median() + ax3.axhline(median_std, color="purple", linestyle=":", linewidth=1, + label=f"Median std = {median_std:.2f} mm/day") + ax3.set_ylabel("Std dev across\nproviders (mm)") + ax3.set_xlabel("Date") + ax3.legend(fontsize=8) + ax3.grid(True, alpha=0.3) + ax3.xaxis.set_major_formatter(mdates.DateFormatter("%b %Y")) + + fig.autofmt_xdate() plt.tight_layout() path = output_dir / f"archive_{location_name}.png" - plt.savefig(path, dpi=150) + plt.savefig(path, dpi=150, bbox_inches="tight") plt.close() print(f" Saved: {path}") -def plot_forecast(data_dict, location_name, output_dir): - """Bar chart comparing 7-day forecasts across providers.""" - fig, ax = plt.subplots(figsize=(12, 5)) +def plot_cumulative(data_dict, location_name, output_dir): + """Cumulative annual precipitation — most relevant for crop/irrigation context.""" + fig, ax = plt.subplots(figsize=(14, 5)) - providers = [(name, df) for name, df in data_dict.items() if df is not None and len(df) > 0] - n = len(providers) - if n == 0: - plt.close() - return + for name, df in data_dict.items(): + if df is None or len(df) == 0: + continue + s = df.set_index("date")["rain_mm"].sort_index().cumsum() + total = s.iloc[-1] + ax.plot(s.index, s.values, + label=f"{name} (total: {total:.0f} mm)", + color=ARCHIVE_COLORS.get(name, "gray"), linewidth=1.8) - # Collect all forecast dates - all_dates = sorted(set( - d for _, df in providers - for d in df["date"].dt.date.tolist() - )) - x = np.arange(len(all_dates)) - width = 0.8 / n - - cmap = matplotlib.colormaps["tab10"].resampled(n) - for i, (name, df) in enumerate(providers): - vals = [] - date_map = dict(zip(df["date"].dt.date, df["rain_mm"])) - for d in all_dates: - vals.append(date_map.get(d, 0.0)) - ax.bar(x + i * width, vals, width, label=name, color=cmap(i), alpha=0.85) - - ax.set_xticks(x + width * (n - 1) / 2) - ax.set_xticklabels([d.strftime("%d %b") for d in all_dates], rotation=45, ha="right") - ax.set_ylabel("Precipitation (mm/day)") - ax.set_title(f"{location_name} — 7-Day Forecast Comparison") + ax.set_ylabel("Cumulative precipitation (mm)") + ax.set_xlabel("Date") + ax.set_title( + f"{location_name} — Cumulative Annual Precipitation by Provider\n" + "Divergence = sources disagree on total seasonal rainfall" + ) ax.legend(fontsize=9) - ax.grid(True, axis="y", alpha=0.3) + ax.grid(True, alpha=0.3) + ax.xaxis.set_major_formatter(mdates.DateFormatter("%b %Y")) + fig.autofmt_xdate() plt.tight_layout() - path = output_dir / f"forecast_{location_name}.png" + path = output_dir / f"cumulative_{location_name}.png" plt.savefig(path, dpi=150) plt.close() print(f" Saved: {path}") @@ -324,6 +500,10 @@ def plot_forecast(data_dict, location_name, output_dir): def plot_vs_era5(data_dict, location_name, output_dir): """Each provider vs ERA5 reference: scatter + regression line. + NOTE: ERA5 is used as reference here for visual comparison only. + It is a reanalysis product, not a ground-truth measurement. + Use the pairwise heatmap (plot_pairwise_heatmap) for an unbiased comparison. + How to read: - Each panel shows one provider (y-axis) vs ERA5 (x-axis) for daily precip. - Points on the red diagonal = perfect agreement. @@ -344,26 +524,18 @@ def plot_vs_era5(data_dict, location_name, output_dir): n = len(others) fig, axes = plt.subplots(1, n, figsize=(5 * n, 5), squeeze=False) - colors = { - "ERA5-Land (Open-Meteo)": "#ff7f0e", - "CERRA (Open-Meteo)": "#2ca02c", - "NASA POWER": "#d62728", - } - for i, (name, df) in enumerate(others): ax = axes[0][i] merged = ref_df.merge(df, on="date", suffixes=("_ref", "_cmp")) valid = merged[["rain_mm_ref", "rain_mm_cmp"]].dropna() - color = colors.get(name, "steelblue") + color = ARCHIVE_COLORS.get(name, "steelblue") ax.scatter(valid["rain_mm_ref"], valid["rain_mm_cmp"], s=4, alpha=0.35, color=color) - # Perfect-agreement diagonal lim = max(valid.max().max(), 1) * 1.05 ax.plot([0, lim], [0, lim], "r--", linewidth=1, label="Perfect agreement") - # Linear regression line if len(valid) > 5: coeffs = np.polyfit(valid["rain_mm_ref"], valid["rain_mm_cmp"], 1) x_fit = np.linspace(0, lim, 100) @@ -382,8 +554,8 @@ def plot_vs_era5(data_dict, location_name, output_dir): ax.grid(True, alpha=0.3) fig.suptitle( - f"{location_name} — Daily Precip vs ERA5 Reference\n" - "Red dashed = perfect agreement. Points above line = provider wetter than ERA5.", + f"{location_name} — Daily Precip vs ERA5 (reference only — ERA5 is not ground truth)\n" + "Red dashed = perfect agreement. See pairwise heatmap for unbiased comparison.", fontsize=10 ) plt.tight_layout() @@ -393,38 +565,262 @@ def plot_vs_era5(data_dict, location_name, output_dir): print(f" Saved: {path}") -def plot_cumulative(data_dict, location_name, output_dir): - """Cumulative annual precipitation — most relevant for crop/irrigation context.""" - fig, ax = plt.subplots(figsize=(14, 5)) +def plot_pairwise_heatmap(data_dict, location_name, output_dir): + """Two heatmaps side by side: pairwise Pearson r and pairwise RMSE. - colors = { - "ERA5 (Open-Meteo)": "#1f77b4", - "ERA5-Land (Open-Meteo)": "#ff7f0e", - "CERRA (Open-Meteo)": "#2ca02c", - "NASA POWER": "#d62728", - } + No provider is treated as reference — all pairs compared equally. + High r + low RMSE = sources agree. Where they diverge, neither is ground truth. + """ + names, r_matrix, rmse_matrix = compute_pairwise_stats(data_dict) + if len(names) < 2: + return - for name, df in data_dict.items(): - if df is None or len(df) == 0: - continue - s = df.set_index("date")["rain_mm"].sort_index().cumsum() - total = s.iloc[-1] - ax.plot(s.index, s.values, label=f"{name} (total: {total:.0f} mm)", - color=colors.get(name), linewidth=1.8) + n = len(names) + short = [nm.replace(" (Open-Meteo)", "\n(OM)") for nm in names] - ax.set_ylabel("Cumulative precipitation (mm)") - ax.set_xlabel("Date") - ax.set_title( - f"{location_name} — Cumulative Annual Precipitation by Provider\n" - "Divergence = sources disagree on total seasonal rainfall" + fig, axes = plt.subplots(1, 2, figsize=(max(10, n * 2.5), max(5, n * 1.5 + 1))) + + # Pearson r + ax1 = axes[0] + im1 = ax1.imshow(r_matrix, vmin=0, vmax=1, cmap="YlGn", aspect="auto") + plt.colorbar(im1, ax=ax1, label="Pearson r") + ax1.set_xticks(range(n)); ax1.set_yticks(range(n)) + ax1.set_xticklabels(short, fontsize=8, rotation=45, ha="right") + ax1.set_yticklabels(short, fontsize=8) + for i in range(n): + for j in range(n): + val = r_matrix[i, j] + if not np.isnan(val): + text_color = "white" if val > 0.8 else "black" + ax1.text(j, i, f"{val:.2f}", ha="center", va="center", + fontsize=9, color=text_color) + ax1.set_title("Pairwise Pearson r\n(1.0 = perfect agreement, no reference)") + + # RMSE + ax2 = axes[1] + off_diag = rmse_matrix[rmse_matrix > 0] + vmax_rmse = float(np.nanmax(off_diag)) if len(off_diag) > 0 else 10 + im2 = ax2.imshow(rmse_matrix, vmin=0, vmax=vmax_rmse, cmap="YlOrRd_r", aspect="auto") + plt.colorbar(im2, ax=ax2, label="RMSE (mm/day)") + ax2.set_xticks(range(n)); ax2.set_yticks(range(n)) + ax2.set_xticklabels(short, fontsize=8, rotation=45, ha="right") + ax2.set_yticklabels(short, fontsize=8) + for i in range(n): + for j in range(n): + val = rmse_matrix[i, j] + if not np.isnan(val): + ax2.text(j, i, f"{val:.1f}", ha="center", va="center", fontsize=9) + ax2.set_title("Pairwise RMSE (mm/day)\n(0 = perfect agreement)") + + fig.suptitle( + f"{location_name} — Archive Provider Agreement (unbiased — no single reference)\n" + "Pairs with high r + low RMSE are consistent. Divergent pairs reveal dataset uncertainty.", + fontsize=10 ) - ax.legend(fontsize=9) - ax.grid(True, alpha=0.3) - ax.xaxis.set_major_formatter(mdates.DateFormatter("%b %Y")) + plt.tight_layout() + path = output_dir / f"pairwise_{location_name}.png" + plt.savefig(path, dpi=150, bbox_inches="tight") + plt.close() + print(f" Saved: {path}") + + +def plot_wetdry_agreement(data_dict, location_name, output_dir, threshold=1.0): + """Stacked bar chart: for each provider pair, % of days both-dry / both-wet / disagree. + + High 'disagree' % means one source says rain, the other says dry on the same day. + This is the most practically relevant divergence for SmartCane crop monitoring. + """ + results = wetdry_agreement(data_dict, threshold) + if not results: + return + + pairs = [r["pair"] for r in results] + both_dry = [r["both_dry"] for r in results] + both_wet = [r["both_wet"] for r in results] + disagree = [r["disagree"] for r in results] + + fig, ax = plt.subplots(figsize=(max(8, len(pairs) * 2.8), 6)) + x = np.arange(len(pairs)) + + ax.bar(x, both_dry, label=f"Both dry (<{threshold} mm)", color="#a8d5e2", edgecolor="white") + ax.bar(x, both_wet, bottom=both_dry, + label=f"Both wet (≥{threshold} mm)", color="#3a7ebf", edgecolor="white") + bottom2 = [d + w for d, w in zip(both_dry, both_wet)] + ax.bar(x, disagree, bottom=bottom2, + label="Disagree (one wet, one dry)", color="#e07b54", edgecolor="white") + + # Add % labels inside bars + for i, r in enumerate(results): + if r["both_dry"] > 4: + ax.text(i, r["both_dry"] / 2, f"{r['both_dry']:.0f}%", + ha="center", va="center", fontsize=7, color="black") + if r["both_wet"] > 4: + ax.text(i, r["both_dry"] + r["both_wet"] / 2, f"{r['both_wet']:.0f}%", + ha="center", va="center", fontsize=7, color="white") + if r["disagree"] > 4: + ax.text(i, bottom2[i] + r["disagree"] / 2, f"{r['disagree']:.0f}%", + ha="center", va="center", fontsize=7, color="white") + + ax.set_xticks(x) + ax.set_xticklabels(pairs, fontsize=8) + ax.set_ylim(0, 108) + ax.set_ylabel("% of days in archive period") + ax.set_title( + f"{location_name} — Provider Agreement on Wet vs Dry Days\n" + f"Threshold = {threshold} mm/day | Orange = providers disagree on whether it rained" + ) + ax.legend(loc="upper right", fontsize=9) + ax.grid(True, axis="y", alpha=0.3) + plt.tight_layout() + + path = output_dir / f"wetdry_agreement_{location_name}.png" + plt.savefig(path, dpi=150, bbox_inches="tight") + plt.close() + print(f" Saved: {path}") + + +# ============================================================ +# PLOTTING — FORECAST +# ============================================================ + +def plot_forecast_with_ensemble(forecast_data, ensemble_df, location_name, output_dir): + """Two-panel forecast comparison: + Top: Deterministic bars (YR.no, Open-Meteo) + ECMWF ensemble percentile shading + Bottom: Exceedance probabilities P(rain > 1 mm) and P(rain > 5 mm) + + Reading guide: + - Dark blue shading = where 50% of ensemble members agree (25th–75th %ile) + - Light blue shading = where 80% of ensemble members agree (10th–90th %ile) + - Bars = deterministic (single-value) forecast from each provider + - Bottom panel: at 50% on the dashed line = coin-flip uncertainty about rain event + """ + has_ensemble = ensemble_df is not None and len(ensemble_df) > 0 + det_providers = [(name, df) for name, df in forecast_data.items() + if df is not None and len(df) > 0] + + if not has_ensemble and not det_providers: + return + + fig, axes = plt.subplots( + 2, 1, figsize=(12, 8), sharex=True, + gridspec_kw={"height_ratios": [3, 1.5]} + ) + + # ---- TOP: deterministic bars + ensemble shading ---- + ax1 = axes[0] + + if has_ensemble: + ax1.fill_between( + ensemble_df["date"], ensemble_df["p10"], ensemble_df["p90"], + alpha=0.12, color="steelblue", label="Ensemble 10th–90th %ile" + ) + ax1.fill_between( + ensemble_df["date"], ensemble_df["p25"], ensemble_df["p75"], + alpha=0.28, color="steelblue", label="Ensemble 25th–75th %ile" + ) + ax1.plot( + ensemble_df["date"], ensemble_df["p50"], + color="steelblue", linewidth=1.8, linestyle="-", + label=f"Ensemble median ({int(ensemble_df['n_members'].iloc[0])} members)", + zorder=5 + ) + + det_colors = {"Open-Meteo Forecast": "#1f77b4", "YR.no": "#e8882a"} + bar_half = datetime.timedelta(hours=10) # offset so bars don't overlap + n_det = len(det_providers) + offsets = np.linspace(-bar_half.total_seconds() / 3600 * (n_det - 1) / 2, + bar_half.total_seconds() / 3600 * (n_det - 1) / 2, + n_det) + bar_width_days = 0.3 + + for k, (name, df) in enumerate(det_providers): + offset_td = datetime.timedelta(hours=offsets[k]) + shifted_dates = df["date"] + offset_td + ax1.bar( + shifted_dates, df["rain_mm"], + width=bar_width_days, + label=name, + color=det_colors.get(name, f"C{k}"), + alpha=0.75, + zorder=4, + ) + + ax1.set_ylabel("Precipitation (mm/day)") + ax1.set_title( + f"{location_name} — 7-Day Forecast\n" + "Shading = Open-Meteo ECMWF ensemble spread | Bars = deterministic forecasts" + ) + ax1.legend(fontsize=8, loc="upper right") + ax1.grid(True, axis="y", alpha=0.3) + ax1.set_ylim(bottom=0) + + # ---- BOTTOM: exceedance probabilities ---- + ax2 = axes[1] + if has_ensemble: + ax2.step( + ensemble_df["date"], ensemble_df["prob_gt_1mm"], + where="mid", color="steelblue", linewidth=2.0, + label="P(rain > 1 mm)" + ) + ax2.step( + ensemble_df["date"], ensemble_df["prob_gt_5mm"], + where="mid", color="darkblue", linewidth=2.0, linestyle="--", + label="P(rain > 5 mm)" + ) + ax2.fill_between( + ensemble_df["date"], 0, ensemble_df["prob_gt_5mm"], + step="mid", alpha=0.18, color="darkblue" + ) + ax2.axhline(50, color="gray", linestyle=":", linewidth=0.9, alpha=0.8, + label="50% (coin-flip)") + ax2.set_ylim(0, 108) + ax2.set_ylabel("Exceedance\nprobability (%)") + ax2.legend(fontsize=8, loc="upper right") + ax2.grid(True, alpha=0.3) + ax2.set_xlabel("Date") + ax2.xaxis.set_major_formatter(mdates.DateFormatter("%d %b")) + fig.autofmt_xdate() plt.tight_layout() - path = output_dir / f"cumulative_{location_name}.png" + path = output_dir / f"forecast_ensemble_{location_name}.png" + plt.savefig(path, dpi=150, bbox_inches="tight") + plt.close() + print(f" Saved: {path}") + + +def plot_forecast(data_dict, location_name, output_dir): + """Bar chart comparing deterministic 7-day forecasts (original style, kept for reference).""" + _, ax = plt.subplots(figsize=(12, 5)) + + providers = [(name, df) for name, df in data_dict.items() if df is not None and len(df) > 0] + n = len(providers) + if n == 0: + plt.close() + return + + all_dates = sorted(set( + d for _, df in providers + for d in df["date"].dt.date.tolist() + )) + x = np.arange(len(all_dates)) + width = 0.8 / n + + cmap = matplotlib.colormaps["tab10"].resampled(n) + for i, (name, df) in enumerate(providers): + date_map = dict(zip(df["date"].dt.date, df["rain_mm"])) + vals = [date_map.get(d, 0.0) for d in all_dates] + ax.bar(x + i * width, vals, width, label=name, color=cmap(i), alpha=0.85) + + ax.set_xticks(x + width * (n - 1) / 2) + ax.set_xticklabels([d.strftime("%d %b") for d in all_dates], rotation=45, ha="right") + ax.set_ylabel("Precipitation (mm/day)") + ax.set_title(f"{location_name} — 7-Day Deterministic Forecast Comparison") + ax.legend(fontsize=9) + ax.grid(True, axis="y", alpha=0.3) + plt.tight_layout() + + path = output_dir / f"forecast_{location_name}.png" plt.savefig(path, dpi=150) plt.close() print(f" Saved: {path}") @@ -489,28 +885,26 @@ def run_location(loc_name, lat, lon): print(f" ✗ NASA POWER failed: {e}") archive_data["NASA POWER"] = None - # Stats vs ERA5 reference - print("\n Stats vs ERA5 (Open-Meteo) reference:") - ref_df = archive_data.get("ERA5 (Open-Meteo)") - for name, df in archive_data.items(): - if name == "ERA5 (Open-Meteo)" or df is None: - continue - if ref_df is None: - continue - merged = ref_df.merge(df, on="date", suffixes=("_ref", "_cmp")) - stats = compare_stats(merged, "rain_mm_ref", "rain_mm_cmp") - print(f" {name:30s} n={stats['n']:4d} MAE={stats['MAE']} " - f"RMSE={stats['RMSE']} Bias={stats['Bias']} r={stats['r']}") + # Pairwise stats (no ERA5 reference bias) + print("\n Pairwise archive stats (Pearson r | RMSE mm/day | Bias mm/day):") + names, r_mat, rmse_mat = compute_pairwise_stats(archive_data) + for i in range(len(names)): + for j in range(i + 1, len(names)): + if not np.isnan(r_mat[i, j]): + print(f" {names[i]:30s} vs {names[j]:30s} " + f"r={r_mat[i,j]:.3f} RMSE={rmse_mat[i,j]:.2f} mm") - plot_archive(archive_data, loc_name, ARCHIVE_START, ARCHIVE_END, OUTPUT_DIR) + plot_archive_with_spread(archive_data, loc_name, ARCHIVE_START, ARCHIVE_END, OUTPUT_DIR) plot_cumulative(archive_data, loc_name, OUTPUT_DIR) plot_vs_era5(archive_data, loc_name, OUTPUT_DIR) + plot_pairwise_heatmap(archive_data, loc_name, OUTPUT_DIR) + plot_wetdry_agreement(archive_data, loc_name, OUTPUT_DIR) # ---- FORECAST ---- print("\n[Forecast]") forecast_data = {} - print(" Fetching Open-Meteo forecast...") + print(" Fetching Open-Meteo deterministic forecast...") try: forecast_data["Open-Meteo Forecast"] = fetch_openmeteo_forecast(lat, lon) print(f" → {len(forecast_data['Open-Meteo Forecast'])} days") @@ -518,6 +912,17 @@ def run_location(loc_name, lat, lon): print(f" ✗ Open-Meteo forecast failed: {e}") forecast_data["Open-Meteo Forecast"] = None + time.sleep(0.5) + print(" Fetching Open-Meteo ECMWF ensemble forecast...") + ensemble_df = None + try: + ensemble_df = fetch_openmeteo_ensemble_forecast(lat, lon) + if ensemble_df is not None: + print(f" → {len(ensemble_df)} days " + f"({int(ensemble_df['n_members'].iloc[0])} ensemble members)") + except Exception as e: + print(f" ✗ Open-Meteo ensemble failed: {e}") + time.sleep(0.5) print(" Fetching YR.no LocationForecast...") try: @@ -551,6 +956,7 @@ def run_location(loc_name, lat, lon): print(f" ✗ WeatherAPI.com failed: {e}") forecast_data["WeatherAPI.com"] = None + plot_forecast_with_ensemble(forecast_data, ensemble_df, loc_name, OUTPUT_DIR) plot_forecast(forecast_data, loc_name, OUTPUT_DIR)