SmartCane/python_app/weather_api_comparison.py

"""
weather_api_comparison.py
=========================
Compare daily precipitation from multiple free weather APIs across two locations:
  - Arnhem, Netherlands  (51.985°N, 5.899°E)  — European climate
  - Angata, Kenya        ( 1.330°S, 34.738°E) — tropical / sugarcane context

ARCHIVE providers (no API key required):
  1. Open-Meteo ERA5          — current SmartCane provider  (0.25°, global)
  2. Open-Meteo ERA5-Land     — higher resolution variant   (0.10°, global)
  3. Open-Meteo CERRA         — EU regional reanalysis      (0.05°, EU only)
  4. NASA POWER               — completely independent source (0.50°, global)

FORECAST providers (no API key required):
  5. Open-Meteo Forecast      — deterministic NWP (global)
  6. Open-Meteo Ensemble      — ECMWF IFS 51-member ensemble; gives probability bands
  7. YR.no LocationForecast   — Norwegian Met Institute (~10 days, global)

FORECAST providers (API key required — set in CONFIG below, leave "" to skip):
  8. OpenWeatherMap            — free tier, 1000 calls/day
  9. WeatherAPI.com            — free tier

OUTPUT:
  Plots saved to: weather_comparison_plots/
    archive_<loc>.png              — daily lines + multi-source spread band + agreement signal
    archive_rolling_<loc>.png      — 30-day rolling mean comparison (original style)
    cumulative_<loc>.png           — cumulative annual precipitation
    vs_era5_<loc>.png              — each provider vs ERA5 scatter (note: ERA5 is not ground truth)
    pairwise_<loc>.png             — pairwise Pearson r and RMSE heatmaps (unbiased)
    wetdry_agreement_<loc>.png     — % of days providers agree on wet vs dry
    forecast_ensemble_<loc>.png    — ensemble uncertainty bands + exceedance probability
    forecast_<loc>.png             — deterministic forecast bars (original style)

Usage:
  python weather_api_comparison.py
"""

import datetime
import time

import matplotlib
matplotlib.use("Agg")
import matplotlib.dates as mdates
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
from pathlib import Path


# ============================================================
# CONFIG
# ============================================================

LOCATIONS = {
    "Arnhem_NL": {"lat": 51.985, "lon": 5.899},
    "Angata_KE": {"lat": -1.330, "lon": 34.738},
}

# Archive: last 12 months
ARCHIVE_END   = datetime.date.today() - datetime.timedelta(days=2)  # ERA5 lags ~2 days
ARCHIVE_START = ARCHIVE_END - datetime.timedelta(days=365)

# Forecast: today + 7 days
FORECAST_START = datetime.date.today()
FORECAST_END   = FORECAST_START + datetime.timedelta(days=7)

# Optional API keys — leave "" to skip that provider
OPENWEATHERMAP_KEY = ""   # https://openweathermap.org/api
WEATHERAPI_KEY     = ""   # https://www.weatherapi.com/

OUTPUT_DIR = Path("weather_comparison_plots")
OUTPUT_DIR.mkdir(exist_ok=True)

USER_AGENT = "SmartCane-WeatherComparison/1.0 (research; contact via github)"

# ============================================================
# ARCHIVE FETCHERS
# ============================================================

def fetch_openmeteo_archive(lat, lon, start, end, model="era5"):
    """Open-Meteo ERA5 / ERA5-Land / CERRA archive.
    ERA5 is the default (no models param needed). ERA5-Land and CERRA use lowercase names.
    """
    model_suffix = "" if model == "era5" else f"&models={model}"
    url = (
        f"https://archive-api.open-meteo.com/v1/archive"
        f"?latitude={lat}&longitude={lon}"
        f"&daily=precipitation_sum"
        f"&start_date={start}&end_date={end}"
        f"{model_suffix}"
        f"&timezone=UTC"
    )
    r = requests.get(url, timeout=30)
    r.raise_for_status()
    body = r.json()
    df = pd.DataFrame({
        "date":    pd.to_datetime(body["daily"]["time"]),
        "rain_mm": body["daily"]["precipitation_sum"],
    })
    df["rain_mm"] = pd.to_numeric(df["rain_mm"], errors="coerce").clip(lower=0).fillna(0)
    # ERA5-Land sometimes returns values in meters (Open-Meteo API quirk).
    # Auto-detect: if annual total < 50mm for any non-polar location, assume m → convert.
    if df["rain_mm"].sum() < 50 and len(df) > 30:
        df["rain_mm"] = df["rain_mm"] * 1000
        print(f"    ⚠ Unit auto-converted m→mm (values were implausibly small)")
    return df


def fetch_nasa_power(lat, lon, start, end):
    """NASA POWER — daily PRECTOTCORR (precipitation corrected), 0.5° grid."""
    url = (
        "https://power.larc.nasa.gov/api/temporal/daily/point"
        f"?parameters=PRECTOTCORR"
        f"&community=AG"
        f"&longitude={lon}&latitude={lat}"
        f"&start={start.strftime('%Y%m%d')}&end={end.strftime('%Y%m%d')}"
        f"&format=JSON"
    )
    r = requests.get(url, timeout=60)
    r.raise_for_status()
    body = r.json()
    raw = body["properties"]["parameter"]["PRECTOTCORR"]
    df = pd.DataFrame([
        {"date": pd.to_datetime(k, format="%Y%m%d"), "rain_mm": max(v, 0)}
        for k, v in raw.items()
        if v != -999  # NASA POWER fill value
    ])
    return df.sort_values("date").reset_index(drop=True)


# ============================================================
# FORECAST FETCHERS
# ============================================================

def fetch_openmeteo_forecast(lat, lon, days=8):
    """Open-Meteo NWP forecast — default best_match model."""
    url = (
        f"https://api.open-meteo.com/v1/forecast"
        f"?latitude={lat}&longitude={lon}"
        f"&daily=precipitation_sum"
        f"&forecast_days={days + 1}"
        f"&timezone=UTC"
    )
    r = requests.get(url, timeout=30)
    r.raise_for_status()
    body = r.json()
    df = pd.DataFrame({
        "date":    pd.to_datetime(body["daily"]["time"]),
        "rain_mm": body["daily"]["precipitation_sum"],
    })
    df["rain_mm"] = pd.to_numeric(df["rain_mm"], errors="coerce").fillna(0)
    return df


def fetch_openmeteo_ensemble_forecast(lat, lon, days=8):
    """Open-Meteo ECMWF IFS ensemble forecast — 51 members at 0.4° resolution.

    Returns a DataFrame with daily percentile bands and exceedance probabilities:
      p10, p25, p50 (median), p75, p90  — mm/day
      prob_gt_1mm, prob_gt_5mm          — % of members exceeding threshold
      n_members                          — number of members in response
    """
    url = (
        f"https://ensemble-api.open-meteo.com/v1/ensemble"
        f"?latitude={lat}&longitude={lon}"
        f"&daily=precipitation_sum"
        f"&models=ecmwf_ifs04"
        f"&forecast_days={days}"
        f"&timezone=UTC"
    )
    r = requests.get(url, timeout=60)
    r.raise_for_status()
    body = r.json()
    daily = body["daily"]
    dates = pd.to_datetime(daily["time"])

    # Member columns are named like "precipitation_sum_member01", etc.
    member_keys = [k for k in daily.keys() if k.startswith("precipitation_sum")]
    if not member_keys:
        print("    ⚠ No member columns found in ensemble response")
        return None

    # Shape: (n_members, n_days)
    members = np.array([daily[k] for k in member_keys], dtype=float)
    members = np.where(members < 0, 0, members)  # clip negatives

    df = pd.DataFrame({
        "date":        dates,
        "p10":         np.nanpercentile(members, 10, axis=0),
        "p25":         np.nanpercentile(members, 25, axis=0),
        "p50":         np.nanpercentile(members, 50, axis=0),
        "p75":         np.nanpercentile(members, 75, axis=0),
        "p90":         np.nanpercentile(members, 90, axis=0),
        "mean":        np.nanmean(members, axis=0),
        "prob_gt_1mm": np.mean(members > 1.0, axis=0) * 100,
        "prob_gt_5mm": np.mean(members > 5.0, axis=0) * 100,
        "n_members":   members.shape[0],
    })
    return df


def fetch_yr_forecast(lat, lon):
    """YR.no LocationForecast 2.0 — hourly precip aggregated to daily.
    Note: forecast-only service; no historical archive is available.
    """
    url = f"https://api.met.no/weatherapi/locationforecast/2.0/compact?lat={lat}&lon={lon}"
    headers = {"User-Agent": USER_AGENT}
    r = requests.get(url, headers=headers, timeout=30)
    r.raise_for_status()
    body = r.json()

    records = []
    for entry in body["properties"]["timeseries"]:
        ts   = pd.to_datetime(entry["time"])
        data = entry["data"]
        precip = 0.0
        if "next_1_hours" in data:
            precip = data["next_1_hours"]["details"].get("precipitation_amount", 0.0)
        elif "next_6_hours" in data:
            precip = data["next_6_hours"]["details"].get("precipitation_amount", 0.0) / 6
        records.append({"datetime": ts, "precip_hour": precip})

    hourly = pd.DataFrame(records)
    hourly["date"] = hourly["datetime"].dt.date
    daily = (
        hourly.groupby("date")["precip_hour"]
        .sum()
        .reset_index()
        .rename(columns={"precip_hour": "rain_mm"})
    )
    daily["date"] = pd.to_datetime(daily["date"])
    return daily


def fetch_openweathermap_forecast(lat, lon, api_key):
    """OpenWeatherMap One Call 3.0 — daily forecast (needs paid/free key)."""
    url = (
        f"https://api.openweathermap.org/data/3.0/onecall"
        f"?lat={lat}&lon={lon}"
        f"&exclude=current,minutely,hourly,alerts"
        f"&appid={api_key}&units=metric"
    )
    r = requests.get(url, timeout=30)
    r.raise_for_status()
    body = r.json()
    records = []
    for day in body.get("daily", []):
        records.append({
            "date":    pd.to_datetime(day["dt"], unit="s").normalize(),
            "rain_mm": day.get("rain", 0.0),
        })
    return pd.DataFrame(records)


def fetch_weatherapi_forecast(lat, lon, api_key, days=7):
    """WeatherAPI.com free forecast (up to 3 days on free tier, 14 on paid)."""
    url = (
        f"https://api.weatherapi.com/v1/forecast.json"
        f"?key={api_key}&q={lat},{lon}&days={days}&aqi=no&alerts=no"
    )
    r = requests.get(url, timeout=30)
    r.raise_for_status()
    body = r.json()
    records = []
    for day in body["forecast"]["forecastday"]:
        records.append({
            "date":    pd.to_datetime(day["date"]),
            "rain_mm": day["day"].get("totalprecip_mm", 0.0),
        })
    return pd.DataFrame(records)


# ============================================================
# STATS
# ============================================================

def compare_stats(df, ref_col, other_col):
    """Compute MAE, RMSE, bias, Pearson r between two columns."""
    valid = df[[ref_col, other_col]].dropna()
    if len(valid) < 5:
        return {"n": len(valid), "MAE": None, "RMSE": None, "Bias": None, "r": None}
    diff  = valid[other_col] - valid[ref_col]
    mae   = diff.abs().mean()
    rmse  = (diff**2).mean()**0.5
    bias  = diff.mean()
    r     = valid[ref_col].corr(valid[other_col])
    return {"n": len(valid), "MAE": round(mae,2), "RMSE": round(rmse,2),
            "Bias": round(bias,2), "r": round(r,3)}


def compute_pairwise_stats(data_dict):
    """Compute pairwise Pearson r and RMSE for all archive provider pairs.

    Returns:
        names       — list of provider names (same order as matrices)
        r_matrix    — (n x n) Pearson r array, NaN where not computable
        rmse_matrix — (n x n) RMSE array (mm/day), 0 on diagonal
    """
    providers = [(name, df) for name, df in data_dict.items()
                 if df is not None and len(df) > 5]
    names = [p[0] for p in providers]
    n = len(names)

    r_matrix    = np.full((n, n), np.nan)
    rmse_matrix = np.full((n, n), np.nan)

    for i in range(n):
        r_matrix[i, i]    = 1.0
        rmse_matrix[i, i] = 0.0
        _, df_i = providers[i]
        for j in range(i + 1, n):
            _, df_j = providers[j]
            merged = df_i.merge(df_j, on="date", suffixes=("_i", "_j"))
            valid = merged[["rain_mm_i", "rain_mm_j"]].dropna()
            if len(valid) < 5:
                continue
            r    = valid["rain_mm_i"].corr(valid["rain_mm_j"])
            rmse = ((valid["rain_mm_i"] - valid["rain_mm_j"]) ** 2).mean() ** 0.5
            r_matrix[i, j]    = r_matrix[j, i]    = round(r, 3)
            rmse_matrix[i, j] = rmse_matrix[j, i] = round(rmse, 2)

    return names, r_matrix, rmse_matrix


def wetdry_agreement(data_dict, threshold=1.0):
    """For each provider pair, compute % of days both-dry / both-wet / disagree.

    A day is 'dry' if rain_mm < threshold (default 1 mm).
    Returns a list of dicts: pair, both_dry, both_wet, disagree, n.
    """
    providers = [(name, df) for name, df in data_dict.items()
                 if df is not None and len(df) > 5]
    results = []
    for i, (name_i, df_i) in enumerate(providers):
        for j in range(i + 1, len(providers)):
            name_j, df_j = providers[j]
            merged = df_i.merge(df_j, on="date", suffixes=("_i", "_j"))
            valid = merged[["rain_mm_i", "rain_mm_j"]].dropna()
            if len(valid) < 5:
                continue
            dry_i = valid["rain_mm_i"] < threshold
            dry_j = valid["rain_mm_j"] < threshold
            both_dry = (dry_i  &  dry_j).mean() * 100
            both_wet = (~dry_i & ~dry_j).mean() * 100
            disagree = 100 - both_dry - both_wet
            results.append({
                "pair":      f"{name_i}\nvs\n{name_j}",
                "both_dry":  round(both_dry, 1),
                "both_wet":  round(both_wet, 1),
                "disagree":  round(disagree, 1),
                "n":         len(valid),
            })
    return results


# ============================================================
# PLOTTING — ARCHIVE
# ============================================================

ARCHIVE_COLORS = {
    "ERA5 (Open-Meteo)":      "#1f77b4",
    "ERA5-Land (Open-Meteo)": "#ff7f0e",
    "CERRA (Open-Meteo)":     "#2ca02c",
    "NASA POWER":             "#d62728",
}


def _build_spread_frame(data_dict):
    """Merge all valid archive providers onto a common date axis.
    Returns a DataFrame with one column per provider + _mean and _std columns.
    """
    valid = {name: df for name, df in data_dict.items()
             if df is not None and len(df) > 0}
    if not valid:
        return None, []

    merged = None
    for name, df in valid.items():
        tmp = df[["date", "rain_mm"]].rename(columns={"rain_mm": name})
        merged = tmp if merged is None else merged.merge(tmp, on="date", how="outer")

    cols = list(valid.keys())
    merged[cols]   = merged[cols].clip(lower=0)
    merged["_mean"] = merged[cols].mean(axis=1)
    merged["_std"]  = merged[cols].std(axis=1)
    return merged.sort_values("date").reset_index(drop=True), cols


def plot_archive_with_spread(data_dict, location_name, start, end, output_dir):
    """Three-panel archive plot:
      Top:    Individual provider lines + multi-source mean±std shading
      Middle: 30-day rolling mean
      Bottom: Inter-source std (agreement signal — used to flag uncertain periods)
    """
    spread, _ = _build_spread_frame(data_dict)
    if spread is None:
        return

    valid_providers = {name: df for name, df in data_dict.items()
                       if df is not None and len(df) > 0}

    fig, axes = plt.subplots(3, 1, figsize=(14, 12), sharex=True)

    # --- Top: daily raw + spread band ---
    ax1 = axes[0]
    ax1.fill_between(
        spread["date"],
        (spread["_mean"] - spread["_std"]).clip(lower=0),
        spread["_mean"] + spread["_std"],
        alpha=0.15, color="gray", label="Multi-source ±1 std"
    )
    ax1.plot(spread["date"], spread["_mean"], color="black", linewidth=1.0,
             linestyle="--", alpha=0.6, label="Multi-source mean", zorder=5)
    for name, df in valid_providers.items():
        ax1.plot(df["date"], df["rain_mm"],
                 label=name, color=ARCHIVE_COLORS.get(name, "gray"),
                 linewidth=0.7, alpha=0.85)
    ax1.set_ylabel("Precipitation (mm/day)")
    ax1.set_title(
        f"{location_name} — Daily Precipitation with Multi-Source Spread\n"
        f"{start} → {end}  |  Grey band = ±1 std across all providers"
    )
    ax1.legend(fontsize=8, ncol=2)
    ax1.grid(True, alpha=0.3)

    # --- Middle: 30-day rolling mean ---
    ax2 = axes[1]
    roll_mean = spread.set_index("date")["_mean"].rolling(30, min_periods=15).mean()
    roll_std  = spread.set_index("date")["_std"].rolling(30, min_periods=15).mean()
    ax2.fill_between(
        roll_mean.index,
        (roll_mean - roll_std).clip(lower=0),
        roll_mean + roll_std,
        alpha=0.15, color="gray"
    )
    for name, df in valid_providers.items():
        rolled = df.set_index("date")["rain_mm"].rolling(30, min_periods=15).mean()
        ax2.plot(rolled.index, rolled.values,
                 label=name, color=ARCHIVE_COLORS.get(name, "gray"), linewidth=1.5)
    ax2.set_ylabel("30-day rolling mean (mm/day)")
    ax2.legend(fontsize=8)
    ax2.grid(True, alpha=0.3)

    # --- Bottom: inter-source std (agreement signal) ---
    ax3 = axes[2]
    ax3.fill_between(
        spread["date"], 0, spread["_std"],
        color="purple", alpha=0.35,
        label="Std across providers (higher = less agreement)"
    )
    median_std = spread["_std"].median()
    ax3.axhline(median_std, color="purple", linestyle=":", linewidth=1,
                label=f"Median std = {median_std:.2f} mm/day")
    ax3.set_ylabel("Std dev across\nproviders (mm)")
    ax3.set_xlabel("Date")
    ax3.legend(fontsize=8)
    ax3.grid(True, alpha=0.3)
    ax3.xaxis.set_major_formatter(mdates.DateFormatter("%b %Y"))

    fig.autofmt_xdate()
    plt.tight_layout()
    path = output_dir / f"archive_{location_name}.png"
    plt.savefig(path, dpi=150, bbox_inches="tight")
    plt.close()
    print(f"  Saved: {path}")


def plot_cumulative(data_dict, location_name, output_dir):
    """Cumulative annual precipitation — most relevant for crop/irrigation context."""
    fig, ax = plt.subplots(figsize=(14, 5))

    for name, df in data_dict.items():
        if df is None or len(df) == 0:
            continue
        s = df.set_index("date")["rain_mm"].sort_index().cumsum()
        total = s.iloc[-1]
        ax.plot(s.index, s.values,
                label=f"{name}  (total: {total:.0f} mm)",
                color=ARCHIVE_COLORS.get(name, "gray"), linewidth=1.8)

    ax.set_ylabel("Cumulative precipitation (mm)")
    ax.set_xlabel("Date")
    ax.set_title(
        f"{location_name} — Cumulative Annual Precipitation by Provider\n"
        "Divergence = sources disagree on total seasonal rainfall"
    )
    ax.legend(fontsize=9)
    ax.grid(True, alpha=0.3)
    ax.xaxis.set_major_formatter(mdates.DateFormatter("%b %Y"))
    fig.autofmt_xdate()
    plt.tight_layout()

    path = output_dir / f"cumulative_{location_name}.png"
    plt.savefig(path, dpi=150)
    plt.close()
    print(f"  Saved: {path}")


def plot_vs_era5(data_dict, location_name, output_dir):
    """Each provider vs ERA5 reference: scatter + regression line.

    NOTE: ERA5 is used as reference here for visual comparison only.
    It is a reanalysis product, not a ground-truth measurement.
    Use the pairwise heatmap (plot_pairwise_heatmap) for an unbiased comparison.

    How to read:
      - Each panel shows one provider (y-axis) vs ERA5 (x-axis) for daily precip.
      - Points on the red diagonal = perfect agreement.
      - Points above = provider wetter than ERA5 on that day.
      - r = Pearson correlation (1 = perfect). MAE = mean absolute error in mm/day.
      - Bias = provider minus ERA5 on average (positive = provider wetter).
    """
    ref_name = "ERA5 (Open-Meteo)"
    ref_df = data_dict.get(ref_name)
    if ref_df is None:
        return

    others = [(n, df) for n, df in data_dict.items()
              if n != ref_name and df is not None and len(df) > 0]
    if not others:
        return

    n = len(others)
    fig, axes = plt.subplots(1, n, figsize=(5 * n, 5), squeeze=False)

    for i, (name, df) in enumerate(others):
        ax = axes[0][i]
        merged = ref_df.merge(df, on="date", suffixes=("_ref", "_cmp"))
        valid  = merged[["rain_mm_ref", "rain_mm_cmp"]].dropna()

        color = ARCHIVE_COLORS.get(name, "steelblue")
        ax.scatter(valid["rain_mm_ref"], valid["rain_mm_cmp"],
                   s=4, alpha=0.35, color=color)

        lim = max(valid.max().max(), 1) * 1.05
        ax.plot([0, lim], [0, lim], "r--", linewidth=1, label="Perfect agreement")

        if len(valid) > 5:
            coeffs = np.polyfit(valid["rain_mm_ref"], valid["rain_mm_cmp"], 1)
            x_fit  = np.linspace(0, lim, 100)
            ax.plot(x_fit, np.polyval(coeffs, x_fit), "k-", linewidth=1,
                    alpha=0.6, label=f"Regression (slope={coeffs[0]:.2f})")

        stats = compare_stats(merged, "rain_mm_ref", "rain_mm_cmp")
        ax.set_xlim(0, lim); ax.set_ylim(0, lim)
        ax.set_xlabel("ERA5 (Open-Meteo) mm/day", fontsize=9)
        ax.set_ylabel(f"{name} mm/day", fontsize=9)
        ax.set_title(
            f"{name}\nr={stats['r']}  MAE={stats['MAE']} mm  Bias={stats['Bias']:+.2f} mm",
            fontsize=9
        )
        ax.legend(fontsize=7)
        ax.grid(True, alpha=0.3)

    fig.suptitle(
        f"{location_name} — Daily Precip vs ERA5 (reference only — ERA5 is not ground truth)\n"
        "Red dashed = perfect agreement. See pairwise heatmap for unbiased comparison.",
        fontsize=10
    )
    plt.tight_layout()
    path = output_dir / f"vs_era5_{location_name}.png"
    plt.savefig(path, dpi=150)
    plt.close()
    print(f"  Saved: {path}")


def plot_pairwise_heatmap(data_dict, location_name, output_dir):
    """Two heatmaps side by side: pairwise Pearson r and pairwise RMSE.

    No provider is treated as reference — all pairs compared equally.
    High r + low RMSE = sources agree. Where they diverge, neither is ground truth.
    """
    names, r_matrix, rmse_matrix = compute_pairwise_stats(data_dict)
    if len(names) < 2:
        return

    n = len(names)
    short = [nm.replace(" (Open-Meteo)", "\n(OM)") for nm in names]

    fig, axes = plt.subplots(1, 2, figsize=(max(10, n * 2.5), max(5, n * 1.5 + 1)))

    # Pearson r
    ax1 = axes[0]
    im1 = ax1.imshow(r_matrix, vmin=0, vmax=1, cmap="YlGn", aspect="auto")
    plt.colorbar(im1, ax=ax1, label="Pearson r")
    ax1.set_xticks(range(n)); ax1.set_yticks(range(n))
    ax1.set_xticklabels(short, fontsize=8, rotation=45, ha="right")
    ax1.set_yticklabels(short, fontsize=8)
    for i in range(n):
        for j in range(n):
            val = r_matrix[i, j]
            if not np.isnan(val):
                text_color = "white" if val > 0.8 else "black"
                ax1.text(j, i, f"{val:.2f}", ha="center", va="center",
                         fontsize=9, color=text_color)
    ax1.set_title("Pairwise Pearson r\n(1.0 = perfect agreement, no reference)")

    # RMSE
    ax2 = axes[1]
    off_diag = rmse_matrix[rmse_matrix > 0]
    vmax_rmse = float(np.nanmax(off_diag)) if len(off_diag) > 0 else 10
    im2 = ax2.imshow(rmse_matrix, vmin=0, vmax=vmax_rmse, cmap="YlOrRd_r", aspect="auto")
    plt.colorbar(im2, ax=ax2, label="RMSE (mm/day)")
    ax2.set_xticks(range(n)); ax2.set_yticks(range(n))
    ax2.set_xticklabels(short, fontsize=8, rotation=45, ha="right")
    ax2.set_yticklabels(short, fontsize=8)
    for i in range(n):
        for j in range(n):
            val = rmse_matrix[i, j]
            if not np.isnan(val):
                ax2.text(j, i, f"{val:.1f}", ha="center", va="center", fontsize=9)
    ax2.set_title("Pairwise RMSE (mm/day)\n(0 = perfect agreement)")

    fig.suptitle(
        f"{location_name} — Archive Provider Agreement (unbiased — no single reference)\n"
        "Pairs with high r + low RMSE are consistent. Divergent pairs reveal dataset uncertainty.",
        fontsize=10
    )
    plt.tight_layout()
    path = output_dir / f"pairwise_{location_name}.png"
    plt.savefig(path, dpi=150, bbox_inches="tight")
    plt.close()
    print(f"  Saved: {path}")


def plot_wetdry_agreement(data_dict, location_name, output_dir, threshold=1.0):
    """Stacked bar chart: for each provider pair, % of days both-dry / both-wet / disagree.

    High 'disagree' % means one source says rain, the other says dry on the same day.
    This is the most practically relevant divergence for SmartCane crop monitoring.
    """
    results = wetdry_agreement(data_dict, threshold)
    if not results:
        return

    pairs    = [r["pair"]     for r in results]
    both_dry = [r["both_dry"] for r in results]
    both_wet = [r["both_wet"] for r in results]
    disagree = [r["disagree"] for r in results]

    fig, ax = plt.subplots(figsize=(max(8, len(pairs) * 2.8), 6))
    x = np.arange(len(pairs))

    ax.bar(x, both_dry, label=f"Both dry (<{threshold} mm)", color="#a8d5e2", edgecolor="white")
    ax.bar(x, both_wet, bottom=both_dry,
           label=f"Both wet (≥{threshold} mm)", color="#3a7ebf", edgecolor="white")
    bottom2 = [d + w for d, w in zip(both_dry, both_wet)]
    ax.bar(x, disagree, bottom=bottom2,
           label="Disagree (one wet, one dry)", color="#e07b54", edgecolor="white")

    # Add % labels inside bars
    for i, r in enumerate(results):
        if r["both_dry"] > 4:
            ax.text(i, r["both_dry"] / 2, f"{r['both_dry']:.0f}%",
                    ha="center", va="center", fontsize=7, color="black")
        if r["both_wet"] > 4:
            ax.text(i, r["both_dry"] + r["both_wet"] / 2, f"{r['both_wet']:.0f}%",
                    ha="center", va="center", fontsize=7, color="white")
        if r["disagree"] > 4:
            ax.text(i, bottom2[i] + r["disagree"] / 2, f"{r['disagree']:.0f}%",
                    ha="center", va="center", fontsize=7, color="white")

    ax.set_xticks(x)
    ax.set_xticklabels(pairs, fontsize=8)
    ax.set_ylim(0, 108)
    ax.set_ylabel("% of days in archive period")
    ax.set_title(
        f"{location_name} — Provider Agreement on Wet vs Dry Days\n"
        f"Threshold = {threshold} mm/day  |  Orange = providers disagree on whether it rained"
    )
    ax.legend(loc="upper right", fontsize=9)
    ax.grid(True, axis="y", alpha=0.3)
    plt.tight_layout()

    path = output_dir / f"wetdry_agreement_{location_name}.png"
    plt.savefig(path, dpi=150, bbox_inches="tight")
    plt.close()
    print(f"  Saved: {path}")


# ============================================================
# PLOTTING — FORECAST
# ============================================================

def plot_forecast_with_ensemble(forecast_data, ensemble_df, location_name, output_dir):
    """Two-panel forecast comparison:
      Top:    Deterministic bars (YR.no, Open-Meteo) + ECMWF ensemble percentile shading
      Bottom: Exceedance probabilities P(rain > 1 mm) and P(rain > 5 mm)

    Reading guide:
      - Dark blue shading = where 50% of ensemble members agree (25th–75th %ile)
      - Light blue shading = where 80% of ensemble members agree (10th–90th %ile)
      - Bars = deterministic (single-value) forecast from each provider
      - Bottom panel: at 50% on the dashed line = coin-flip uncertainty about rain event
    """
    has_ensemble = ensemble_df is not None and len(ensemble_df) > 0
    det_providers = [(name, df) for name, df in forecast_data.items()
                     if df is not None and len(df) > 0]

    if not has_ensemble and not det_providers:
        return

    fig, axes = plt.subplots(
        2, 1, figsize=(12, 8), sharex=True,
        gridspec_kw={"height_ratios": [3, 1.5]}
    )

    # ---- TOP: deterministic bars + ensemble shading ----
    ax1 = axes[0]

    if has_ensemble:
        ax1.fill_between(
            ensemble_df["date"], ensemble_df["p10"], ensemble_df["p90"],
            alpha=0.12, color="steelblue", label="Ensemble 10th–90th %ile"
        )
        ax1.fill_between(
            ensemble_df["date"], ensemble_df["p25"], ensemble_df["p75"],
            alpha=0.28, color="steelblue", label="Ensemble 25th–75th %ile"
        )
        ax1.plot(
            ensemble_df["date"], ensemble_df["p50"],
            color="steelblue", linewidth=1.8, linestyle="-",
            label=f"Ensemble median ({int(ensemble_df['n_members'].iloc[0])} members)",
            zorder=5
        )

    det_colors = {"Open-Meteo Forecast": "#1f77b4", "YR.no": "#e8882a"}
    bar_half   = datetime.timedelta(hours=10)  # offset so bars don't overlap
    n_det = len(det_providers)
    offsets = np.linspace(-bar_half.total_seconds() / 3600 * (n_det - 1) / 2,
                           bar_half.total_seconds() / 3600 * (n_det - 1) / 2,
                           n_det)
    bar_width_days = 0.3

    for k, (name, df) in enumerate(det_providers):
        offset_td = datetime.timedelta(hours=offsets[k])
        shifted_dates = df["date"] + offset_td
        ax1.bar(
            shifted_dates, df["rain_mm"],
            width=bar_width_days,
            label=name,
            color=det_colors.get(name, f"C{k}"),
            alpha=0.75,
            zorder=4,
        )

    ax1.set_ylabel("Precipitation (mm/day)")
    ax1.set_title(
        f"{location_name} — 7-Day Forecast\n"
        "Shading = Open-Meteo ECMWF ensemble spread  |  Bars = deterministic forecasts"
    )
    ax1.legend(fontsize=8, loc="upper right")
    ax1.grid(True, axis="y", alpha=0.3)
    ax1.set_ylim(bottom=0)

    # ---- BOTTOM: exceedance probabilities ----
    ax2 = axes[1]
    if has_ensemble:
        ax2.step(
            ensemble_df["date"], ensemble_df["prob_gt_1mm"],
            where="mid", color="steelblue", linewidth=2.0,
            label="P(rain > 1 mm)"
        )
        ax2.step(
            ensemble_df["date"], ensemble_df["prob_gt_5mm"],
            where="mid", color="darkblue", linewidth=2.0, linestyle="--",
            label="P(rain > 5 mm)"
        )
        ax2.fill_between(
            ensemble_df["date"], 0, ensemble_df["prob_gt_5mm"],
            step="mid", alpha=0.18, color="darkblue"
        )
        ax2.axhline(50, color="gray", linestyle=":", linewidth=0.9, alpha=0.8,
                    label="50% (coin-flip)")
        ax2.set_ylim(0, 108)
        ax2.set_ylabel("Exceedance\nprobability (%)")
        ax2.legend(fontsize=8, loc="upper right")
        ax2.grid(True, alpha=0.3)
    ax2.set_xlabel("Date")
    ax2.xaxis.set_major_formatter(mdates.DateFormatter("%d %b"))

    fig.autofmt_xdate()
    plt.tight_layout()

    path = output_dir / f"forecast_ensemble_{location_name}.png"
    plt.savefig(path, dpi=150, bbox_inches="tight")
    plt.close()
    print(f"  Saved: {path}")


def plot_forecast(data_dict, location_name, output_dir):
    """Bar chart comparing deterministic 7-day forecasts (original style, kept for reference)."""
    _, ax = plt.subplots(figsize=(12, 5))

    providers = [(name, df) for name, df in data_dict.items() if df is not None and len(df) > 0]
    n = len(providers)
    if n == 0:
        plt.close()
        return

    all_dates = sorted(set(
        d for _, df in providers
        for d in df["date"].dt.date.tolist()
    ))
    x = np.arange(len(all_dates))
    width = 0.8 / n

    cmap = matplotlib.colormaps["tab10"].resampled(n)
    for i, (name, df) in enumerate(providers):
        date_map = dict(zip(df["date"].dt.date, df["rain_mm"]))
        vals = [date_map.get(d, 0.0) for d in all_dates]
        ax.bar(x + i * width, vals, width, label=name, color=cmap(i), alpha=0.85)

    ax.set_xticks(x + width * (n - 1) / 2)
    ax.set_xticklabels([d.strftime("%d %b") for d in all_dates], rotation=45, ha="right")
    ax.set_ylabel("Precipitation (mm/day)")
    ax.set_title(f"{location_name} — 7-Day Deterministic Forecast Comparison")
    ax.legend(fontsize=9)
    ax.grid(True, axis="y", alpha=0.3)
    plt.tight_layout()

    path = output_dir / f"forecast_{location_name}.png"
    plt.savefig(path, dpi=150)
    plt.close()
    print(f"  Saved: {path}")


# ============================================================
# MAIN
# ============================================================

def run_location(loc_name, lat, lon):
    print(f"\n{'='*60}")
    print(f"  {loc_name}  ({lat}°, {lon}°)")
    print(f"{'='*60}")

    # ---- ARCHIVE ----
    print("\n[Archive]")
    archive_data = {}

    print("  Fetching Open-Meteo ERA5...")
    try:
        archive_data["ERA5 (Open-Meteo)"] = fetch_openmeteo_archive(
            lat, lon, ARCHIVE_START, ARCHIVE_END, model="era5"
        )
        print(f"    → {len(archive_data['ERA5 (Open-Meteo)'])} days")
    except Exception as e:
        print(f"    ✗ ERA5 failed: {e}")
        archive_data["ERA5 (Open-Meteo)"] = None

    time.sleep(0.5)
    print("  Fetching Open-Meteo ERA5-Land...")
    try:
        archive_data["ERA5-Land (Open-Meteo)"] = fetch_openmeteo_archive(
            lat, lon, ARCHIVE_START, ARCHIVE_END, model="era5_land"
        )
        print(f"    → {len(archive_data['ERA5-Land (Open-Meteo)'])} days")
    except Exception as e:
        print(f"    ✗ ERA5-Land failed: {e}")
        archive_data["ERA5-Land (Open-Meteo)"] = None

    time.sleep(0.5)
    # CERRA only covers Europe (roughly 20°W–45°E, 30°N–80°N)
    if -20 <= lon <= 45 and 30 <= lat <= 80:
        print("  Fetching Open-Meteo CERRA (EU only)...")
        try:
            archive_data["CERRA (Open-Meteo)"] = fetch_openmeteo_archive(
                lat, lon, ARCHIVE_START, ARCHIVE_END, model="cerra"
            )
            print(f"    → {len(archive_data['CERRA (Open-Meteo)'])} days")
        except Exception as e:
            print(f"    ✗ CERRA failed: {e}")
            archive_data["CERRA (Open-Meteo)"] = None
    else:
        print("  Skipping CERRA (outside EU coverage)")
        archive_data["CERRA (Open-Meteo)"] = None

    time.sleep(0.5)
    print("  Fetching NASA POWER...")
    try:
        archive_data["NASA POWER"] = fetch_nasa_power(lat, lon, ARCHIVE_START, ARCHIVE_END)
        print(f"    → {len(archive_data['NASA POWER'])} days")
    except Exception as e:
        print(f"    ✗ NASA POWER failed: {e}")
        archive_data["NASA POWER"] = None

    # Pairwise stats (no ERA5 reference bias)
    print("\n  Pairwise archive stats (Pearson r | RMSE mm/day | Bias mm/day):")
    names, r_mat, rmse_mat = compute_pairwise_stats(archive_data)
    for i in range(len(names)):
        for j in range(i + 1, len(names)):
            if not np.isnan(r_mat[i, j]):
                print(f"    {names[i]:30s} vs {names[j]:30s}  "
                      f"r={r_mat[i,j]:.3f}  RMSE={rmse_mat[i,j]:.2f} mm")

    plot_archive_with_spread(archive_data, loc_name, ARCHIVE_START, ARCHIVE_END, OUTPUT_DIR)
    plot_cumulative(archive_data, loc_name, OUTPUT_DIR)
    plot_vs_era5(archive_data, loc_name, OUTPUT_DIR)
    plot_pairwise_heatmap(archive_data, loc_name, OUTPUT_DIR)
    plot_wetdry_agreement(archive_data, loc_name, OUTPUT_DIR)

    # ---- FORECAST ----
    print("\n[Forecast]")
    forecast_data = {}

    print("  Fetching Open-Meteo deterministic forecast...")
    try:
        forecast_data["Open-Meteo Forecast"] = fetch_openmeteo_forecast(lat, lon)
        print(f"    → {len(forecast_data['Open-Meteo Forecast'])} days")
    except Exception as e:
        print(f"    ✗ Open-Meteo forecast failed: {e}")
        forecast_data["Open-Meteo Forecast"] = None

    time.sleep(0.5)
    print("  Fetching Open-Meteo ECMWF ensemble forecast...")
    ensemble_df = None
    try:
        ensemble_df = fetch_openmeteo_ensemble_forecast(lat, lon)
        if ensemble_df is not None:
            print(f"    → {len(ensemble_df)} days  "
                  f"({int(ensemble_df['n_members'].iloc[0])} ensemble members)")
    except Exception as e:
        print(f"    ✗ Open-Meteo ensemble failed: {e}")

    time.sleep(0.5)
    print("  Fetching YR.no LocationForecast...")
    try:
        forecast_data["YR.no"] = fetch_yr_forecast(lat, lon)
        print(f"    → {len(forecast_data['YR.no'])} days")
    except Exception as e:
        print(f"    ✗ YR.no failed: {e}")
        forecast_data["YR.no"] = None

    if OPENWEATHERMAP_KEY:
        time.sleep(0.5)
        print("  Fetching OpenWeatherMap forecast...")
        try:
            forecast_data["OpenWeatherMap"] = fetch_openweathermap_forecast(
                lat, lon, OPENWEATHERMAP_KEY
            )
            print(f"    → {len(forecast_data['OpenWeatherMap'])} days")
        except Exception as e:
            print(f"    ✗ OpenWeatherMap failed: {e}")
            forecast_data["OpenWeatherMap"] = None

    if WEATHERAPI_KEY:
        time.sleep(0.5)
        print("  Fetching WeatherAPI.com forecast...")
        try:
            forecast_data["WeatherAPI.com"] = fetch_weatherapi_forecast(
                lat, lon, WEATHERAPI_KEY
            )
            print(f"    → {len(forecast_data['WeatherAPI.com'])} days")
        except Exception as e:
            print(f"    ✗ WeatherAPI.com failed: {e}")
            forecast_data["WeatherAPI.com"] = None

    plot_forecast_with_ensemble(forecast_data, ensemble_df, loc_name, OUTPUT_DIR)
    plot_forecast(forecast_data, loc_name, OUTPUT_DIR)


if __name__ == "__main__":
    print(f"Weather API Comparison — {datetime.date.today()}")
    print(f"Archive: {ARCHIVE_START} → {ARCHIVE_END}")
    print(f"Forecast: {FORECAST_START} → {FORECAST_END}")
    print(f"Output: {OUTPUT_DIR.resolve()}")

    for loc_name, coords in LOCATIONS.items():
        run_location(loc_name, coords["lat"], coords["lon"])
        time.sleep(1)

    print(f"\nDone. Plots saved to: {OUTPUT_DIR.resolve()}")