SmartCane/python_app/weather_api_comparison.py
Timon 1da5c0d0a7 Add weather API comparison scripts for precipitation analysis
- Implemented `weather_api_comparison.py` to compare daily precipitation from multiple weather APIs for Arnhem, Netherlands and Angata, Kenya.
- Integrated fetching functions for various weather data sources including Open-Meteo, NASA POWER, OpenWeatherMap, and WeatherAPI.com.
- Added plotting functions to visualize archive and forecast data, including cumulative precipitation and comparison against ERA5 reference.
- Created `90_rainfall_utils.R` for R to fetch rainfall data and overlay it on CI plots, supporting multiple providers with a generic fetch wrapper.
- Included spatial helpers for efficient API calls based on unique geographical tiles.
2026-03-12 17:30:01 +01:00

568 lines
20 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""
weather_api_comparison.py
=========================
Compare daily precipitation from multiple free weather APIs across two locations:
- Arnhem, Netherlands (51.985°N, 5.899°E) — European climate
- Angata, Kenya ( 1.330°S, 34.738°E) — tropical / sugarcane context
ARCHIVE providers (no API key required):
1. Open-Meteo ERA5 — current SmartCane provider (0.25°, global)
2. Open-Meteo ERA5-Land — higher resolution variant (0.10°, global)
3. Open-Meteo CERRA — EU regional reanalysis (0.05°, EU only)
4. NASA POWER — completely independent source (0.50°, global)
FORECAST providers (no API key required):
5. Open-Meteo Forecast — ensemble of NWP models (global)
6. YR.no LocationForecast — Norwegian Met Institute (~10 days, global)
FORECAST providers (API key required — set in CONFIG below, leave "" to skip):
7. OpenWeatherMap — free tier, 1000 calls/day
8. WeatherAPI.com — free tier
OUTPUT:
Plots saved to: weather_comparison_plots/
Summary stats printed to console.
Usage:
python weather_api_comparison.py
"""
import os
import json
import time
import datetime
import requests
import pandas as pd
import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import numpy as np
from pathlib import Path
# ============================================================
# CONFIG
# ============================================================
# Study sites: one temperate European location, one tropical (sugarcane) one.
LOCATIONS = {
    "Arnhem_NL": {"lat": 51.985, "lon": 5.899},
    "Angata_KE": {"lat": -1.330, "lon": 34.738},
}
# Archive: last 12 months
ARCHIVE_END = datetime.date.today() - datetime.timedelta(days=2)  # ERA5 lags ~2 days
ARCHIVE_START = ARCHIVE_END - datetime.timedelta(days=365)
# Forecast: today + 7 days
FORECAST_START = datetime.date.today()
FORECAST_END = FORECAST_START + datetime.timedelta(days=7)
# Optional API keys — leave "" to skip that provider
OPENWEATHERMAP_KEY = ""  # https://openweathermap.org/api
WEATHERAPI_KEY = ""  # https://www.weatherapi.com/
# All plots are written here; NOTE: the directory is created at import time.
OUTPUT_DIR = Path("weather_comparison_plots")
OUTPUT_DIR.mkdir(exist_ok=True)
# Identifying User-Agent, sent to YR.no (met.no requires one).
USER_AGENT = "SmartCane-WeatherComparison/1.0 (research; contact via github)"
# ============================================================
# ARCHIVE FETCHERS
# ============================================================
def fetch_openmeteo_archive(lat, lon, start, end, model="era5"):
    """Fetch daily precipitation from the Open-Meteo archive API.

    ERA5 is the service default, so no ``models`` query parameter is sent
    for it (sending one with the wrong casing produces a 400). ERA5-Land
    and CERRA are requested via their lowercase model names.

    Returns a DataFrame with columns 'date' (datetime64) and 'rain_mm'
    (float, clipped to >= 0, NaN filled with 0).
    """
    # ERA5 is the default model — adding models param with wrong casing causes 400
    extra = "" if model == "era5" else f"&models={model}"
    endpoint = (
        f"https://archive-api.open-meteo.com/v1/archive"
        f"?latitude={lat}&longitude={lon}"
        f"&daily=precipitation_sum"
        f"&start_date={start}&end_date={end}"
        f"{extra}"
        f"&timezone=UTC"
    )
    resp = requests.get(endpoint, timeout=30)
    resp.raise_for_status()
    daily = resp.json()["daily"]
    frame = pd.DataFrame({
        "date": pd.to_datetime(daily["time"]),
        "rain_mm": daily["precipitation_sum"],
    })
    frame["rain_mm"] = (
        pd.to_numeric(frame["rain_mm"], errors="coerce").clip(lower=0).fillna(0)
    )
    # Open-Meteo quirk: ERA5-Land sometimes reports precipitation in meters.
    # Heuristic: a full year totalling < 50 mm is implausible for these sites,
    # so treat the values as meters and convert.
    if frame["rain_mm"].sum() < 50 and len(frame) > 30:
        frame["rain_mm"] = frame["rain_mm"] * 1000
        print(f" ⚠ Unit auto-converted m→mm (values were implausibly small)")
    return frame
def fetch_nasa_power(lat, lon, start, end):
    """NASA POWER — daily PRECTOTCORR (precipitation corrected), 0.5° grid.

    Parameters are a point location plus a date range (datetime.date).
    Days carrying the NASA fill value (-999, meaning missing data) are
    dropped; remaining values are clipped to >= 0.

    Returns a DataFrame with 'date'/'rain_mm' columns sorted by date.
    May be empty (but still has both columns) if every day was missing.
    """
    url = (
        "https://power.larc.nasa.gov/api/temporal/daily/point"
        f"?parameters=PRECTOTCORR"
        f"&community=AG"
        f"&longitude={lon}&latitude={lat}"
        f"&start={start.strftime('%Y%m%d')}&end={end.strftime('%Y%m%d')}"
        f"&format=JSON"
    )
    r = requests.get(url, timeout=60)
    r.raise_for_status()
    body = r.json()
    raw = body["properties"]["parameter"]["PRECTOTCORR"]
    records = [
        {"date": pd.to_datetime(k, format="%Y%m%d"), "rain_mm": max(v, 0)}
        for k, v in raw.items()
        if v != -999  # NASA POWER fill value
    ]
    # Fix: an all-fill response used to yield a column-less DataFrame, making
    # sort_values("date") raise KeyError. Return a typed empty frame instead.
    if not records:
        return pd.DataFrame(columns=["date", "rain_mm"])
    df = pd.DataFrame(records)
    return df.sort_values("date").reset_index(drop=True)
# ============================================================
# FORECAST FETCHERS
# ============================================================
def fetch_openmeteo_forecast(lat, lon, days=8):
    """Open-Meteo NWP forecast — default best_match model.

    Requests ``days + 1`` forecast days (today plus ``days`` ahead).
    Non-numeric / missing precipitation values are coerced to 0.

    Returns a DataFrame with 'date' (datetime64) and 'rain_mm' (float).
    """
    # Fix: removed an unused local (`end = today + timedelta(days)`) that was
    # computed but never referenced — the API's forecast_days param drives the range.
    url = (
        f"https://api.open-meteo.com/v1/forecast"
        f"?latitude={lat}&longitude={lon}"
        f"&daily=precipitation_sum"
        f"&forecast_days={days + 1}"
        f"&timezone=UTC"
    )
    r = requests.get(url, timeout=30)
    r.raise_for_status()
    body = r.json()
    df = pd.DataFrame({
        "date": pd.to_datetime(body["daily"]["time"]),
        "rain_mm": body["daily"]["precipitation_sum"],
    })
    df["rain_mm"] = pd.to_numeric(df["rain_mm"], errors="coerce").fillna(0)
    return df
def fetch_yr_forecast(lat, lon):
    """YR.no LocationForecast 2.0 — precipitation aggregated to daily totals.

    The compact timeseries is hourly for the near term, then 6-hourly.
    Each entry's "next_N_hours" block covers the interval up to the next
    entry, so window amounts are summed as-is.

    Fix: the previous code divided 6-hour amounts by 6 before summing, but
    there is only ONE entry per 6-hour window — that undercounted far-range
    days by a factor of 6. (Semantics per met.no LocationForecast docs.)

    Returns a DataFrame with 'date' (datetime64) and 'rain_mm' columns.
    """
    url = f"https://api.met.no/weatherapi/locationforecast/2.0/compact?lat={lat}&lon={lon}"
    headers = {"User-Agent": USER_AGENT}  # met.no rejects requests without an identifying UA
    r = requests.get(url, headers=headers, timeout=30)
    r.raise_for_status()
    body = r.json()
    records = []
    for entry in body["properties"]["timeseries"]:
        ts = pd.to_datetime(entry["time"])
        data = entry["data"]
        precip = 0.0
        if "next_1_hours" in data:
            precip = data["next_1_hours"]["details"].get("precipitation_amount", 0.0)
        elif "next_6_hours" in data:
            # 6-hourly part of the series: the amount covers the whole window
            # until the next entry, so add it in full (no /6 scaling).
            precip = data["next_6_hours"]["details"].get("precipitation_amount", 0.0)
        records.append({"datetime": ts, "precip_window": precip})
    window = pd.DataFrame(records)
    window["date"] = window["datetime"].dt.date
    daily = (
        window.groupby("date")["precip_window"]
        .sum()
        .reset_index()
        .rename(columns={"precip_window": "rain_mm"})
    )
    daily["date"] = pd.to_datetime(daily["date"])
    return daily
def fetch_openweathermap_forecast(lat, lon, api_key):
    """OpenWeatherMap One Call 3.0 — daily forecast (needs paid/free key).

    Days without a "rain" field are recorded as 0.0 mm. Returns a DataFrame
    with 'date' (midnight-normalized datetime64) and 'rain_mm' columns.
    """
    url = (
        f"https://api.openweathermap.org/data/3.0/onecall"
        f"?lat={lat}&lon={lon}"
        f"&exclude=current,minutely,hourly,alerts"
        f"&appid={api_key}&units=metric"
    )
    resp = requests.get(url, timeout=30)
    resp.raise_for_status()
    payload = resp.json()
    rows = [
        {
            # "dt" is a unix timestamp; normalize() drops the time-of-day part.
            "date": pd.to_datetime(entry["dt"], unit="s").normalize(),
            "rain_mm": entry.get("rain", 0.0),
        }
        for entry in payload.get("daily", [])
    ]
    return pd.DataFrame(rows)
def fetch_weatherapi_forecast(lat, lon, api_key, days=7):
    """WeatherAPI.com free forecast (up to 3 days on free tier, 14 on paid).

    Returns a DataFrame with 'date' (datetime64) and 'rain_mm' (the
    provider's daily totalprecip_mm, defaulting to 0.0 when absent).
    """
    url = (
        f"https://api.weatherapi.com/v1/forecast.json"
        f"?key={api_key}&q={lat},{lon}&days={days}&aqi=no&alerts=no"
    )
    resp = requests.get(url, timeout=30)
    resp.raise_for_status()
    forecast_days = resp.json()["forecast"]["forecastday"]
    rows = [
        {
            "date": pd.to_datetime(entry["date"]),
            "rain_mm": entry["day"].get("totalprecip_mm", 0.0),
        }
        for entry in forecast_days
    ]
    return pd.DataFrame(rows)
# ============================================================
# STATS
# ============================================================
def compare_stats(df, ref_col, other_col):
    """Return n, MAE, RMSE, bias and Pearson r of other_col against ref_col.

    Rows where either column is NaN are dropped first. With fewer than five
    valid pairs the metrics are not meaningful, so they come back as None
    (only "n" is reported). Errors are computed as other minus ref, so a
    positive Bias means other_col runs higher than the reference.
    """
    pair = df[[ref_col, other_col]].dropna()
    count = len(pair)
    if count < 5:
        return {"n": count, "MAE": None, "RMSE": None, "Bias": None, "r": None}
    err = pair[other_col] - pair[ref_col]
    return {
        "n": count,
        "MAE": round(err.abs().mean(), 2),
        "RMSE": round((err.pow(2).mean()) ** 0.5, 2),
        "Bias": round(err.mean(), 2),
        "r": round(pair[ref_col].corr(pair[other_col]), 3),
    }
# ============================================================
# PLOTTING
# ============================================================
def plot_archive(data_dict, location_name, start, end, output_dir):
    """Line plot of all archive providers for one location.

    Top panel: raw daily precipitation. Bottom panel: 30-day rolling mean,
    which makes systematic provider offsets easier to see. Providers mapped
    to None (failed fetches) or empty frames are skipped.

    Saves archive_<location>.png into output_dir and returns None.
    """
    fig, axes = plt.subplots(2, 1, figsize=(14, 8), sharex=True)
    colors = {
        "ERA5 (Open-Meteo)": "#1f77b4",
        "ERA5-Land (Open-Meteo)": "#ff7f0e",
        "CERRA (Open-Meteo)": "#2ca02c",
        "NASA POWER": "#d62728",
    }
    # Top: daily raw
    ax1 = axes[0]
    for name, df in data_dict.items():
        if df is not None and len(df) > 0:
            ax1.plot(df["date"], df["rain_mm"], label=name,
                     color=colors.get(name), linewidth=0.8, alpha=0.85)
    ax1.set_ylabel("Precipitation (mm/day)")
    # Fix: the title ran the two dates together ("{start}{end}") — the
    # separator character was lost; restore a readable one.
    ax1.set_title(f"{location_name} — Daily Precipitation (archive {start} to {end})")
    ax1.legend(fontsize=8)
    ax1.grid(True, alpha=0.3)
    # Bottom: 30-day rolling mean (min_periods=15 so the first month isn't blank)
    ax2 = axes[1]
    for name, df in data_dict.items():
        if df is not None and len(df) > 0:
            rolled = df.set_index("date")["rain_mm"].rolling(30, min_periods=15).mean()
            ax2.plot(rolled.index, rolled.values, label=name,
                     color=colors.get(name), linewidth=1.5)
    ax2.set_ylabel("30-day rolling mean (mm/day)")
    ax2.set_xlabel("Date")
    ax2.legend(fontsize=8)
    ax2.grid(True, alpha=0.3)
    ax2.xaxis.set_major_formatter(mdates.DateFormatter("%b %Y"))
    fig.autofmt_xdate()
    plt.tight_layout()
    path = output_dir / f"archive_{location_name}.png"
    plt.savefig(path, dpi=150)
    plt.close()
    print(f" Saved: {path}")
def plot_forecast(data_dict, location_name, output_dir):
    """Bar chart comparing 7-day forecasts across providers.

    data_dict maps provider name -> DataFrame with 'date'/'rain_mm' columns;
    None or empty entries are skipped. Saves forecast_<location>.png and
    returns None. When no provider has data, nothing is written.
    """
    providers = [(name, df) for name, df in data_dict.items()
                 if df is not None and len(df) > 0]
    n = len(providers)
    if n == 0:
        # Fix: bail out BEFORE creating a figure — the old code allocated a
        # figure and immediately closed it unused on this path.
        return
    fig, ax = plt.subplots(figsize=(12, 5))
    # Union of all forecast dates across providers, chronological.
    all_dates = sorted(set(
        d for _, df in providers
        for d in df["date"].dt.date.tolist()
    ))
    x = np.arange(len(all_dates))
    width = 0.8 / n  # one bar group per date, split among providers
    cmap = matplotlib.colormaps["tab10"].resampled(n)
    for i, (name, df) in enumerate(providers):
        date_map = dict(zip(df["date"].dt.date, df["rain_mm"]))
        # Days a provider doesn't cover are drawn as 0 mm.
        vals = [date_map.get(d, 0.0) for d in all_dates]
        ax.bar(x + i * width, vals, width, label=name, color=cmap(i), alpha=0.85)
    ax.set_xticks(x + width * (n - 1) / 2)  # center ticks under each bar group
    ax.set_xticklabels([d.strftime("%d %b") for d in all_dates], rotation=45, ha="right")
    ax.set_ylabel("Precipitation (mm/day)")
    ax.set_title(f"{location_name} — 7-Day Forecast Comparison")
    ax.legend(fontsize=9)
    ax.grid(True, axis="y", alpha=0.3)
    plt.tight_layout()
    path = output_dir / f"forecast_{location_name}.png"
    plt.savefig(path, dpi=150)
    plt.close()
    print(f" Saved: {path}")
def plot_vs_era5(data_dict, location_name, output_dir):
    """Each provider vs ERA5 reference: scatter + regression line.

    How to read:
    - Each panel shows one provider (y-axis) vs ERA5 (x-axis) for daily precip.
    - Points on the red diagonal = perfect agreement.
    - Points above = provider wetter than ERA5 on that day.
    - r = Pearson correlation (1 = perfect). MAE = mean absolute error in mm/day.
    - Bias = provider minus ERA5 on average (positive = provider wetter).

    Saves vs_era5_<location>.png; does nothing when the ERA5 reference or
    all comparison providers are missing/empty.
    """
    ref_name = "ERA5 (Open-Meteo)"
    ref_df = data_dict.get(ref_name)
    if ref_df is None or len(ref_df) == 0:
        return
    others = [(n, df) for n, df in data_dict.items()
              if n != ref_name and df is not None and len(df) > 0]
    if not others:
        return
    n = len(others)
    fig, axes = plt.subplots(1, n, figsize=(5 * n, 5), squeeze=False)
    colors = {
        "ERA5-Land (Open-Meteo)": "#ff7f0e",
        "CERRA (Open-Meteo)": "#2ca02c",
        "NASA POWER": "#d62728",
    }
    for i, (name, df) in enumerate(others):
        ax = axes[0][i]
        merged = ref_df.merge(df, on="date", suffixes=("_ref", "_cmp"))
        valid = merged[["rain_mm_ref", "rain_mm_cmp"]].dropna()
        color = colors.get(name, "steelblue")
        ax.scatter(valid["rain_mm_ref"], valid["rain_mm_cmp"],
                   s=4, alpha=0.35, color=color)
        # Fix: valid.max() on an empty overlap is NaN, which broke the axis
        # limits; fall back to a unit-scale panel instead.
        lim = max(valid.max().max(), 1) * 1.05 if len(valid) else 1.05
        # Perfect-agreement diagonal
        ax.plot([0, lim], [0, lim], "r--", linewidth=1, label="Perfect agreement")
        # Linear regression line (only with enough points to be meaningful)
        if len(valid) > 5:
            coeffs = np.polyfit(valid["rain_mm_ref"], valid["rain_mm_cmp"], 1)
            x_fit = np.linspace(0, lim, 100)
            ax.plot(x_fit, np.polyval(coeffs, x_fit), "k-", linewidth=1,
                    alpha=0.6, label=f"Regression (slope={coeffs[0]:.2f})")
        stats = compare_stats(merged, "rain_mm_ref", "rain_mm_cmp")
        # Fix: compare_stats returns None metrics when n < 5; the old title
        # format "{Bias:+.2f}" raised TypeError on None.
        if stats["Bias"] is None:
            metrics = f"n={stats['n']} (too few overlapping days for stats)"
        else:
            metrics = f"r={stats['r']} MAE={stats['MAE']} mm Bias={stats['Bias']:+.2f} mm"
        ax.set_xlim(0, lim); ax.set_ylim(0, lim)
        ax.set_xlabel("ERA5 (Open-Meteo) mm/day", fontsize=9)
        ax.set_ylabel(f"{name} mm/day", fontsize=9)
        ax.set_title(f"{name}\n{metrics}", fontsize=9)
        ax.legend(fontsize=7)
        ax.grid(True, alpha=0.3)
    fig.suptitle(
        f"{location_name} — Daily Precip vs ERA5 Reference\n"
        "Red dashed = perfect agreement. Points above line = provider wetter than ERA5.",
        fontsize=10
    )
    plt.tight_layout()
    path = output_dir / f"vs_era5_{location_name}.png"
    plt.savefig(path, dpi=150)
    plt.close()
    print(f" Saved: {path}")
def plot_cumulative(data_dict, location_name, output_dir):
    """Cumulative annual precipitation — most relevant for crop/irrigation context.

    Draws one running-total line per provider; the legend carries each
    provider's annual total, so disagreement on seasonal rainfall is visible
    at a glance. None/empty providers are skipped. Saves
    cumulative_<location>.png into output_dir.
    """
    fig, ax = plt.subplots(figsize=(14, 5))
    palette = {
        "ERA5 (Open-Meteo)": "#1f77b4",
        "ERA5-Land (Open-Meteo)": "#ff7f0e",
        "CERRA (Open-Meteo)": "#2ca02c",
        "NASA POWER": "#d62728",
    }
    for provider, frame in data_dict.items():
        if frame is None or len(frame) == 0:
            continue
        running = frame.set_index("date")["rain_mm"].sort_index().cumsum()
        annual_total = running.iloc[-1]
        ax.plot(
            running.index,
            running.values,
            label=f"{provider} (total: {annual_total:.0f} mm)",
            color=palette.get(provider),
            linewidth=1.8,
        )
    ax.set_ylabel("Cumulative precipitation (mm)")
    ax.set_xlabel("Date")
    ax.set_title(
        f"{location_name} — Cumulative Annual Precipitation by Provider\n"
        "Divergence = sources disagree on total seasonal rainfall"
    )
    ax.legend(fontsize=9)
    ax.grid(True, alpha=0.3)
    ax.xaxis.set_major_formatter(mdates.DateFormatter("%b %Y"))
    fig.autofmt_xdate()
    plt.tight_layout()
    out_path = output_dir / f"cumulative_{location_name}.png"
    plt.savefig(out_path, dpi=150)
    plt.close()
    print(f" Saved: {out_path}")
# ============================================================
# MAIN
# ============================================================
def run_location(loc_name, lat, lon):
    """Fetch archive + forecast data for one location, print stats, save plots.

    Side effects: network calls to all configured providers, console logging,
    and PNG files written to OUTPUT_DIR. Each provider fetch is individually
    wrapped in try/except — on failure the provider's slot is set to None so
    the stats and plotting steps simply skip it.
    """
    print(f"\n{'='*60}")
    print(f" {loc_name} ({lat}°, {lon}°)")
    print(f"{'='*60}")
    # ---- ARCHIVE ----
    print("\n[Archive]")
    archive_data = {}
    print(" Fetching Open-Meteo ERA5...")
    try:
        archive_data["ERA5 (Open-Meteo)"] = fetch_openmeteo_archive(
            lat, lon, ARCHIVE_START, ARCHIVE_END, model="era5"
        )
        print(f"{len(archive_data['ERA5 (Open-Meteo)'])} days")
    except Exception as e:
        print(f" ✗ ERA5 failed: {e}")
        archive_data["ERA5 (Open-Meteo)"] = None
    time.sleep(0.5)  # small pause between calls to stay polite to free APIs
    print(" Fetching Open-Meteo ERA5-Land...")
    try:
        archive_data["ERA5-Land (Open-Meteo)"] = fetch_openmeteo_archive(
            lat, lon, ARCHIVE_START, ARCHIVE_END, model="era5_land"
        )
        print(f"{len(archive_data['ERA5-Land (Open-Meteo)'])} days")
    except Exception as e:
        print(f" ✗ ERA5-Land failed: {e}")
        archive_data["ERA5-Land (Open-Meteo)"] = None
    time.sleep(0.5)
    # CERRA only covers Europe (roughly 20°W–45°E, 30°N–80°N)
    if -20 <= lon <= 45 and 30 <= lat <= 80:
        print(" Fetching Open-Meteo CERRA (EU only)...")
        try:
            archive_data["CERRA (Open-Meteo)"] = fetch_openmeteo_archive(
                lat, lon, ARCHIVE_START, ARCHIVE_END, model="cerra"
            )
            print(f"{len(archive_data['CERRA (Open-Meteo)'])} days")
        except Exception as e:
            print(f" ✗ CERRA failed: {e}")
            archive_data["CERRA (Open-Meteo)"] = None
    else:
        print(" Skipping CERRA (outside EU coverage)")
        archive_data["CERRA (Open-Meteo)"] = None
    time.sleep(0.5)
    print(" Fetching NASA POWER...")
    try:
        archive_data["NASA POWER"] = fetch_nasa_power(lat, lon, ARCHIVE_START, ARCHIVE_END)
        print(f"{len(archive_data['NASA POWER'])} days")
    except Exception as e:
        print(f" ✗ NASA POWER failed: {e}")
        archive_data["NASA POWER"] = None
    # Stats vs ERA5 reference (skipped entirely when ERA5 itself failed)
    print("\n Stats vs ERA5 (Open-Meteo) reference:")
    ref_df = archive_data.get("ERA5 (Open-Meteo)")
    for name, df in archive_data.items():
        if name == "ERA5 (Open-Meteo)" or df is None:
            continue
        if ref_df is None:
            continue
        merged = ref_df.merge(df, on="date", suffixes=("_ref", "_cmp"))
        stats = compare_stats(merged, "rain_mm_ref", "rain_mm_cmp")
        print(f" {name:30s} n={stats['n']:4d} MAE={stats['MAE']} "
              f"RMSE={stats['RMSE']} Bias={stats['Bias']} r={stats['r']}")
    plot_archive(archive_data, loc_name, ARCHIVE_START, ARCHIVE_END, OUTPUT_DIR)
    plot_cumulative(archive_data, loc_name, OUTPUT_DIR)
    plot_vs_era5(archive_data, loc_name, OUTPUT_DIR)
    # ---- FORECAST ----
    print("\n[Forecast]")
    forecast_data = {}
    print(" Fetching Open-Meteo forecast...")
    try:
        forecast_data["Open-Meteo Forecast"] = fetch_openmeteo_forecast(lat, lon)
        print(f"{len(forecast_data['Open-Meteo Forecast'])} days")
    except Exception as e:
        print(f" ✗ Open-Meteo forecast failed: {e}")
        forecast_data["Open-Meteo Forecast"] = None
    time.sleep(0.5)
    print(" Fetching YR.no LocationForecast...")
    try:
        forecast_data["YR.no"] = fetch_yr_forecast(lat, lon)
        print(f"{len(forecast_data['YR.no'])} days")
    except Exception as e:
        print(f" ✗ YR.no failed: {e}")
        forecast_data["YR.no"] = None
    # Keyed providers are attempted only when a key is configured in CONFIG.
    if OPENWEATHERMAP_KEY:
        time.sleep(0.5)
        print(" Fetching OpenWeatherMap forecast...")
        try:
            forecast_data["OpenWeatherMap"] = fetch_openweathermap_forecast(
                lat, lon, OPENWEATHERMAP_KEY
            )
            print(f"{len(forecast_data['OpenWeatherMap'])} days")
        except Exception as e:
            print(f" ✗ OpenWeatherMap failed: {e}")
            forecast_data["OpenWeatherMap"] = None
    if WEATHERAPI_KEY:
        time.sleep(0.5)
        print(" Fetching WeatherAPI.com forecast...")
        try:
            forecast_data["WeatherAPI.com"] = fetch_weatherapi_forecast(
                lat, lon, WEATHERAPI_KEY
            )
            print(f"{len(forecast_data['WeatherAPI.com'])} days")
        except Exception as e:
            print(f" ✗ WeatherAPI.com failed: {e}")
            forecast_data["WeatherAPI.com"] = None
    plot_forecast(forecast_data, loc_name, OUTPUT_DIR)
if __name__ == "__main__":
    # Entry point: fetch, compare and plot for every configured location.
    print(f"Weather API Comparison — {datetime.date.today()}")
    # Fix: the date-range prints ran the two dates together
    # ("{ARCHIVE_START}{ARCHIVE_END}") — the separator was lost; restore one.
    print(f"Archive: {ARCHIVE_START} to {ARCHIVE_END}")
    print(f"Forecast: {FORECAST_START} to {FORECAST_END}")
    print(f"Output: {OUTPUT_DIR.resolve()}")
    for loc_name, coords in LOCATIONS.items():
        run_location(loc_name, coords["lat"], coords["lon"])
        time.sleep(1)  # pause between locations to be polite to the free APIs
    print(f"\nDone. Plots saved to: {OUTPUT_DIR.resolve()}")