SmartCane/python_app/weather_api_comparison.py
Timon 1da5c0d0a7 Add weather API comparison scripts for precipitation analysis
- Implemented `weather_api_comparison.py` to compare daily precipitation from multiple weather APIs for Arnhem, Netherlands and Angata, Kenya.
- Integrated fetching functions for various weather data sources including Open-Meteo, NASA POWER, OpenWeatherMap, and WeatherAPI.com.
- Added plotting functions to visualize archive and forecast data, including cumulative precipitation and comparison against ERA5 reference.
- Created `90_rainfall_utils.R` for R to fetch rainfall data and overlay it on CI plots, supporting multiple providers with a generic fetch wrapper.
- Included spatial helpers for efficient API calls based on unique geographical tiles.
2026-03-12 17:30:01 +01:00

568 lines
20 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""
weather_api_comparison.py
=========================
Compare daily precipitation from multiple free weather APIs across two locations:
- Arnhem, Netherlands (51.985°N, 5.899°E) — European climate
- Angata, Kenya ( 1.330°S, 34.738°E) — tropical / sugarcane context
ARCHIVE providers (no API key required):
1. Open-Meteo ERA5 — current SmartCane provider (0.25°, global)
2. Open-Meteo ERA5-Land — higher resolution variant (0.10°, global)
3. Open-Meteo CERRA — EU regional reanalysis (0.05°, EU only)
4. NASA POWER — completely independent source (0.50°, global)
FORECAST providers (no API key required):
5. Open-Meteo Forecast — ensemble of NWP models (global)
6. YR.no LocationForecast — Norwegian Met Institute (~10 days, global)
FORECAST providers (API key required — set in CONFIG below, leave "" to skip):
7. OpenWeatherMap — free tier, 1000 calls/day
8. WeatherAPI.com — free tier
OUTPUT:
Plots saved to: weather_comparison_plots/
Summary stats printed to console.
Usage:
python weather_api_comparison.py
"""
import os
import json
import time
import datetime
import requests
import pandas as pd
import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import numpy as np
from pathlib import Path
# ============================================================
# CONFIG
# ============================================================
# Study sites: one temperate European location, one tropical (sugarcane) one.
LOCATIONS = {
    "Arnhem_NL": {"lat": 51.985, "lon": 5.899},
    "Angata_KE": {"lat": -1.330, "lon": 34.738},
}
# Archive: last 12 months
ARCHIVE_END = datetime.date.today() - datetime.timedelta(days=2)  # ERA5 lags ~2 days
ARCHIVE_START = ARCHIVE_END - datetime.timedelta(days=365)
# Forecast: today + 7 days
FORECAST_START = datetime.date.today()
FORECAST_END = FORECAST_START + datetime.timedelta(days=7)
# Optional API keys — leave "" to skip that provider
OPENWEATHERMAP_KEY = ""  # https://openweathermap.org/api
WEATHERAPI_KEY = ""  # https://www.weatherapi.com/
# All plots are written here; NOTE: the directory is created at import time.
OUTPUT_DIR = Path("weather_comparison_plots")
OUTPUT_DIR.mkdir(exist_ok=True)
# Identifying User-Agent, sent to YR.no (met.no requires one).
USER_AGENT = "SmartCane-WeatherComparison/1.0 (research; contact via github)"
# ============================================================
# ARCHIVE FETCHERS
# ============================================================
def fetch_openmeteo_archive(lat, lon, start, end, model="era5"):
    """Fetch daily precipitation from the Open-Meteo archive API.

    ERA5 is the service default, so no ``models`` query parameter is sent
    for it (sending one with the wrong casing produces a 400). ERA5-Land
    and CERRA are requested via their lowercase model names.

    Returns a DataFrame with columns 'date' (datetime64) and 'rain_mm'
    (float, clipped to >= 0, NaN filled with 0).
    """
    # ERA5 is the default model — adding models param with wrong casing causes 400
    extra = "" if model == "era5" else f"&models={model}"
    endpoint = (
        f"https://archive-api.open-meteo.com/v1/archive"
        f"?latitude={lat}&longitude={lon}"
        f"&daily=precipitation_sum"
        f"&start_date={start}&end_date={end}"
        f"{extra}"
        f"&timezone=UTC"
    )
    resp = requests.get(endpoint, timeout=30)
    resp.raise_for_status()
    daily = resp.json()["daily"]
    frame = pd.DataFrame({
        "date": pd.to_datetime(daily["time"]),
        "rain_mm": daily["precipitation_sum"],
    })
    frame["rain_mm"] = (
        pd.to_numeric(frame["rain_mm"], errors="coerce").clip(lower=0).fillna(0)
    )
    # Open-Meteo quirk: ERA5-Land sometimes reports precipitation in meters.
    # Heuristic: a full year totalling < 50 mm is implausible for these sites,
    # so treat the values as meters and convert.
    if frame["rain_mm"].sum() < 50 and len(frame) > 30:
        frame["rain_mm"] = frame["rain_mm"] * 1000
        print(f" ⚠ Unit auto-converted m→mm (values were implausibly small)")
    return frame
def fetch_nasa_power(lat, lon, start, end):
    """NASA POWER — daily PRECTOTCORR (precipitation corrected), 0.5° grid.

    Parameters are a point location plus a date range (datetime.date).
    Days carrying the NASA fill value (-999, meaning missing data) are
    dropped; remaining values are clipped to >= 0.

    Returns a DataFrame with 'date'/'rain_mm' columns sorted by date.
    May be empty (but still has both columns) if every day was missing.
    """
    url = (
        "https://power.larc.nasa.gov/api/temporal/daily/point"
        f"?parameters=PRECTOTCORR"
        f"&community=AG"
        f"&longitude={lon}&latitude={lat}"
        f"&start={start.strftime('%Y%m%d')}&end={end.strftime('%Y%m%d')}"
        f"&format=JSON"
    )
    r = requests.get(url, timeout=60)
    r.raise_for_status()
    body = r.json()
    raw = body["properties"]["parameter"]["PRECTOTCORR"]
    records = [
        {"date": pd.to_datetime(k, format="%Y%m%d"), "rain_mm": max(v, 0)}
        for k, v in raw.items()
        if v != -999  # NASA POWER fill value
    ]
    # Fix: an all-fill response used to yield a column-less DataFrame, making
    # sort_values("date") raise KeyError. Return a typed empty frame instead.
    if not records:
        return pd.DataFrame(columns=["date", "rain_mm"])
    df = pd.DataFrame(records)
    return df.sort_values("date").reset_index(drop=True)
# ============================================================
# FORECAST FETCHERS
# ============================================================
def fetch_openmeteo_forecast(lat, lon, days=8):
    """Open-Meteo NWP forecast — default best_match model.

    Requests ``days + 1`` forecast days (today plus ``days`` ahead).
    Non-numeric / missing precipitation values are coerced to 0.

    Returns a DataFrame with 'date' (datetime64) and 'rain_mm' (float).
    """
    # Fix: removed an unused local (`end = today + timedelta(days)`) that was
    # computed but never referenced — the API's forecast_days param drives the range.
    url = (
        f"https://api.open-meteo.com/v1/forecast"
        f"?latitude={lat}&longitude={lon}"
        f"&daily=precipitation_sum"
        f"&forecast_days={days + 1}"
        f"&timezone=UTC"
    )
    r = requests.get(url, timeout=30)
    r.raise_for_status()
    body = r.json()
    df = pd.DataFrame({
        "date": pd.to_datetime(body["daily"]["time"]),
        "rain_mm": body["daily"]["precipitation_sum"],
    })
    df["rain_mm"] = pd.to_numeric(df["rain_mm"], errors="coerce").fillna(0)
    return df
def fetch_yr_forecast(lat, lon):
    """YR.no LocationForecast 2.0 — precipitation aggregated to daily totals.

    The compact timeseries is hourly for the near term, then 6-hourly.
    Each entry's "next_N_hours" block covers the interval up to the next
    entry, so window amounts are summed as-is.

    Fix: the previous code divided 6-hour amounts by 6 before summing, but
    there is only ONE entry per 6-hour window — that undercounted far-range
    days by a factor of 6. (Semantics per met.no LocationForecast docs.)

    Returns a DataFrame with 'date' (datetime64) and 'rain_mm' columns.
    """
    url = f"https://api.met.no/weatherapi/locationforecast/2.0/compact?lat={lat}&lon={lon}"
    headers = {"User-Agent": USER_AGENT}  # met.no rejects requests without an identifying UA
    r = requests.get(url, headers=headers, timeout=30)
    r.raise_for_status()
    body = r.json()
    records = []
    for entry in body["properties"]["timeseries"]:
        ts = pd.to_datetime(entry["time"])
        data = entry["data"]
        precip = 0.0
        if "next_1_hours" in data:
            precip = data["next_1_hours"]["details"].get("precipitation_amount", 0.0)
        elif "next_6_hours" in data:
            # 6-hourly part of the series: the amount covers the whole window
            # until the next entry, so add it in full (no /6 scaling).
            precip = data["next_6_hours"]["details"].get("precipitation_amount", 0.0)
        records.append({"datetime": ts, "precip_window": precip})
    window = pd.DataFrame(records)
    window["date"] = window["datetime"].dt.date
    daily = (
        window.groupby("date")["precip_window"]
        .sum()
        .reset_index()
        .rename(columns={"precip_window": "rain_mm"})
    )
    daily["date"] = pd.to_datetime(daily["date"])
    return daily
def fetch_openweathermap_forecast(lat, lon, api_key):
    """OpenWeatherMap One Call 3.0 — daily forecast (needs paid/free key).

    Days without a "rain" field are recorded as 0.0 mm. Returns a DataFrame
    with 'date' (midnight-normalized datetime64) and 'rain_mm' columns.
    """
    url = (
        f"https://api.openweathermap.org/data/3.0/onecall"
        f"?lat={lat}&lon={lon}"
        f"&exclude=current,minutely,hourly,alerts"
        f"&appid={api_key}&units=metric"
    )
    resp = requests.get(url, timeout=30)
    resp.raise_for_status()
    payload = resp.json()
    rows = [
        {
            # "dt" is a unix timestamp; normalize() drops the time-of-day part.
            "date": pd.to_datetime(entry["dt"], unit="s").normalize(),
            "rain_mm": entry.get("rain", 0.0),
        }
        for entry in payload.get("daily", [])
    ]
    return pd.DataFrame(rows)
def fetch_weatherapi_forecast(lat, lon, api_key, days=7):
    """WeatherAPI.com free forecast (up to 3 days on free tier, 14 on paid).

    Returns a DataFrame with 'date' (datetime64) and 'rain_mm' (the
    provider's daily totalprecip_mm, defaulting to 0.0 when absent).
    """
    url = (
        f"https://api.weatherapi.com/v1/forecast.json"
        f"?key={api_key}&q={lat},{lon}&days={days}&aqi=no&alerts=no"
    )
    resp = requests.get(url, timeout=30)
    resp.raise_for_status()
    forecast_days = resp.json()["forecast"]["forecastday"]
    rows = [
        {
            "date": pd.to_datetime(entry["date"]),
            "rain_mm": entry["day"].get("totalprecip_mm", 0.0),
        }
        for entry in forecast_days
    ]
    return pd.DataFrame(rows)
# ============================================================
# STATS
# ============================================================
def compare_stats(df, ref_col, other_col):
    """Return n, MAE, RMSE, bias and Pearson r of other_col against ref_col.

    Rows where either column is NaN are dropped first. With fewer than five
    valid pairs the metrics are not meaningful, so they come back as None
    (only "n" is reported). Errors are computed as other minus ref, so a
    positive Bias means other_col runs higher than the reference.
    """
    pair = df[[ref_col, other_col]].dropna()
    count = len(pair)
    if count < 5:
        return {"n": count, "MAE": None, "RMSE": None, "Bias": None, "r": None}
    err = pair[other_col] - pair[ref_col]
    return {
        "n": count,
        "MAE": round(err.abs().mean(), 2),
        "RMSE": round((err.pow(2).mean()) ** 0.5, 2),
        "Bias": round(err.mean(), 2),
        "r": round(pair[ref_col].corr(pair[other_col]), 3),
    }
# ============================================================
# PLOTTING
# ============================================================
def plot_archive(data_dict, location_name, start, end, output_dir):
    """Line plot of all archive providers for one location.

    Top panel: raw daily precipitation. Bottom panel: 30-day rolling mean,
    which makes systematic provider offsets easier to see. Providers mapped
    to None (failed fetches) or empty frames are skipped.

    Saves archive_<location>.png into output_dir and returns None.
    """
    fig, axes = plt.subplots(2, 1, figsize=(14, 8), sharex=True)
    colors = {
        "ERA5 (Open-Meteo)": "#1f77b4",
        "ERA5-Land (Open-Meteo)": "#ff7f0e",
        "CERRA (Open-Meteo)": "#2ca02c",
        "NASA POWER": "#d62728",
    }
    # Top: daily raw
    ax1 = axes[0]
    for name, df in data_dict.items():
        if df is not None and len(df) > 0:
            ax1.plot(df["date"], df["rain_mm"], label=name,
                     color=colors.get(name), linewidth=0.8, alpha=0.85)
    ax1.set_ylabel("Precipitation (mm/day)")
    # Fix: the title ran the two dates together ("{start}{end}") — the
    # separator character was lost; restore a readable one.
    ax1.set_title(f"{location_name} — Daily Precipitation (archive {start} to {end})")
    ax1.legend(fontsize=8)
    ax1.grid(True, alpha=0.3)
    # Bottom: 30-day rolling mean (min_periods=15 so the first month isn't blank)
    ax2 = axes[1]
    for name, df in data_dict.items():
        if df is not None and len(df) > 0:
            rolled = df.set_index("date")["rain_mm"].rolling(30, min_periods=15).mean()
            ax2.plot(rolled.index, rolled.values, label=name,
                     color=colors.get(name), linewidth=1.5)
    ax2.set_ylabel("30-day rolling mean (mm/day)")
    ax2.set_xlabel("Date")
    ax2.legend(fontsize=8)
    ax2.grid(True, alpha=0.3)
    ax2.xaxis.set_major_formatter(mdates.DateFormatter("%b %Y"))
    fig.autofmt_xdate()
    plt.tight_layout()
    path = output_dir / f"archive_{location_name}.png"
    plt.savefig(path, dpi=150)
    plt.close()
    print(f" Saved: {path}")
def plot_forecast(data_dict, location_name, output_dir):
    """Bar chart comparing 7-day forecasts across providers.

    data_dict maps provider name -> DataFrame with 'date'/'rain_mm' columns;
    None or empty entries are skipped. Saves forecast_<location>.png and
    returns None. When no provider has data, nothing is written.
    """
    providers = [(name, df) for name, df in data_dict.items()
                 if df is not None and len(df) > 0]
    n = len(providers)
    if n == 0:
        # Fix: bail out BEFORE creating a figure — the old code allocated a
        # figure and immediately closed it unused on this path.
        return
    fig, ax = plt.subplots(figsize=(12, 5))
    # Union of all forecast dates across providers, chronological.
    all_dates = sorted(set(
        d for _, df in providers
        for d in df["date"].dt.date.tolist()
    ))
    x = np.arange(len(all_dates))
    width = 0.8 / n  # one bar group per date, split among providers
    cmap = matplotlib.colormaps["tab10"].resampled(n)
    for i, (name, df) in enumerate(providers):
        date_map = dict(zip(df["date"].dt.date, df["rain_mm"]))
        # Days a provider doesn't cover are drawn as 0 mm.
        vals = [date_map.get(d, 0.0) for d in all_dates]
        ax.bar(x + i * width, vals, width, label=name, color=cmap(i), alpha=0.85)
    ax.set_xticks(x + width * (n - 1) / 2)  # center ticks under each bar group
    ax.set_xticklabels([d.strftime("%d %b") for d in all_dates], rotation=45, ha="right")
    ax.set_ylabel("Precipitation (mm/day)")
    ax.set_title(f"{location_name} — 7-Day Forecast Comparison")
    ax.legend(fontsize=9)
    ax.grid(True, axis="y", alpha=0.3)
    plt.tight_layout()
    path = output_dir / f"forecast_{location_name}.png"
    plt.savefig(path, dpi=150)
    plt.close()
    print(f" Saved: {path}")
def plot_vs_era5(data_dict, location_name, output_dir):
    """Each provider vs ERA5 reference: scatter + regression line.

    How to read:
    - Each panel shows one provider (y-axis) vs ERA5 (x-axis) for daily precip.
    - Points on the red diagonal = perfect agreement.
    - Points above = provider wetter than ERA5 on that day.
    - r = Pearson correlation (1 = perfect). MAE = mean absolute error in mm/day.
    - Bias = provider minus ERA5 on average (positive = provider wetter).

    Saves vs_era5_<location>.png; does nothing when the ERA5 reference or
    all comparison providers are missing/empty.
    """
    ref_name = "ERA5 (Open-Meteo)"
    ref_df = data_dict.get(ref_name)
    if ref_df is None or len(ref_df) == 0:
        return
    others = [(n, df) for n, df in data_dict.items()
              if n != ref_name and df is not None and len(df) > 0]
    if not others:
        return
    n = len(others)
    fig, axes = plt.subplots(1, n, figsize=(5 * n, 5), squeeze=False)
    colors = {
        "ERA5-Land (Open-Meteo)": "#ff7f0e",
        "CERRA (Open-Meteo)": "#2ca02c",
        "NASA POWER": "#d62728",
    }
    for i, (name, df) in enumerate(others):
        ax = axes[0][i]
        merged = ref_df.merge(df, on="date", suffixes=("_ref", "_cmp"))
        valid = merged[["rain_mm_ref", "rain_mm_cmp"]].dropna()
        color = colors.get(name, "steelblue")
        ax.scatter(valid["rain_mm_ref"], valid["rain_mm_cmp"],
                   s=4, alpha=0.35, color=color)
        # Fix: valid.max() on an empty overlap is NaN, which broke the axis
        # limits; fall back to a unit-scale panel instead.
        lim = max(valid.max().max(), 1) * 1.05 if len(valid) else 1.05
        # Perfect-agreement diagonal
        ax.plot([0, lim], [0, lim], "r--", linewidth=1, label="Perfect agreement")
        # Linear regression line (only with enough points to be meaningful)
        if len(valid) > 5:
            coeffs = np.polyfit(valid["rain_mm_ref"], valid["rain_mm_cmp"], 1)
            x_fit = np.linspace(0, lim, 100)
            ax.plot(x_fit, np.polyval(coeffs, x_fit), "k-", linewidth=1,
                    alpha=0.6, label=f"Regression (slope={coeffs[0]:.2f})")
        stats = compare_stats(merged, "rain_mm_ref", "rain_mm_cmp")
        # Fix: compare_stats returns None metrics when n < 5; the old title
        # format "{Bias:+.2f}" raised TypeError on None.
        if stats["Bias"] is None:
            metrics = f"n={stats['n']} (too few overlapping days for stats)"
        else:
            metrics = f"r={stats['r']} MAE={stats['MAE']} mm Bias={stats['Bias']:+.2f} mm"
        ax.set_xlim(0, lim); ax.set_ylim(0, lim)
        ax.set_xlabel("ERA5 (Open-Meteo) mm/day", fontsize=9)
        ax.set_ylabel(f"{name} mm/day", fontsize=9)
        ax.set_title(f"{name}\n{metrics}", fontsize=9)
        ax.legend(fontsize=7)
        ax.grid(True, alpha=0.3)
    fig.suptitle(
        f"{location_name} — Daily Precip vs ERA5 Reference\n"
        "Red dashed = perfect agreement. Points above line = provider wetter than ERA5.",
        fontsize=10
    )
    plt.tight_layout()
    path = output_dir / f"vs_era5_{location_name}.png"
    plt.savefig(path, dpi=150)
    plt.close()
    print(f" Saved: {path}")
def plot_cumulative(data_dict, location_name, output_dir):
    """Cumulative annual precipitation — most relevant for crop/irrigation context.

    Draws one running-total line per provider; the legend carries each
    provider's annual total, so disagreement on seasonal rainfall is visible
    at a glance. None/empty providers are skipped. Saves
    cumulative_<location>.png into output_dir.
    """
    fig, ax = plt.subplots(figsize=(14, 5))
    palette = {
        "ERA5 (Open-Meteo)": "#1f77b4",
        "ERA5-Land (Open-Meteo)": "#ff7f0e",
        "CERRA (Open-Meteo)": "#2ca02c",
        "NASA POWER": "#d62728",
    }
    for provider, frame in data_dict.items():
        if frame is None or len(frame) == 0:
            continue
        running = frame.set_index("date")["rain_mm"].sort_index().cumsum()
        annual_total = running.iloc[-1]
        ax.plot(
            running.index,
            running.values,
            label=f"{provider} (total: {annual_total:.0f} mm)",
            color=palette.get(provider),
            linewidth=1.8,
        )
    ax.set_ylabel("Cumulative precipitation (mm)")
    ax.set_xlabel("Date")
    ax.set_title(
        f"{location_name} — Cumulative Annual Precipitation by Provider\n"
        "Divergence = sources disagree on total seasonal rainfall"
    )
    ax.legend(fontsize=9)
    ax.grid(True, alpha=0.3)
    ax.xaxis.set_major_formatter(mdates.DateFormatter("%b %Y"))
    fig.autofmt_xdate()
    plt.tight_layout()
    out_path = output_dir / f"cumulative_{location_name}.png"
    plt.savefig(out_path, dpi=150)
    plt.close()
    print(f" Saved: {out_path}")
# ============================================================
# MAIN
# ============================================================
def run_location(loc_name, lat, lon):
    """Fetch archive + forecast data for one location, print stats, save plots.

    Side effects: network calls to all configured providers, console logging,
    and PNG files written to OUTPUT_DIR. Each provider fetch is individually
    wrapped in try/except — on failure the provider's slot is set to None so
    the stats and plotting steps simply skip it.
    """
    print(f"\n{'='*60}")
    print(f" {loc_name} ({lat}°, {lon}°)")
    print(f"{'='*60}")
    # ---- ARCHIVE ----
    print("\n[Archive]")
    archive_data = {}
    print(" Fetching Open-Meteo ERA5...")
    try:
        archive_data["ERA5 (Open-Meteo)"] = fetch_openmeteo_archive(
            lat, lon, ARCHIVE_START, ARCHIVE_END, model="era5"
        )
        print(f"{len(archive_data['ERA5 (Open-Meteo)'])} days")
    except Exception as e:
        print(f" ✗ ERA5 failed: {e}")
        archive_data["ERA5 (Open-Meteo)"] = None
    time.sleep(0.5)  # small pause between calls to stay polite to free APIs
    print(" Fetching Open-Meteo ERA5-Land...")
    try:
        archive_data["ERA5-Land (Open-Meteo)"] = fetch_openmeteo_archive(
            lat, lon, ARCHIVE_START, ARCHIVE_END, model="era5_land"
        )
        print(f"{len(archive_data['ERA5-Land (Open-Meteo)'])} days")
    except Exception as e:
        print(f" ✗ ERA5-Land failed: {e}")
        archive_data["ERA5-Land (Open-Meteo)"] = None
    time.sleep(0.5)
    # CERRA only covers Europe (roughly 20°W–45°E, 30°N–80°N)
    if -20 <= lon <= 45 and 30 <= lat <= 80:
        print(" Fetching Open-Meteo CERRA (EU only)...")
        try:
            archive_data["CERRA (Open-Meteo)"] = fetch_openmeteo_archive(
                lat, lon, ARCHIVE_START, ARCHIVE_END, model="cerra"
            )
            print(f"{len(archive_data['CERRA (Open-Meteo)'])} days")
        except Exception as e:
            print(f" ✗ CERRA failed: {e}")
            archive_data["CERRA (Open-Meteo)"] = None
    else:
        print(" Skipping CERRA (outside EU coverage)")
        archive_data["CERRA (Open-Meteo)"] = None
    time.sleep(0.5)
    print(" Fetching NASA POWER...")
    try:
        archive_data["NASA POWER"] = fetch_nasa_power(lat, lon, ARCHIVE_START, ARCHIVE_END)
        print(f"{len(archive_data['NASA POWER'])} days")
    except Exception as e:
        print(f" ✗ NASA POWER failed: {e}")
        archive_data["NASA POWER"] = None
    # Stats vs ERA5 reference (skipped entirely when ERA5 itself failed)
    print("\n Stats vs ERA5 (Open-Meteo) reference:")
    ref_df = archive_data.get("ERA5 (Open-Meteo)")
    for name, df in archive_data.items():
        if name == "ERA5 (Open-Meteo)" or df is None:
            continue
        if ref_df is None:
            continue
        merged = ref_df.merge(df, on="date", suffixes=("_ref", "_cmp"))
        stats = compare_stats(merged, "rain_mm_ref", "rain_mm_cmp")
        print(f" {name:30s} n={stats['n']:4d} MAE={stats['MAE']} "
              f"RMSE={stats['RMSE']} Bias={stats['Bias']} r={stats['r']}")
    plot_archive(archive_data, loc_name, ARCHIVE_START, ARCHIVE_END, OUTPUT_DIR)
    plot_cumulative(archive_data, loc_name, OUTPUT_DIR)
    plot_vs_era5(archive_data, loc_name, OUTPUT_DIR)
    # ---- FORECAST ----
    print("\n[Forecast]")
    forecast_data = {}
    print(" Fetching Open-Meteo forecast...")
    try:
        forecast_data["Open-Meteo Forecast"] = fetch_openmeteo_forecast(lat, lon)
        print(f"{len(forecast_data['Open-Meteo Forecast'])} days")
    except Exception as e:
        print(f" ✗ Open-Meteo forecast failed: {e}")
        forecast_data["Open-Meteo Forecast"] = None
    time.sleep(0.5)
    print(" Fetching YR.no LocationForecast...")
    try:
        forecast_data["YR.no"] = fetch_yr_forecast(lat, lon)
        print(f"{len(forecast_data['YR.no'])} days")
    except Exception as e:
        print(f" ✗ YR.no failed: {e}")
        forecast_data["YR.no"] = None
    # Keyed providers are attempted only when a key is configured in CONFIG.
    if OPENWEATHERMAP_KEY:
        time.sleep(0.5)
        print(" Fetching OpenWeatherMap forecast...")
        try:
            forecast_data["OpenWeatherMap"] = fetch_openweathermap_forecast(
                lat, lon, OPENWEATHERMAP_KEY
            )
            print(f"{len(forecast_data['OpenWeatherMap'])} days")
        except Exception as e:
            print(f" ✗ OpenWeatherMap failed: {e}")
            forecast_data["OpenWeatherMap"] = None
    if WEATHERAPI_KEY:
        time.sleep(0.5)
        print(" Fetching WeatherAPI.com forecast...")
        try:
            forecast_data["WeatherAPI.com"] = fetch_weatherapi_forecast(
                lat, lon, WEATHERAPI_KEY
            )
            print(f"{len(forecast_data['WeatherAPI.com'])} days")
        except Exception as e:
            print(f" ✗ WeatherAPI.com failed: {e}")
            forecast_data["WeatherAPI.com"] = None
    plot_forecast(forecast_data, loc_name, OUTPUT_DIR)
if __name__ == "__main__":
    # Entry point: fetch, compare and plot for every configured location.
    print(f"Weather API Comparison — {datetime.date.today()}")
    # Fix: the date-range prints ran the two dates together
    # ("{ARCHIVE_START}{ARCHIVE_END}") — the separator was lost; restore one.
    print(f"Archive: {ARCHIVE_START} to {ARCHIVE_END}")
    print(f"Forecast: {FORECAST_START} to {FORECAST_END}")
    print(f"Output: {OUTPUT_DIR.resolve()}")
    for loc_name, coords in LOCATIONS.items():
        run_location(loc_name, coords["lat"], coords["lon"])
        time.sleep(1)  # pause between locations to be polite to the free APIs
    print(f"\nDone. Plots saved to: {OUTPUT_DIR.resolve()}")