Merge branch 'code-improvements' into review_perField_code

This commit is contained in:
Timon 2026-02-18 15:17:11 +01:00
commit 55d67ef503
14 changed files with 2217 additions and 132 deletions

4
.gitignore vendored
View file

@ -45,6 +45,10 @@ output/
reports/
*.docx
# Experiment Outputs (temporary plots, analysis artifacts)
python_app/harvest_detection_experiments/*/plots/
python_app/harvest_detection_experiments/*/*.ipynb_checkpoints/
# Cache Files
rosm.cache/
*.cache

View file

@ -0,0 +1,258 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "d90f7f7f",
"metadata": {},
"source": [
"# Validate & Explore ESA `pivot.geojson`\n",
"\n",
"Quick inspection of the field boundary file for the ESA project:\n",
"- Geometry validity, CRS, field count\n",
"- Area/perimeter statistics\n",
"- Map visualization with field labels\n",
"- Export summary CSV for downstream use"
]
},
{
"cell_type": "markdown",
"id": "e99594bb",
"metadata": {},
"source": [
"## 1. Import Required Libraries"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "652485c7",
"metadata": {},
"outputs": [],
"source": [
"import geopandas as gpd\n",
"import pandas as pd\n",
"import numpy as np\n",
"import matplotlib.pyplot as plt\n",
"from pathlib import Path\n",
"from shapely.validation import make_valid\n",
"\n",
"# Project paths\n",
"PROJECT = \"esa\"\n",
"REPO_ROOT = Path.cwd().parents[2] # SmartCane_code\n",
"PROJECT_STORAGE = REPO_ROOT / \"laravel_app\" / \"storage\" / \"app\" / PROJECT\n",
"GEOJSON_PATH = PROJECT_STORAGE / \"pivot.geojson\"\n",
"DATA_DIR = PROJECT_STORAGE / \"Data\"\n",
"DATA_DIR.mkdir(parents=True, exist_ok=True)\n",
"\n",
"print(f\"Repo root: {REPO_ROOT}\")\n",
"print(f\"GeoJSON: {GEOJSON_PATH}\")\n",
"print(f\"Exists: {GEOJSON_PATH.exists()}\")"
]
},
{
"cell_type": "markdown",
"id": "4ff6d825",
"metadata": {},
"source": [
"## 2. Load GeoJSON & Basic Info"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "92012a8c",
"metadata": {},
"outputs": [],
"source": [
"gdf = gpd.read_file(GEOJSON_PATH)\n",
"\n",
"print(f\"Features: {len(gdf)}\")\n",
"print(f\"Geometry type: {gdf.geom_type.unique()}\")\n",
"print(f\"CRS: {gdf.crs}\")\n",
"print(f\"Bounds: {gdf.total_bounds}\")\n",
"print(f\"\\nColumns: {list(gdf.columns)}\")\n",
"print(f\"\\nAttribute table:\")\n",
"gdf.drop(columns=\"geometry\")"
]
},
{
"cell_type": "markdown",
"id": "edadb2de",
"metadata": {},
"source": [
"## 3. Validate Geometries"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "15b5e915",
"metadata": {},
"outputs": [],
"source": [
"# Check validity\n",
"validity = gdf.is_valid\n",
"empty = gdf.is_empty\n",
"\n",
"print(f\"Valid geometries: {validity.sum()}/{len(gdf)}\")\n",
"print(f\"Invalid geometries: {(~validity).sum()}\")\n",
"print(f\"Empty geometries: {empty.sum()}\")\n",
"\n",
"# Repair if needed\n",
"if not validity.all():\n",
" print(\"\\nRepairing invalid geometries...\")\n",
" gdf[\"geometry\"] = gdf[\"geometry\"].apply(make_valid)\n",
" print(f\"After repair: {gdf.is_valid.sum()}/{len(gdf)} valid\")\n",
"\n",
"# Remove empty\n",
"if empty.any():\n",
" gdf = gdf[~gdf.is_empty]\n",
" print(f\"After removing empty: {len(gdf)} features remaining\")"
]
},
{
"cell_type": "markdown",
"id": "c3f01400",
"metadata": {},
"source": [
"## 4. Field-Level Statistics (Area, Perimeter, Centroid)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "922ffdbd",
"metadata": {},
"outputs": [],
"source": [
"# Reproject to UTM for accurate area/perimeter (auto-detect UTM zone from centroid)\n",
"centroid = gdf.dissolve().centroid.iloc[0]\n",
"utm_zone = int((centroid.x + 180) / 6) + 1\n",
"hemisphere = \"north\" if centroid.y >= 0 else \"south\"\n",
"utm_epsg = 32600 + utm_zone if hemisphere == \"north\" else 32700 + utm_zone\n",
"print(f\"Auto-detected UTM zone: {utm_zone}{hemisphere[0].upper()} (EPSG:{utm_epsg})\")\n",
"\n",
"gdf_utm = gdf.to_crs(epsg=utm_epsg)\n",
"\n",
"# Compute stats\n",
"gdf_utm[\"area_m2\"] = gdf_utm.geometry.area\n",
"gdf_utm[\"area_ha\"] = gdf_utm[\"area_m2\"] / 10_000\n",
"gdf_utm[\"area_acres\"] = gdf_utm[\"area_ha\"] * 2.47105\n",
"gdf_utm[\"perimeter_m\"] = gdf_utm.geometry.length\n",
"centroids = gdf_utm.geometry.centroid\n",
"gdf_utm[\"centroid_x\"] = centroids.x\n",
"gdf_utm[\"centroid_y\"] = centroids.y\n",
"\n",
"# Summary table\n",
"stats = gdf_utm[[\"field\", \"area_ha\", \"area_acres\", \"perimeter_m\", \"centroid_x\", \"centroid_y\"]].copy()\n",
"stats = stats.sort_values(\"area_ha\", ascending=False).reset_index(drop=True)\n",
"print(f\"\\nTotal area: {stats['area_ha'].sum():.1f} ha ({stats['area_acres'].sum():.1f} acres)\")\n",
"print(f\"Fields: {len(stats)}\")\n",
"print(f\"Area range: {stats['area_ha'].min():.1f} {stats['area_ha'].max():.1f} ha\\n\")\n",
"stats"
]
},
{
"cell_type": "markdown",
"id": "3dbf84c7",
"metadata": {},
"source": [
"## 5. Visualize Field Boundaries"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "be1be0ea",
"metadata": {},
"outputs": [],
"source": [
"fig, ax = plt.subplots(1, 1, figsize=(12, 10))\n",
"gdf_utm.plot(ax=ax, column=\"field\", legend=True, edgecolor=\"black\", linewidth=0.8,\n",
" cmap=\"Set3\", alpha=0.7, legend_kwds={\"loc\": \"upper left\", \"fontsize\": 8})\n",
"\n",
"# Add field labels at centroids\n",
"for _, row in gdf_utm.iterrows():\n",
" c = row.geometry.centroid\n",
" ax.annotate(row[\"field\"], xy=(c.x, c.y), ha=\"center\", va=\"center\",\n",
" fontsize=7, fontweight=\"bold\",\n",
" bbox=dict(boxstyle=\"round,pad=0.2\", fc=\"white\", alpha=0.7))\n",
"\n",
"ax.set_title(f\"ESA Field Boundaries ({len(gdf_utm)} fields)\", fontsize=14)\n",
"ax.set_xlabel(\"Easting (m)\")\n",
"ax.set_ylabel(\"Northing (m)\")\n",
"plt.tight_layout()\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"id": "52301688",
"metadata": {},
"source": [
"## 6. CRS Check & Comparison"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d4c712fa",
"metadata": {},
"outputs": [],
"source": [
"# Original CRS info\n",
"print(\"=== Original CRS ===\")\n",
"print(f\"CRS: {gdf.crs}\")\n",
"print(f\"Is geographic: {gdf.crs.is_geographic if gdf.crs else 'No CRS'}\")\n",
"print(f\"Is projected: {gdf.crs.is_projected if gdf.crs else 'No CRS'}\")\n",
"\n",
"if gdf.crs and gdf.crs.is_geographic:\n",
" # Compare area in geographic vs projected CRS\n",
" area_geographic = gdf.geometry.area # in degrees² (meaningless)\n",
" area_projected = gdf_utm.geometry.area # in m²\n",
" \n",
" print(f\"\\n=== Area comparison ===\")\n",
" print(f\"Geographic CRS area (degrees²): meaningless for spatial analysis\")\n",
" print(f\"Projected UTM area (EPSG:{utm_epsg}):\")\n",
" for _, row in gdf_utm.iterrows():\n",
" print(f\" {row['field']}: {row['area_ha']:.2f} ha\")\n",
" print(f\"\\nAlways use projected CRS (UTM) for area/distance calculations.\")"
]
},
{
"cell_type": "markdown",
"id": "3a21236a",
"metadata": {},
"source": [
"## 7. Export Summary Table"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "acd4b3c7",
"metadata": {},
"outputs": [],
"source": [
"# Save summary CSV to project data directory\n",
"output_csv = DATA_DIR / \"field_summary.csv\"\n",
"stats.to_csv(output_csv, index=False)\n",
"print(f\"Saved field summary to: {output_csv}\")\n",
"print(f\"\\nFinal summary:\")\n",
"print(f\" Project: {PROJECT}\")\n",
"print(f\" Fields: {len(stats)}\")\n",
"print(f\" Total area: {stats['area_ha'].sum():.1f} ha ({stats['area_acres'].sum():.1f} acres)\")\n",
"print(f\" CRS: {gdf.crs} (original) → EPSG:{utm_epsg} (projected)\")\n",
"print(f\" All valid: {gdf.is_valid.all()}\")\n",
"stats"
]
}
],
"metadata": {
"language_info": {
"name": "python"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

View file

@ -0,0 +1,431 @@
"""
01_spectral_feature_exploration.py SC-161 Exploration
=======================================================
Explore spectral indices extracted from ESA 4-band TIFFs.
Indices (8 total from RGB+NIR):
NDVI, BSI, NDWI, CI_green, CI_red, GNDVI, SAVI, EVI2
Key questions:
1. How do indices correlate with each other? Which are redundant?
2. Does any index capture harvest signal better than CI?
3. Which indices add independent information beyond CI_green?
4. Pre/post harvest distribution shifts per index
Usage:
cd python_app/harvest_detection_experiments/angata_improvements
python 01_spectral_feature_exploration.py
"""
import matplotlib
matplotlib.use("Agg") # Non-interactive backend — save only, no window blocking
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from pathlib import Path
# =============================================================================
# CONFIG
# =============================================================================
PROJECT = "esa"
REPO_ROOT = Path(__file__).resolve().parents[3]
PROJECT_STORAGE = REPO_ROOT / "laravel_app" / "storage" / "app" / PROJECT
SPECTRAL_CSV = PROJECT_STORAGE / "Data" / "extracted_ci" / "ci_data_for_python" / "spectral_indices.csv"
HARVEST_XLSX = PROJECT_STORAGE / "harvest.xlsx"
# Output directory for plots
OUTPUT_DIR = Path(__file__).parent / "plots"
OUTPUT_DIR.mkdir(exist_ok=True)
# All mean index columns
INDEX_COLS = [
"mean_ndvi", "mean_bsi", "mean_ndwi", "mean_ci_green",
"mean_ci_red", "mean_gndvi", "mean_savi", "mean_evi2",
]
INDEX_LABELS = {
"mean_ndvi": "NDVI",
"mean_bsi": "BSI",
"mean_ndwi": "NDWI",
"mean_ci_green": "CI_green",
"mean_ci_red": "CI_red",
"mean_gndvi": "GNDVI",
"mean_savi": "SAVI",
"mean_evi2": "EVI2",
}
INDEX_COLORS = {
"mean_ndvi": "green",
"mean_bsi": "saddlebrown",
"mean_ndwi": "steelblue",
"mean_ci_green": "darkgreen",
"mean_ci_red": "darkred",
"mean_gndvi": "olive",
"mean_savi": "teal",
"mean_evi2": "purple",
}
# =============================================================================
# LOAD DATA
# =============================================================================
print("=" * 80)
print("SC-161: SPECTRAL FEATURE EXPLORATION — ESA (8 indices)")
print("=" * 80)
# Spectral indices
si = pd.read_csv(SPECTRAL_CSV, parse_dates=["Date"])
si = si[si["field"] != "00F25"] # drop always-NaN field
print(f"\nSpectral indices: {len(si)} rows, {si['field'].nunique()} fields")
print(f" Date range: {si['Date'].min().date()}{si['Date'].max().date()}")
print(f" Columns: {[c for c in si.columns if c.startswith('mean_')]}")
for col in INDEX_COLS:
if col in si.columns:
nan_pct = si[col].isna().mean() * 100
print(f" {INDEX_LABELS[col]:10s} NaN: {nan_pct:.1f}% mean: {si[col].mean():.4f} range: [{si[col].min():.4f}, {si[col].max():.4f}]")
# Harvest data
harvest = pd.read_excel(HARVEST_XLSX)
harvest["season_start"] = pd.to_datetime(harvest["season_start"], errors="coerce")
harvest["season_end"] = pd.to_datetime(harvest["season_end"], errors="coerce")
# Filter to fields we have spectral data for
our_fields = set(si["field"].unique())
harvest = harvest[harvest["field"].isin(our_fields)].copy()
# Only keep seasons with an actual end date (= confirmed harvest)
harvest_events = harvest.dropna(subset=["season_end"]).copy()
print(f"\nHarvest events (confirmed): {len(harvest_events)} across {harvest_events['field'].nunique()} fields")
# =============================================================================
# 1. FULL CORRELATION MATRIX — all 8 indices
# =============================================================================
print("\n--- Full correlation matrix (8 indices) ---")
available_cols = [c for c in INDEX_COLS if c in si.columns]
valid = si.dropna(subset=available_cols)
corr_matrix = valid[available_cols].corr()

# Console table: header row of labels, then one aligned row per index.
labels = [INDEX_LABELS[c] for c in available_cols]
print(f"\n{'':>12s}", " ".join(f"{name:>10s}" for name in labels))
for row_idx, label in enumerate(labels):
    row_cells = " ".join(
        f"{corr_matrix.iloc[row_idx, col_idx]:>10.3f}"
        for col_idx in range(len(available_cols))
    )
    print(f"{label:>12s} {row_cells}")

# Heatmap version of the same matrix, with per-cell annotations.
fig, ax = plt.subplots(figsize=(10, 8))
im = ax.imshow(corr_matrix.values, cmap="RdBu_r", vmin=-1, vmax=1, aspect="equal")
ax.set_xticks(range(len(labels)))
ax.set_xticklabels(labels, rotation=45, ha="right", fontsize=10)
ax.set_yticks(range(len(labels)))
ax.set_yticklabels(labels, fontsize=10)
ax.set_title("Spectral Index Correlation Matrix (ESA)", fontsize=14)
for i in range(len(labels)):
    for j in range(len(labels)):
        cell = corr_matrix.iloc[i, j]
        # White text on strongly-colored cells for readability
        ax.text(j, i, f"{cell:.2f}", ha="center", va="center", fontsize=9,
                color="white" if abs(cell) > 0.7 else "black")
plt.colorbar(im, ax=ax, label="Pearson r", shrink=0.8)
plt.tight_layout()
fig.savefig(OUTPUT_DIR / "correlation_matrix_all_indices.png", dpi=150, bbox_inches="tight")
plt.close()
print(f" → Saved correlation_matrix_all_indices.png")
# =============================================================================
# 2. PER-FIELD CORRELATION — CI_green vs each index
# =============================================================================
print("\n--- Per-field correlation: CI_green vs each index ---")
ci_col = "mean_ci_green"
other_cols = [c for c in available_cols if c != ci_col]

# One histogram panel per index: distribution of per-field Pearson r with CI_green.
fig, axes = plt.subplots(2, 4, figsize=(20, 10))
axes_flat = axes.flatten()
for ax, col in zip(axes_flat, other_cols):
    label = INDEX_LABELS[col]
    # Correlation computed per field; fields with <= 30 observations are dropped.
    per_field_r = valid.groupby("field").apply(
        lambda g: g[ci_col].corr(g[col]) if len(g) > 30 else np.nan
    ).dropna()
    mean_r = per_field_r.mean()
    ax.hist(per_field_r, bins=20, color=INDEX_COLORS.get(col, "gray"), edgecolor="white", alpha=0.8)
    ax.axvline(mean_r, color="red", linestyle="--", linewidth=2,
               label=f"mean={mean_r:.3f}")
    ax.set_xlabel(f"r(CI_green, {label})")
    ax.set_ylabel("Field count")
    ax.set_title(f"CI_green vs {label}")
    ax.legend(fontsize=8)
    ax.set_xlim(-1, 1)
    print(f" CI_green vs {label:8s}: mean r = {per_field_r.mean():.4f}, std = {per_field_r.std():.4f}")
# Hide any leftover panels beyond the number of indices plotted.
for spare_ax in axes_flat[len(other_cols):]:
    spare_ax.set_visible(False)
fig.suptitle("Per-field correlation: CI_green vs each index", fontsize=14, fontweight="bold")
plt.tight_layout()
fig.savefig(OUTPUT_DIR / "ci_green_vs_all_per_field_corr.png", dpi=150, bbox_inches="tight")
plt.close()
print(f" → Saved ci_green_vs_all_per_field_corr.png")
# =============================================================================
# 3. PRE/POST HARVEST SHIFTS — all indices
# =============================================================================
print("\n--- Pre/post harvest distribution shifts (all indices) ---")
WINDOW_DAYS = 30


def collect_pre_post(col):
    """Pool index values observed within WINDOW_DAYS before/after each
    confirmed harvest (season_end), across all harvest events.

    Returns (pre, post) as 1-D float arrays; either may be empty when no
    observations fall inside the corresponding window.
    """
    window = pd.Timedelta(days=WINDOW_DAYS)
    pre_vals, post_vals = [], []
    for _, h in harvest_events.iterrows():
        field_data = si[si["field"] == h["field"]]
        if field_data.empty:
            continue
        harvest_date = h["season_end"]
        pre = field_data[
            (field_data["Date"] >= harvest_date - window)
            & (field_data["Date"] < harvest_date)
        ][col].dropna()
        pre_vals.extend(pre.tolist())
        post = field_data[
            (field_data["Date"] > harvest_date)
            & (field_data["Date"] <= harvest_date + window)
        ][col].dropna()
        post_vals.extend(post.tolist())
    return np.asarray(pre_vals, dtype=float), np.asarray(post_vals, dtype=float)


shift_results = {}
for col in available_cols:
    label = INDEX_LABELS[col]
    pre_arr, post_arr = collect_pre_post(col)
    # FIX: original called .mean()/.std() on possibly-empty arrays, which
    # yields NaN plus a RuntimeWarning and propagates NaN into the ranking.
    if pre_arr.size == 0 or post_arr.size == 0:
        print(f" {label}: no observations within ±{WINDOW_DAYS}d of any harvest — skipped")
        continue
    delta = post_arr.mean() - pre_arr.mean()
    # Effect size (Cohen's d with pooled population std, as in the original)
    pooled_std = np.sqrt((pre_arr.std() ** 2 + post_arr.std() ** 2) / 2)
    cohens_d = delta / pooled_std if pooled_std > 0 else 0
    shift_results[col] = {
        "label": label,
        "pre_mean": pre_arr.mean(),
        "post_mean": post_arr.mean(),
        "delta": delta,
        "abs_delta": abs(delta),
        "cohens_d": cohens_d,
        "pre_n": len(pre_arr),
        "post_n": len(post_arr),
    }
# Print sorted by absolute Cohen's d (effect size)
print(f"\n{'Index':>12s} {'Pre':>8s} {'Post':>8s} {'Delta':>8s} {'Cohen d':>8s} {'n_pre':>6s} {'n_post':>6s}")
for col in sorted(shift_results.keys(), key=lambda c: abs(shift_results[c]["cohens_d"]), reverse=True):
    r = shift_results[col]
    print(f"{r['label']:>12s} {r['pre_mean']:>8.4f} {r['post_mean']:>8.4f} {r['delta']:>+8.4f} {r['cohens_d']:>+8.3f} {r['pre_n']:>6d} {r['post_n']:>6d}")
# Bar chart of effect sizes: raw delta (left) and |Cohen's d| (right)
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6))
sorted_cols = sorted(shift_results.keys(), key=lambda c: abs(shift_results[c]["cohens_d"]), reverse=True)
labels_sorted = [shift_results[c]["label"] for c in sorted_cols]
deltas = [shift_results[c]["delta"] for c in sorted_cols]
cohens_ds = [shift_results[c]["cohens_d"] for c in sorted_cols]
colors = [INDEX_COLORS.get(c, "gray") for c in sorted_cols]
# Raw delta
ax1.barh(labels_sorted, deltas, color=colors, alpha=0.8, edgecolor="white")
ax1.set_xlabel("Post - Pre harvest mean")
ax1.set_title("Raw shift at harvest (±30 days)")
ax1.axvline(0, color="black", linewidth=0.5)
ax1.grid(True, alpha=0.3, axis="x")
# Cohen's d with the conventional small/medium/large reference lines
ax2.barh(labels_sorted, [abs(d) for d in cohens_ds], color=colors, alpha=0.8, edgecolor="white")
ax2.set_xlabel("|Cohen's d| (effect size)")
ax2.set_title("Harvest signal strength (effect size)")
ax2.axvline(0.2, color="gray", linestyle=":", label="small (0.2)")
ax2.axvline(0.5, color="gray", linestyle="--", label="medium (0.5)")
ax2.axvline(0.8, color="gray", linestyle="-", label="large (0.8)")
ax2.legend(fontsize=8)
ax2.grid(True, alpha=0.3, axis="x")
plt.tight_layout()
fig.savefig(OUTPUT_DIR / "harvest_signal_all_indices.png", dpi=150, bbox_inches="tight")
plt.close()
print(f" → Saved harvest_signal_all_indices.png")
# =============================================================================
# 4. PRE/POST DISTRIBUTIONS — top 4 by harvest signal
# =============================================================================
print("\n--- Pre/post harvest distributions (top 4 indices by effect size) ---")
ranked = sorted(shift_results, key=lambda c: abs(shift_results[c]["cohens_d"]), reverse=True)
top4_cols = ranked[:4]
window = pd.Timedelta(days=WINDOW_DAYS)
fig, axes = plt.subplots(1, 4, figsize=(20, 5))
for ax, col in zip(axes, top4_cols):
    label = INDEX_LABELS[col]
    # Pool observations falling within the pre/post windows of every harvest.
    pre_vals, post_vals = [], []
    for _, event in harvest_events.iterrows():
        rows = si[si["field"] == event["field"]]
        if rows.empty:
            continue
        when = event["season_end"]
        before = rows.loc[(rows["Date"] >= when - window) & (rows["Date"] < when), col].dropna()
        after = rows.loc[(rows["Date"] > when) & (rows["Date"] <= when + window), col].dropna()
        pre_vals += before.tolist()
        post_vals += after.tolist()
    ax.hist(pre_vals, bins=40, alpha=0.6, color="green", label=f"Pre (n={len(pre_vals)})", density=True)
    ax.hist(post_vals, bins=40, alpha=0.6, color="brown", label=f"Post (n={len(post_vals)})", density=True)
    d = shift_results[col]["cohens_d"]
    ax.set_title(f"{label} (d={d:+.2f})")
    ax.set_xlabel(label)
    ax.set_ylabel("Density")
    ax.legend(fontsize=8)
    ax.grid(True, alpha=0.3)
fig.suptitle(f"Pre/Post Harvest ±{WINDOW_DAYS}d — Top 4 by effect size", fontsize=13, fontweight="bold")
plt.tight_layout()
fig.savefig(OUTPUT_DIR / "pre_post_harvest_top4.png", dpi=150, bbox_inches="tight")
plt.close()
print(f" → Saved pre_post_harvest_top4.png")
# =============================================================================
# 5. TIMESERIES — top field, all 8 indices
# =============================================================================
print("\n--- Timeseries: all indices for top harvest-rich fields ---")
# Three fields with the most confirmed harvest events
top_fields = harvest_events.groupby("field").size().sort_values(ascending=False).head(3).index.tolist()
print(f"Top fields: {top_fields}")
for field_id in top_fields:
    series = si.loc[si["field"] == field_id].sort_values("Date")
    events = harvest_events.loc[harvest_events["field"] == field_id]
    fig, axes = plt.subplots(4, 2, figsize=(20, 16), sharex=True)
    fig.suptitle(f"Field {field_id} — All Spectral Indices", fontsize=14, fontweight="bold")
    for ax, col in zip(axes.flatten(), available_cols):
        label = INDEX_LABELS[col]
        color = INDEX_COLORS.get(col, "gray")
        obs = series.dropna(subset=[col])
        # Raw observations (thin) plus a 14-day rolling mean (thick)
        ax.plot(obs["Date"], obs[col], color=color, linewidth=0.6, alpha=0.7)
        if len(obs) > 14:
            smooth = obs.set_index("Date")[col].rolling("14D", min_periods=3).mean()
            ax.plot(smooth.index, smooth.values, color=color, linewidth=2, alpha=0.8)
        # Mark each confirmed harvest date
        for _, ev in events.iterrows():
            ax.axvline(ev["season_end"], color="red", linestyle="--", alpha=0.6, linewidth=1)
        ax.set_ylabel(label, fontsize=10, fontweight="bold")
        ax.grid(True, alpha=0.3)
    for bottom_ax in (axes[-1, 0], axes[-1, 1]):
        bottom_ax.xaxis.set_major_formatter(mdates.DateFormatter("%Y"))
    plt.tight_layout()
    fig.savefig(OUTPUT_DIR / f"all_indices_{field_id}.png", dpi=150, bbox_inches="tight")
    plt.close()
    print(f" → Saved all_indices_{field_id}.png")
# =============================================================================
# 6. REDUNDANCY ANALYSIS — which indices add information beyond CI_green?
# =============================================================================
print("\n--- Redundancy analysis: residual variance after regressing vs CI_green ---")
# FIX: for a simple one-predictor OLS fit, R² equals the squared Pearson
# correlation, so the explicit polynomial regression (and the mid-script
# `from numpy.polynomial import polynomial` import) was unnecessary.
residual_variance = {}
for col in [c for c in available_cols if c != ci_col]:
    label = INDEX_LABELS[col]
    pair = valid[[ci_col, col]].dropna()
    if len(pair) < 100:  # too few paired observations for a stable estimate
        continue
    pearson_r = np.corrcoef(pair[ci_col].values, pair[col].values)[0, 1]
    r_squared = pearson_r ** 2
    residual_variance[col] = {
        "label": label,
        "r_squared": r_squared,
        # Share of this index's variance NOT explained by CI_green
        "residual_pct": (1 - r_squared) * 100,
    }
print(f"\n{'Index':>12s} {'R² vs CI':>10s} {'Residual %':>12s} {'Interpretation'}")
for col in sorted(residual_variance.keys(), key=lambda c: residual_variance[c]["residual_pct"], reverse=True):
    r = residual_variance[col]
    interp = "UNIQUE signal" if r["residual_pct"] > 20 else "Some unique" if r["residual_pct"] > 5 else "Redundant"
    print(f"{r['label']:>12s} {r['r_squared']:>10.4f} {r['residual_pct']:>11.1f}% {interp}")
# =============================================================================
# 7. SUMMARY
# =============================================================================
print("\n" + "=" * 80)
print("SUMMARY — SC-161 Expanded Index Analysis")
print("=" * 80)
# Rank indices once by |Cohen's d| and reuse for both headline and listing
# (the original sorted the same dict twice).
ranked_by_d = sorted(shift_results.keys(), key=lambda c: abs(shift_results[c]["cohens_d"]), reverse=True)
best = shift_results[ranked_by_d[0]]
# Most unique signal first (highest residual variance vs CI_green)
most_unique = sorted(residual_variance.keys(), key=lambda c: residual_variance[c]["residual_pct"], reverse=True)
print(f"""
INDICES COMPUTED: {', '.join(INDEX_LABELS[c] for c in available_cols)}
DATA: {si['field'].nunique()} fields, {si['Date'].nunique()} dates
HARVEST SIGNAL (strongest → weakest by Cohen's d):
""")
for col in ranked_by_d:
    r = shift_results[col]
    print(f" {r['label']:>12s}: Cohen's d = {r['cohens_d']:+.3f} (Δ = {r['delta']:+.4f})")
print(f"""
REDUNDANCY vs CI_green (unique information):
""")
for col in most_unique:
    r = residual_variance[col]
    print(f" {r['label']:>12s}: {r['residual_pct']:.1f}% unique variance (R² = {r['r_squared']:.3f})")
print(f"""
RECOMMENDATION:
Best harvest signal: {best['label']} (d={best['cohens_d']:+.3f})
Most unique vs CI: {residual_variance[most_unique[0]]['label']} ({residual_variance[most_unique[0]]['residual_pct']:.1f}% independent)
Plots saved to: {OUTPUT_DIR}
""")

View file

@ -0,0 +1,465 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "e9cdc8ee",
"metadata": {},
"source": [
"# SC-162: Season-Length Normalization Analysis\n",
"## Compare ESA vs Angata Spectral Indices (CI_green, SAVI)\n",
"\n",
"Multi-year data analysis focusing on:\n",
"- **Peak timing** (at what season-age % do peaks occur?)\n",
"- **Amplitude** (how high/low?)\n",
"- **Phase** (lag between indices)\n",
"- **Shape** (growth curve pattern)\n",
"\n",
"Note: Cannot compare absolute DOY across multiple years, but CAN compare normalized season age (0-1 scale) and pattern shapes."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "59943da7",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"import matplotlib.pyplot as plt\n",
"import matplotlib.dates as mdates\n",
"from pathlib import Path\n",
"from scipy import stats\n",
"import warnings\n",
"warnings.filterwarnings('ignore')\n",
"\n",
"# Configuration\n",
"REPO_ROOT = Path.cwd().parent.parent.parent.parent\n",
"print(f\"Repo root: {REPO_ROOT}\")\n",
"\n",
"# Project-specific config\n",
"PROJECTS = {\n",
" 'esa': {\n",
" 'expected_season_length': 365,\n",
" 'storage_path': REPO_ROOT / 'laravel_app' / 'storage' / 'app' / 'esa',\n",
" },\n",
" 'angata': {\n",
" 'expected_season_length': 540,\n",
" 'storage_path': REPO_ROOT / 'laravel_app' / 'storage' / 'app' / 'angata',\n",
" }\n",
"}\n",
"\n",
"# Load data for both projects\n",
"data = {}\n",
"harvest_data = {}\n",
"\n",
"for project, config in PROJECTS.items():\n",
" print(f\"\\n{'='*60}\")\n",
" print(f\"Loading {project.upper()}\")\n",
" print(f\"{'='*60}\")\n",
" \n",
" # Spectral indices CSV\n",
" csv_path = config['storage_path'] / 'Data' / 'extracted_ci' / 'ci_data_for_python' / 'spectral_indices.csv'\n",
" df = pd.read_csv(csv_path, parse_dates=['Date'])\n",
" df = df[df['field'] != '00F25'] # Remove always-NaN field if present\n",
" print(f\"Spectral data: {len(df)} rows, {df['field'].nunique()} fields\")\n",
" print(f\"Date range: {df['Date'].min().date()} → {df['Date'].max().date()}\")\n",
" \n",
" # Harvest dates\n",
" harvest_path = config['storage_path'] / 'Data' / 'harvest.xlsx'\n",
" harvest_df = pd.read_excel(harvest_path)\n",
" harvest_df['season_start'] = pd.to_datetime(harvest_df['season_start'], errors='coerce')\n",
" harvest_df['season_end'] = pd.to_datetime(harvest_df['season_end'], errors='coerce')\n",
" \n",
" # Filter to fields we have spectral data for\n",
" our_fields = set(df['field'].unique())\n",
" harvest_df = harvest_df[harvest_df['field'].isin(our_fields)].copy()\n",
" harvest_events = harvest_df.dropna(subset=['season_end']).copy()\n",
" print(f\"Harvest events: {len(harvest_events)} across {harvest_events['field'].nunique()} fields\")\n",
" \n",
" data[project] = df\n",
" harvest_data[project] = harvest_events"
]
},
{
"cell_type": "markdown",
"id": "260b405b",
"metadata": {},
"source": [
"## Step 1: Compute Season Age for Each Observation\n",
"\n",
"For each field-date pair, calculate days since season start and normalize to 0-1 scale."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "55a0f94f",
"metadata": {},
"outputs": [],
"source": [
"def compute_season_age(spec_df, harvest_df, project_season_length):\n",
" \"\"\"\n",
" Compute normalized season age (0-1) for each row.\n",
" \n",
" For each field, use the earliest season_start from harvest data as the baseline.\n",
" If no season_start available, use the earliest date in spectral data.\n",
" \"\"\"\n",
" spec_copy = spec_df.copy()\n",
" spec_copy['season_age'] = np.nan\n",
" spec_copy['days_since_start'] = np.nan\n",
" \n",
" for field_id in spec_copy['field'].unique():\n",
" # Get season start for this field\n",
" field_harvest = harvest_df[harvest_df['field'] == field_id]\n",
" \n",
" if not field_harvest.empty and not field_harvest['season_start'].isna().all():\n",
" season_start = field_harvest['season_start'].min()\n",
" else:\n",
" # Fallback: use first observation date\n",
" field_data = spec_copy[spec_copy['field'] == field_id]\n",
" season_start = field_data['Date'].min()\n",
" \n",
" # Compute for all rows of this field\n",
" field_mask = spec_copy['field'] == field_id\n",
" days_since = (spec_copy.loc[field_mask, 'Date'] - season_start).dt.days\n",
" spec_copy.loc[field_mask, 'days_since_start'] = days_since\n",
" spec_copy.loc[field_mask, 'season_age'] = days_since / project_season_length\n",
" \n",
" return spec_copy\n",
"\n",
"# Compute for both projects\n",
"for project, config in PROJECTS.items():\n",
" data[project] = compute_season_age(\n",
" data[project],\n",
" harvest_data[project],\n",
" config['expected_season_length']\n",
" )\n",
" print(f\"{project.upper()} season_age range: [{data[project]['season_age'].min():.3f}, {data[project]['season_age'].max():.3f}]\")"
]
},
{
"cell_type": "markdown",
"id": "b946c36c",
"metadata": {},
"source": [
"## Step 2: Plot CI_green + SAVI Trajectories by Project\n",
"\n",
"Multi-year overlay to compare:\n",
"- Peak timing (at what season-age %?)\n",
"- Amplitude (range of values)\n",
"- Phase (timing relative to each other)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c8de5a03",
"metadata": {},
"outputs": [],
"source": [
"# Create figure: 2 projects × 2 indices = 4 subplots\n",
"fig, axes = plt.subplots(2, 2, figsize=(18, 12))\n",
"\n",
"indices = ['mean_ci_green', 'mean_savi']\n",
"index_labels = {'mean_ci_green': 'CI_green (Chlorophyll)', 'mean_savi': 'SAVI (Soil-Adjusted)'}\n",
"\n",
"for row, project in enumerate(['esa', 'angata']):\n",
" df = data[project]\n",
" season_len = PROJECTS[project]['expected_season_length']\n",
" \n",
" for col, idx in enumerate(indices):\n",
" ax = axes[row, col]\n",
" \n",
" # Plot all field timeseries (normalized season age)\n",
" for field in df['field'].unique():\n",
" field_data = df[df['field'] == field].dropna(subset=[idx, 'season_age']).sort_values('season_age')\n",
" ax.plot(field_data['season_age'], field_data[idx], alpha=0.15, color='steelblue', linewidth=0.8)\n",
" \n",
" # Overlay mean trajectory with rolling average\n",
" binned = df.dropna(subset=[idx, 'season_age']).copy()\n",
" binned['season_age_bin'] = pd.cut(binned['season_age'], bins=20)\n",
" mean_trajectory = binned.groupby('season_age_bin')[idx].mean()\n",
" bin_centers = [interval.mid for interval in mean_trajectory.index]\n",
" ax.plot(bin_centers, mean_trajectory.values, color='red', linewidth=3, label='Average', zorder=10)\n",
" \n",
" ax.set_xlabel('Season Age (0 = start, 1 = expected end)', fontsize=11)\n",
" ax.set_ylabel(index_labels[idx], fontsize=11)\n",
" ax.set_title(f\"{project.upper()} - {index_labels[idx]}\\n(Season length: {season_len}d)\", fontsize=12, fontweight='bold')\n",
" ax.set_xlim(0, 1.2) # Allow slight overgrowth\n",
" ax.grid(True, alpha=0.3)\n",
" ax.legend(fontsize=9)\n",
"\n",
"plt.tight_layout()\n",
"plt.savefig('season_normalization_trajectories.png', dpi=150, bbox_inches='tight')\n",
"plt.show()\n",
"print(\"\\nSaved: season_normalization_trajectories.png\")"
]
},
{
"cell_type": "markdown",
"id": "f60a0dc5",
"metadata": {},
"source": [
"## Step 3: Peak Analysis - Where do maxima occur?"
]
},
{
 "cell_type": "code",
 "execution_count": null,
 "id": "6e149843",
 "metadata": {},
 "outputs": [],
 "source": [
  "def find_peaks_per_field(df, index_col, season_age_col='season_age'):\n",
  "    \"\"\"\n",
  "    For each field, find the peak (maximum) in the normalized season age.\n",
  "    Return statistics about where peaks occur.\n",
  "\n",
  "    Returns a DataFrame with one row per field:\n",
  "    field, peak_season_age (position of the max, 0-1), peak_value (the max itself).\n",
  "    \"\"\"\n",
  "    peaks = []\n",
  "    \n",
  "    for field in df['field'].unique():\n",
  "        # Keep only rows where both the index and the season age are present,\n",
  "        # sorted by season age so the trajectory is clean before taking the max\n",
  "        field_data = df[df['field'] == field].dropna(subset=[index_col, season_age_col]).sort_values(season_age_col)\n",
  "        \n",
  "        if len(field_data) > 10:  # Need enough data (>10 obs so a single outlier cannot define the peak)\n",
  "            max_idx = field_data[index_col].idxmax()\n",
  "            peak_season_age = field_data.loc[max_idx, season_age_col]\n",
  "            peak_value = field_data.loc[max_idx, index_col]\n",
  "            peaks.append({\n",
  "                'field': field,\n",
  "                'peak_season_age': peak_season_age,\n",
  "                'peak_value': peak_value\n",
  "            })\n",
  "    \n",
  "    return pd.DataFrame(peaks)\n",
  "\n",
  "print(\"\\n\" + \"=\"*70)\n",
  "print(\"PEAK TIMING ANALYSIS (at what season-age do indices peak?)\")\n",
  "print(\"=\"*70)\n",
  "\n",
  "# Summarize peak timing per project and per index.\n",
  "# NOTE(review): `data` is assumed to be a dict {project: DataFrame} built earlier in the notebook — confirm.\n",
  "for project in ['esa', 'angata']:\n",
  "    print(f\"\\n{project.upper()}:\")\n",
  "    print(\"-\" * 70)\n",
  "    \n",
  "    for idx, label in [('mean_ci_green', 'CI_green'), ('mean_savi', 'SAVI')]:\n",
  "        peaks_df = find_peaks_per_field(data[project], idx, 'season_age')\n",
  "        \n",
  "        print(f\"\\n  {label}:\")\n",
  "        print(f\"    Fields analyzed: {len(peaks_df)}\")\n",
  "        print(f\"    Peak season_age: mean={peaks_df['peak_season_age'].mean():.3f}, std={peaks_df['peak_season_age'].std():.3f}\")\n",
  "        print(f\"    Peak occurs at: {peaks_df['peak_season_age'].mean()*100:.1f}% through season\")\n",
  "        print(f\"    Peak value range: [{peaks_df['peak_value'].min():.4f}, {peaks_df['peak_value'].max():.4f}]\")"
 ]
},
{
"cell_type": "markdown",
"id": "1d6e97e4",
"metadata": {},
"source": [
"## Step 4: Amplitude Comparison - How strong are the signals?"
]
},
{
 "cell_type": "code",
 "execution_count": null,
 "id": "75c72340",
 "metadata": {},
 "outputs": [],
 "source": [
  "print(\"\\n\" + \"=\"*70)\n",
  "print(\"AMPLITUDE ANALYSIS (value ranges per project)\")\n",
  "print(\"=\"*70)\n",
  "\n",
  "# Amplitude = max - min of the raw index values, pooled over all fields and dates.\n",
  "# NOTE(review): a global min/max is sensitive to single-pixel outliers; consider\n",
  "# quantiles (e.g. p2/p98) if the printed ranges look implausible.\n",
  "for project in ['esa', 'angata']:\n",
  "    print(f\"\\n{project.upper()}:\")\n",
  "    print(\"-\" * 70)\n",
  "    \n",
  "    df = data[project]\n",
  "    \n",
  "    for idx, label in [('mean_ci_green', 'CI_green'), ('mean_savi', 'SAVI')]:\n",
  "        valid = df[idx].dropna()\n",
  "        amplitude = valid.max() - valid.min()\n",
  "        \n",
  "        print(f\"\\n  {label}:\")\n",
  "        print(f\"    Mean: {valid.mean():.4f}\")\n",
  "        print(f\"    Std: {valid.std():.4f}\")\n",
  "        print(f\"    Min: {valid.min():.4f}\")\n",
  "        print(f\"    Max: {valid.max():.4f}\")\n",
  "        print(f\"    Amplitude (max-min): {amplitude:.4f}\")"
 ]
},
{
"cell_type": "markdown",
"id": "8c2e3fc8",
"metadata": {},
"source": [
"## Step 5: Phase Analysis - Lead/lag between CI_green and SAVI"
]
},
{
 "cell_type": "code",
 "execution_count": null,
 "id": "867d39e9",
 "metadata": {},
 "outputs": [],
 "source": [
  "def compute_cross_correlation(df, idx1, idx2, max_lag=0.2):\n",
  "    \"\"\"\n",
  "    Compute cross-correlation between two indices to find phase lag.\n",
  "\n",
  "    Returns (best_lag, max_corr); best_lag is in units of season-age bins\n",
  "    (season split into 30 bins), or (nan, nan) when <10 usable bins.\n",
  "    NOTE(review): the max_lag parameter is currently unused — TODO confirm\n",
  "    whether the lag search should be restricted to +/-max_lag of the season.\n",
  "    \"\"\"\n",
  "    # Aggregate by season_age bins\n",
  "    binned = df.dropna(subset=[idx1, idx2, 'season_age']).copy()\n",
  "    binned['bin'] = pd.cut(binned['season_age'], bins=30)\n",
  "    \n",
  "    agg = binned.groupby('bin')[[idx1, idx2]].mean()\n",
  "    agg = agg.dropna()\n",
  "    \n",
  "    if len(agg) < 10:\n",
  "        return np.nan, np.nan\n",
  "    \n",
  "    # Normalize (z-score) so both series are on comparable scales\n",
  "    x = (agg[idx1] - agg[idx1].mean()) / agg[idx1].std()\n",
  "    y = (agg[idx2] - agg[idx2].mean()) / agg[idx2].std()\n",
  "    \n",
  "    # Cross-correlation over all possible lags ('full' mode)\n",
  "    corr = np.correlate(x.values, y.values, mode='full')\n",
  "    lags = np.arange(-len(x)+1, len(x))\n",
  "    \n",
  "    best_lag_idx = np.argmax(np.abs(corr))\n",
  "    best_lag = lags[best_lag_idx]\n",
  "    # NOTE(review): dividing by (len(x)-1) is only an approximate normalization;\n",
  "    # it does not correct for the shrinking overlap at larger |lag| — confirm acceptable\n",
  "    max_corr = corr[best_lag_idx] / (len(x) - 1)\n",
  "    \n",
  "    return best_lag, max_corr\n",
  "\n",
  "print(\"\\n\" + \"=\"*70)\n",
  "print(\"PHASE ANALYSIS (CI_green vs SAVI lag)\")\n",
  "print(\"=\"*70)\n",
  "\n",
  "for project in ['esa', 'angata']:\n",
  "    df = data[project]\n",
  "    lag, corr = compute_cross_correlation(df, 'mean_ci_green', 'mean_savi')\n",
  "    \n",
  "    print(f\"\\n{project.upper()}:\")\n",
  "    if not np.isnan(lag):\n",
  "        print(f\"  CI_green → SAVI lag: {lag:.2f} bins (~{(lag/30):.3f} of season)\")\n",
  "        print(f\"  Max cross-correlation: {corr:.3f}\")\n",
  "        print(f\"  Interpretation: {'SAVI slightly leads CI_green' if lag < 0 else 'CI_green slightly leads SAVI' if lag > 0 else 'Synchronized'}\")\n",
  "    else:\n",
  "        print(f\"  Insufficient data for lag analysis\")"
 ]
},
{
"cell_type": "markdown",
"id": "5277beb9",
"metadata": {},
"source": [
"## Step 6: Growth Curve Shape - Do they follow similar patterns?"
]
},
{
 "cell_type": "code",
 "execution_count": null,
 "id": "24685c7d",
 "metadata": {},
 "outputs": [],
 "source": [
  "# Compare growth curves directly: normalized to 0-1 on both axes\n",
  "fig, axes = plt.subplots(1, 2, figsize=(16, 6))\n",
  "\n",
  "for col, idx in enumerate(['mean_ci_green', 'mean_savi']):\n",
  "    ax = axes[col]\n",
  "    \n",
  "    for project, color in [('esa', 'blue'), ('angata', 'orange')]:\n",
  "        df = data[project]\n",
  "        binned = df.dropna(subset=[idx, 'season_age']).copy()\n",
  "        binned['bin'] = pd.cut(binned['season_age'], bins=25)\n",
  "        \n",
  "        # Mean trajectory per bin\n",
  "        traj = binned.groupby('bin')[idx].agg(['mean', 'std', 'count'])\n",
  "        traj = traj[traj['count'] > 5]  # Only bins with N>5\n",
  "        bin_centers = [interval.mid for interval in traj.index]\n",
  "        \n",
  "        # Normalize to 0-1 for visual comparison (min-max over the binned means)\n",
  "        value_range = traj['mean'].max() - traj['mean'].min()\n",
  "        normalized_vals = (traj['mean'] - traj['mean'].min()) / value_range\n",
  "        \n",
  "        ax.plot(bin_centers, normalized_vals.values, color=color, linewidth=2.5, \n",
  "                marker='o', markersize=6, label=f\"{project.upper()}\", zorder=10)\n",
  "        # FIX: scale the +/-std band by the same min-max range used for the line.\n",
  "        # Previously it was divided by traj['mean'].std(), which put the band on a\n",
  "        # different scale than the min-max-normalized trajectory.\n",
  "        ax.fill_between(bin_centers, \n",
  "                        (normalized_vals - traj['std'] / value_range).values,\n",
  "                        (normalized_vals + traj['std'] / value_range).values,\n",
  "                        alpha=0.2, color=color)\n",
  "    \n",
  "    ax.set_xlabel('Season Age (0-1)', fontsize=12)\n",
  "    ax.set_ylabel('Normalized Index (0-1)', fontsize=12)\n",
  "    ax.set_title(f\"Growth Curve Shape: {['CI_green', 'SAVI'][col]}\", fontsize=13, fontweight='bold')\n",
  "    ax.set_xlim(0, 1.1)\n",
  "    ax.set_ylim(-0.1, 1.1)\n",
  "    ax.grid(True, alpha=0.3)\n",
  "    ax.legend(fontsize=11, loc='best')\n",
  "\n",
  "plt.tight_layout()\n",
  "plt.savefig('growth_curve_comparison.png', dpi=150, bbox_inches='tight')\n",
  "plt.show()\n",
  "print(\"\\nSaved: growth_curve_comparison.png\")"
 ]
},
{
"cell_type": "markdown",
"id": "27b5e61a",
"metadata": {},
"source": [
"## Summary: Does Season-Age Normalization Work?"
]
},
{
 "cell_type": "code",
 "execution_count": null,
 "id": "9252cf10",
 "metadata": {},
 "outputs": [],
 "source": [
  "# Static decision checklist for SC-162 — the actual judgement is made by reading\n",
  "# the plots and statistics produced in the cells above; nothing is computed here.\n",
  "print(\"\\n\" + \"=\"*70)\n",
  "print(\"SC-162 FINDINGS: Season-Age Normalization Validation\")\n",
  "print(\"=\"*70)\n",
  "\n",
  "print(\"\"\"\n",
  "▶ KEY QUESTION: Do ESA and Angata follow similar growth patterns when normalized to season age?\n",
  "\n",
  "If YES (patterns align on 0-1 scale):\n",
  "  ✅ Season-age normalization is valid\n",
  "  ✅ Can train model on ESA, apply to Angata with confidence\n",
  "  ✅ Use proportional imminent window (7-8% of season length)\n",
  "\n",
  "If NO (patterns diverge):\n",
  "  ❌ Need project-specific models\n",
  "  ❌ Investigate why patterns differ (climate, variety, soil, etc.)\n",
  "  ❌ Consider multivariate approach (season_age + other features)\n",
  "\n",
  "OBSERVATIONS TO CHECK ABOVE:\n",
  "1. Peak timing: Do both peak at ~same season-age?\n",
  "2. Growth curve shape: Do normalized curves look similar?\n",
  "3. Amplitude: Is the relative range comparable?\n",
  "4. Phase: Is lag between CI_green and SAVI similar across projects?\n",
  "\"\"\")\n",
  "\n",
  "print(\"\\nNEXT STEPS:\")\n",
  "print(\"1. If patterns align → Proceed with feature engineering (Phase 5)\")\n",
  "print(\"2. If patterns differ → Debug and adjust project configs\")\n",
  "print(\"3. Test both fixed (28d) and proportional imminent windows on ESA\")"
 ]
}
],
"metadata": {
"kernelspec": {
"display_name": "pytorch_gpu",
"language": "python",
"name": "python3"
},
"language_info": {
"name": "python",
"version": "3.11.14"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

View file

@ -0,0 +1,199 @@
"""
run_spectral_extraction.py Batch extract spectral indices from all TIFFs
==========================================================================
Part of SC-161: Extract spectral indices from 4-band TIFFs (BSI, NDVI, NDWI)
Loops over all dated TIF files in a project's TIF folder, computes per-field
NDVI, BSI, NDWI and saves a single CSV with the same field/date structure
as ci_data_for_python.csv (can be joined on field + Date).
Usage:
cd python_app/harvest_detection_experiments/angata_improvements
python run_spectral_extraction.py --project esa
python run_spectral_extraction.py --project esa --tif-folder merged_final_tif
python run_spectral_extraction.py --project esa --start 2024-01-01 --end 2024-12-31
"""
import argparse
import sys
import time
from datetime import datetime
from pathlib import Path
import geopandas as gpd
import pandas as pd
# Add parent dirs to path so we can import the module
sys.path.insert(0, str(Path(__file__).parent))
from spectral_features import extract_field_spectral_indices
def find_tif_dates(tif_folder: Path) -> list[tuple[str, Path]]:
    """Collect dated TIFs (stem formatted YYYY-MM-DD) from a folder.

    Files whose stem is not a valid calendar date are silently ignored.

    Returns
    -------
    list of (date_str, path) tuples, sorted chronologically (lexicographic
    sort of YYYY-MM-DD stems is chronological).
    """

    def _is_date_stem(stem: str) -> bool:
        # strptime both parses and validates (rejects e.g. "2024-13-99")
        try:
            datetime.strptime(stem, "%Y-%m-%d")
        except ValueError:
            return False
        return True

    return [(f.stem, f) for f in sorted(tif_folder.glob("*.tif")) if _is_date_stem(f.stem)]
def main():
    """CLI entry point: batch-extract per-field spectral indices for one project.

    Pipeline: parse args -> resolve project storage paths -> validate inputs ->
    loop over all dated TIFs (optionally date-filtered) calling
    extract_field_spectral_indices -> concatenate into one DataFrame ->
    write a single CSV -> print summary statistics.

    Exits with status 1 on missing inputs or when nothing could be extracted.
    """
    parser = argparse.ArgumentParser(
        description="Extract spectral indices (NDVI, BSI, NDWI) from 4-band TIFFs per field"
    )
    parser.add_argument("--project", required=True, help="Project name (e.g. esa, angata)")
    parser.add_argument(
        "--tif-folder", default="merged_final_tif",
        help="TIF subfolder name within project storage (default: merged_final_tif)"
    )
    parser.add_argument("--start", default=None, help="Start date filter (YYYY-MM-DD)")
    parser.add_argument("--end", default=None, help="End date filter (YYYY-MM-DD)")
    parser.add_argument(
        "--output", default=None,
        help="Output CSV path. Default: {project_storage}/Data/extracted_ci/ci_data_for_python/spectral_indices.csv"
    )
    parser.add_argument("--field-id-col", default="field", help="Field ID column in GeoJSON")
    args = parser.parse_args()
    # ==========================================================================
    # RESOLVE PATHS
    # ==========================================================================
    # Navigate from this script to the repo root
    # __file__ = angata_improvements/run_spectral_extraction.py
    # parents: [0]=angata_improvements, [1]=harvest_detection_experiments, [2]=python_app, [3]=SmartCane_code
    repo_root = Path(__file__).resolve().parents[3]
    project_storage = repo_root / "laravel_app" / "storage" / "app" / args.project
    tif_folder = project_storage / args.tif_folder
    geojson_path = project_storage / "pivot.geojson"
    if args.output:
        output_csv = Path(args.output)
    else:
        # Default output lives next to the CI extraction data so it can be
        # joined with ci_data_for_python.csv on (field, Date)
        output_dir = project_storage / "Data" / "extracted_ci" / "ci_data_for_python"
        output_dir.mkdir(parents=True, exist_ok=True)
        output_csv = output_dir / "spectral_indices.csv"
    # ==========================================================================
    # VALIDATION
    # ==========================================================================
    print("=" * 80)
    print(f"SPECTRAL INDEX EXTRACTION — {args.project.upper()}")
    print("=" * 80)
    if not tif_folder.exists():
        print(f"ERROR: TIF folder not found: {tif_folder}")
        sys.exit(1)
    if not geojson_path.exists():
        print(f"ERROR: Field boundaries not found: {geojson_path}")
        sys.exit(1)
    # Load field boundaries once (reused for every TIF in the loop below)
    print(f"\nLoading field boundaries: {geojson_path}")
    gdf = gpd.read_file(geojson_path)
    n_fields = len(gdf)
    print(f"  Fields: {n_fields}")
    print(f"  Field IDs: {', '.join(gdf[args.field_id_col].astype(str).tolist())}")
    # Find all dated TIFs
    all_tifs = find_tif_dates(tif_folder)
    print(f"\nTIF files found: {len(all_tifs)}")
    # Apply date filters (plain string comparison is safe for YYYY-MM-DD stems)
    if args.start:
        all_tifs = [(d, p) for d, p in all_tifs if d >= args.start]
    if args.end:
        all_tifs = [(d, p) for d, p in all_tifs if d <= args.end]
    print(f"TIF files after filtering: {len(all_tifs)}")
    if not all_tifs:
        print("ERROR: No TIF files to process.")
        sys.exit(1)
    print(f"Date range: {all_tifs[0][0]} → {all_tifs[-1][0]}")
    print(f"Output: {output_csv}")
    print()
    # ==========================================================================
    # EXTRACTION LOOP
    # ==========================================================================
    all_rows = []
    errors = 0
    t0 = time.time()
    for i, (date_str, tif_path) in enumerate(all_tifs):
        try:
            df = extract_field_spectral_indices(
                tif_path, gdf, field_id_col=args.field_id_col
            )
            # Insert the acquisition date right after the field column so the
            # CSV layout matches ci_data_for_python.csv
            df.insert(1, "Date", date_str)
            all_rows.append(df)
        except Exception as e:
            # Best-effort batch: one bad TIF must not abort the whole run
            errors += 1
            print(f"  [{i+1}/{len(all_tifs)}] ERROR {date_str}: {e}")
            continue
        # Progress every 100 files
        if (i + 1) % 100 == 0 or (i + 1) == len(all_tifs):
            elapsed = time.time() - t0
            rate = (i + 1) / elapsed
            remaining = (len(all_tifs) - i - 1) / rate if rate > 0 else 0
            print(
                f"  [{i+1}/{len(all_tifs)}] {date_str} "
                f"({rate:.1f} files/s, ~{remaining:.0f}s remaining)"
            )
    elapsed_total = time.time() - t0
    # ==========================================================================
    # COMBINE & SAVE
    # ==========================================================================
    if not all_rows:
        print("ERROR: No data extracted.")
        sys.exit(1)
    result = pd.concat(all_rows, ignore_index=True)
    # Sort by field, date for consistency with CI data
    result = result.sort_values(["field", "Date"]).reset_index(drop=True)
    # Save
    output_csv.parent.mkdir(parents=True, exist_ok=True)
    result.to_csv(output_csv, index=False)
    # ==========================================================================
    # SUMMARY
    # ==========================================================================
    print()
    print("=" * 80)
    print("EXTRACTION COMPLETE")
    print("=" * 80)
    print(f"  Dates processed: {len(all_tifs)}")
    print(f"  Errors: {errors}")
    print(f"  Total rows: {len(result)}")
    print(f"  Fields: {result['field'].nunique()}")
    print(f"  Date range: {result['Date'].min()} → {result['Date'].max()}")
    print(f"  Time: {elapsed_total:.1f}s ({len(all_tifs)/elapsed_total:.1f} files/s)")
    print()
    # Quick stats (value range sanity check on the three headline indices)
    for col in ["mean_ndvi", "mean_bsi", "mean_ndwi"]:
        vals = result[col].dropna()
        if len(vals) > 0:
            print(f"  {col}: [{vals.min():.4f}, {vals.max():.4f}] (mean={vals.mean():.4f})")
    nan_pct = result[["mean_ndvi", "mean_bsi", "mean_ndwi"]].isna().mean() * 100
    print(f"\n  NaN rates: NDVI={nan_pct['mean_ndvi']:.1f}%, BSI={nan_pct['mean_bsi']:.1f}%, NDWI={nan_pct['mean_ndwi']:.1f}%")
    print(f"\n  Output: {output_csv}")
if __name__ == "__main__":
    main()

View file

@ -0,0 +1,241 @@
"""
spectral_features.py Extract per-field spectral indices from 4-band GeoTIFFs
================================================================================
Part of SC-161: Extract spectral indices from 4-band TIFFs (BSI, NDVI, NDWI)
Band order in TIFFs: [R, G, B, NIR] (uint16)
Spectral Indices:
NDVI = (NIR - R) / (NIR + R) vegetation density
BSI = ((R + NIR) - (G + B)) / ((R + NIR) + (G + B)) bare soil detection
NDWI = (G - NIR) / (G + NIR) moisture content
CI_green = NIR / Green - 1 chlorophyll index (= SmartCane CI)
CI_red = NIR / Red - 1 chlorophyll index (red-based)
GNDVI = (NIR - G) / (NIR + G) green NDVI (less saturation)
SAVI = 1.5 * (NIR - R) / (NIR + R + 0.5) soil-adjusted vegetation
EVI2 = 2.5 * (NIR - R) / (NIR + 2.4*R + 1) enhanced vegetation (2-band)
"""
import numpy as np
import pandas as pd
import geopandas as gpd
import rasterio
from rasterio.mask import mask as rio_mask
from pathlib import Path
from typing import Optional
# =============================================================================
# SPECTRAL INDEX CALCULATIONS
# =============================================================================
def _safe_normalized_diff(a: np.ndarray, b: np.ndarray) -> np.ndarray:
"""Compute (a - b) / (a + b) with safe division, masking invalid pixels."""
with np.errstate(divide="ignore", invalid="ignore"):
result = (a - b) / (a + b)
result[~np.isfinite(result)] = np.nan
return result
def compute_ndvi(red: np.ndarray, nir: np.ndarray) -> np.ndarray:
    """NDVI = (NIR - R) / (NIR + R); NaN-safe via the shared normalized-diff helper."""
    return _safe_normalized_diff(nir, red)
def compute_bsi(red: np.ndarray, green: np.ndarray, blue: np.ndarray, nir: np.ndarray) -> np.ndarray:
    """BSI = ((R + NIR) - (G + B)) / ((R + NIR) + (G + B)); NaN where the sum is zero/invalid."""
    soil_term = red + nir
    veg_term = green + blue
    with np.errstate(divide="ignore", invalid="ignore"):
        bsi = (soil_term - veg_term) / (soil_term + veg_term)
    bsi[~np.isfinite(bsi)] = np.nan
    return bsi
def compute_ndwi(green: np.ndarray, nir: np.ndarray) -> np.ndarray:
    """NDWI = (G - NIR) / (G + NIR); NaN-safe via the shared normalized-diff helper."""
    return _safe_normalized_diff(green, nir)
def compute_ci_green(green: np.ndarray, nir: np.ndarray) -> np.ndarray:
    """CI_green = NIR / Green - 1 (matches the SmartCane pipeline CI); NaN where Green is 0."""
    with np.errstate(divide="ignore", invalid="ignore"):
        ci = (nir / green) - 1.0
    ci[~np.isfinite(ci)] = np.nan
    return ci
def compute_ci_red(red: np.ndarray, nir: np.ndarray) -> np.ndarray:
    """CI_red = NIR / Red - 1 (more sensitive at low chlorophyll); NaN where Red is 0."""
    with np.errstate(divide="ignore", invalid="ignore"):
        ci = (nir / red) - 1.0
    ci[~np.isfinite(ci)] = np.nan
    return ci
def compute_gndvi(green: np.ndarray, nir: np.ndarray) -> np.ndarray:
    """GNDVI = (NIR - G) / (NIR + G); NaN-safe via the shared normalized-diff helper."""
    return _safe_normalized_diff(nir, green)
def compute_savi(red: np.ndarray, nir: np.ndarray, L: float = 0.5) -> np.ndarray:
    """SAVI = (1 + L) * (NIR - R) / (NIR + R + L); L=0.5 dampens soil-brightness effects."""
    scale = 1.0 + L
    with np.errstate(divide="ignore", invalid="ignore"):
        savi = scale * (nir - red) / (nir + red + L)
    savi[~np.isfinite(savi)] = np.nan
    return savi
def compute_evi2(red: np.ndarray, nir: np.ndarray) -> np.ndarray:
    """EVI2 = 2.5 * (NIR - R) / (NIR + 2.4*R + 1); two-band enhanced vegetation index."""
    numerator = 2.5 * (nir - red)
    denominator = nir + 2.4 * red + 1.0
    with np.errstate(divide="ignore", invalid="ignore"):
        evi2 = numerator / denominator
    evi2[~np.isfinite(evi2)] = np.nan
    return evi2
# =============================================================================
# PER-FIELD EXTRACTION
# =============================================================================
def extract_field_spectral_indices(
    tif_path: str | Path,
    field_boundaries_gdf: gpd.GeoDataFrame,
    field_id_col: str = "field",
    nodata_threshold: float = 0.0,
    min_valid_fraction: float = 0.1,
) -> pd.DataFrame:
    """
    Extract mean/std spectral indices per field from a single 4-band TIF.
    Parameters
    ----------
    tif_path : path to 4-band GeoTIFF (R, G, B, NIR uint16)
    field_boundaries_gdf : GeoDataFrame with field polygons
    field_id_col : column name for field identifier
    nodata_threshold : pixels with ALL bands <= this value are treated as nodata
    min_valid_fraction : minimum fraction of valid pixels required (else NaN)
    Returns
    -------
    DataFrame with columns:
        field, mean_ndvi, mean_bsi, mean_ndwi, mean_ci_green, mean_ci_red,
        mean_gndvi, mean_savi, mean_evi2,
        std_ndvi, std_bsi, std_ndwi, std_ci_green, std_ci_red,
        std_gndvi, std_savi, std_evi2,
        valid_pixel_count, total_pixel_count
    """
    tif_path = Path(tif_path)
    rows = []
    with rasterio.open(tif_path) as src:
        tif_crs = src.crs
        # Reproject field boundaries to match TIF CRS if needed
        gdf = field_boundaries_gdf.copy()
        if gdf.crs is not None and not gdf.crs.equals(tif_crs):
            gdf = gdf.to_crs(tif_crs)
        elif gdf.crs is None:
            # Assume EPSG:4326 if not set
            # NOTE(review): silent assumption — confirm upstream GeoJSONs are always WGS84
            gdf = gdf.set_crs("EPSG:4326").to_crs(tif_crs)
        for _, row_geom in gdf.iterrows():
            field_id = row_geom[field_id_col]
            geom = [row_geom.geometry.__geo_interface__]
            try:
                # crop=True restricts to the polygon's bounding window;
                # all_touched=True includes every pixel the boundary touches
                out_image, _ = rio_mask(src, geom, crop=True, all_touched=True, nodata=0)
            except Exception:
                # Field doesn't overlap with this TIF
                rows.append(_make_nan_row(field_id))
                continue
            # out_image shape: (bands, height, width) — convert to float
            bands = out_image.astype(np.float64)
            red = bands[0]
            green = bands[1]
            blue = bands[2]
            nir = bands[3]
            # Mask: valid pixels have at least one band > nodata_threshold
            # (equivalently: nodata pixels have ALL bands <= the threshold)
            valid_mask = np.any(bands > nodata_threshold, axis=0)
            # NOTE(review): total_pixels counts the full cropped rectangle, so
            # min_valid_fraction is relative to the bounding box rather than
            # the polygon area — TODO confirm this is intended
            total_pixels = valid_mask.size
            valid_pixels = int(valid_mask.sum())
            if valid_pixels < max(1, int(total_pixels * min_valid_fraction)):
                rows.append(_make_nan_row(field_id, valid_pixels, total_pixels))
                continue
            # Compute indices (only on valid pixels)
            # Indices are computed over the whole crop (divide warnings are
            # suppressed inside the compute_* helpers); invalid pixels are
            # dropped below via valid_mask before aggregating.
            ndvi = compute_ndvi(red, nir)
            bsi = compute_bsi(red, green, blue, nir)
            ndwi = compute_ndwi(green, nir)
            ci_green = compute_ci_green(green, nir)
            ci_red = compute_ci_red(red, nir)
            gndvi = compute_gndvi(green, nir)
            savi = compute_savi(red, nir)
            evi2 = compute_evi2(red, nir)
            # Apply mask
            ndvi_valid = ndvi[valid_mask]
            bsi_valid = bsi[valid_mask]
            ndwi_valid = ndwi[valid_mask]
            ci_green_valid = ci_green[valid_mask]
            ci_red_valid = ci_red[valid_mask]
            gndvi_valid = gndvi[valid_mask]
            savi_valid = savi[valid_mask]
            evi2_valid = evi2[valid_mask]
            # nanmean/nanstd skip pixels whose index was undefined (e.g. 0/0)
            rows.append({
                "field": field_id,
                "mean_ndvi": np.nanmean(ndvi_valid),
                "mean_bsi": np.nanmean(bsi_valid),
                "mean_ndwi": np.nanmean(ndwi_valid),
                "mean_ci_green": np.nanmean(ci_green_valid),
                "mean_ci_red": np.nanmean(ci_red_valid),
                "mean_gndvi": np.nanmean(gndvi_valid),
                "mean_savi": np.nanmean(savi_valid),
                "mean_evi2": np.nanmean(evi2_valid),
                "std_ndvi": np.nanstd(ndvi_valid),
                "std_bsi": np.nanstd(bsi_valid),
                "std_ndwi": np.nanstd(ndwi_valid),
                "std_ci_green": np.nanstd(ci_green_valid),
                "std_ci_red": np.nanstd(ci_red_valid),
                "std_gndvi": np.nanstd(gndvi_valid),
                "std_savi": np.nanstd(savi_valid),
                "std_evi2": np.nanstd(evi2_valid),
                "valid_pixel_count": valid_pixels,
                "total_pixel_count": total_pixels,
            })
    return pd.DataFrame(rows)
def _make_nan_row(
field_id: str,
valid_pixels: int = 0,
total_pixels: int = 0,
) -> dict:
"""Return a row with NaN values for a field that couldn't be processed."""
return {
"field": field_id,
"mean_ndvi": np.nan,
"mean_bsi": np.nan,
"mean_ndwi": np.nan,
"mean_ci_green": np.nan,
"mean_ci_red": np.nan,
"mean_gndvi": np.nan,
"mean_savi": np.nan,
"mean_evi2": np.nan,
"std_ndvi": np.nan,
"std_bsi": np.nan,
"std_ndwi": np.nan,
"std_ci_green": np.nan,
"std_ci_red": np.nan,
"std_gndvi": np.nan,
"std_savi": np.nan,
"std_evi2": np.nan,
"valid_pixel_count": valid_pixels,
"total_pixel_count": total_pixels,
}

View file

@ -153,7 +153,6 @@ setup_project_directories <- function(project_dir, data_source = "merged_tif") {
# TIER 2: PER-FIELD TIFFS (Script 10 output)
field_tiles_dir <- here(laravel_storage_dir, "field_tiles")
field_tiles_ci_dir <- here(laravel_storage_dir, "field_tiles_CI")
daily_tiles_split_dir <- here(laravel_storage_dir, "daily_tiles_split")
# SUPPORT TIER: DATA DIRECTORY (define early for use in later tiers)
data_dir <- here(laravel_storage_dir, "Data")
@ -170,6 +169,7 @@ setup_project_directories <- function(project_dir, data_source = "merged_tif") {
# TIER 5: MOSAICS (Script 40 output)
weekly_mosaic_dir <- here(laravel_storage_dir, "weekly_mosaic")
weekly_tile_max_dir <- here(laravel_storage_dir, "weekly_tile_max")
# TIER 6: KPI & REPORTING (Scripts 80/90/91 output)
reports_dir <- here(laravel_storage_dir, "reports")
@ -182,11 +182,11 @@ setup_project_directories <- function(project_dir, data_source = "merged_tif") {
# Create all directories
all_dirs <- c(
merged_tif_folder, field_tiles_dir, field_tiles_ci_dir, daily_tiles_split_dir,
merged_tif_folder, field_tiles_dir, field_tiles_ci_dir,
extracted_ci_base_dir, daily_ci_vals_dir, cumulative_ci_vals_dir, ci_for_python_dir,
growth_model_interpolated_dir,
weekly_mosaic_dir,
reports_dir, kpi_reports_dir,
weekly_mosaic_dir, weekly_tile_max_dir,
reports_dir, kpi_reports_dir, kpi_field_stats_dir, kpi_field_analysis_dir,
data_dir, vrt_dir, harvest_dir, log_dir
)
@ -208,7 +208,6 @@ setup_project_directories <- function(project_dir, data_source = "merged_tif") {
# TIER 2: Per-field TIFFs
field_tiles_dir = field_tiles_dir,
field_tiles_ci_dir = field_tiles_ci_dir,
daily_tiles_split_dir = daily_tiles_split_dir,
# TIER 3: CI extraction
extracted_ci_base_dir = extracted_ci_base_dir,
@ -221,6 +220,7 @@ setup_project_directories <- function(project_dir, data_source = "merged_tif") {
# TIER 5: Mosaics
weekly_mosaic_dir = weekly_mosaic_dir,
weekly_tile_max_dir = weekly_tile_max_dir,
# TIER 6: KPI & reporting
reports_dir = reports_dir,
@ -509,109 +509,6 @@ setup_logging <- function(log_dir) {
))
}
# ==============================================================================
# SECTION 6B: DATA SOURCE DETECTION
# ==============================================================================
#' Detect data source for project
#'
#' Returns the data source directory (always "merged_tif" for consistency)
#'
#' @param project_dir Character. Project name
#' @return Character. "merged_tif"
detect_data_source <- function(project_dir) {
return("merged_tif")
}
#' Detect tile structure from merged TIF directory
#'
#' Checks if tiles exist by looking for:
#' 1. tiling_config.json metadata file (most reliable)
#' 2. Grid subdirectories (5x5, 10x10, etc.)
#' 3. Tile-named files (*_XX.tif pattern)
#'
#' @param merged_final_tif_dir Character. Path to merged TIF directory
#' @param daily_tiles_split_dir Character. Optional path to tiles directory
#' @return List with has_tiles (logical), detected_tiles, total_files, source, grid_size
detect_tile_structure_from_merged_final <- function(merged_final_tif_dir, daily_tiles_split_dir = NULL) {
# PRIORITY 1: Check for tiling_config.json metadata file from script 10
if (!is.null(daily_tiles_split_dir) && dir.exists(daily_tiles_split_dir)) {
config_files <- list.files(daily_tiles_split_dir,
pattern = "tiling_config\\.json$",
recursive = TRUE,
full.names = TRUE)
if (length(config_files) > 0) {
config_file <- config_files[which.max(file.info(config_files)$mtime)]
tryCatch({
config_json <- jsonlite::read_json(config_file)
return(list(
has_tiles = config_json$has_tiles %||% TRUE,
detected_tiles = character(),
total_files = 0,
source = "tiling_config.json",
grid_size = config_json$grid_size %||% "unknown"
))
}, error = function(e) {
warning("Error reading tiling_config.json: ", e$message)
})
}
}
# PRIORITY 2: File-based detection (fallback)
if (!dir.exists(merged_final_tif_dir)) {
return(list(
has_tiles = FALSE,
detected_tiles = character(),
total_files = 0,
source = "directory_not_found"
))
}
# Check for grid-size subdirectories (5x5, 10x10, etc.)
grid_subfolders <- list.dirs(merged_final_tif_dir, full.names = FALSE, recursive = FALSE)
grid_patterns <- grep("^\\d+x\\d+$", grid_subfolders, value = TRUE)
if (length(grid_patterns) > 0) {
grid_size <- grid_patterns[1]
grid_dir <- file.path(merged_final_tif_dir, grid_size)
sample_tiles <- list.files(grid_dir, pattern = "\\.tif$", recursive = TRUE)[1:3]
return(list(
has_tiles = TRUE,
detected_tiles = sample_tiles,
total_files = length(sample_tiles),
source = "grid_subdirectory_detection",
grid_size = grid_size,
grid_path = grid_dir
))
}
# Check for tile-named files (*_XX.tif pattern)
tif_files <- list.files(merged_final_tif_dir, pattern = "\\.tif$", full.names = FALSE)
if (length(tif_files) == 0) {
return(list(
has_tiles = FALSE,
detected_tiles = character(),
total_files = 0,
source = "no_files_found"
))
}
tile_pattern <- "_(\\d{2})\\.tif$"
tile_files <- tif_files[grepl(tile_pattern, tif_files)]
has_tiles <- length(tile_files) > 0
return(list(
has_tiles = has_tiles,
detected_tiles = tile_files,
total_files = length(tif_files),
source = "file_pattern_detection"
))
}
# ==============================================================================
# SECTION 7: MOSAIC & KPI VERIFICATION HELPERS
# ==============================================================================
@ -619,19 +516,13 @@ detect_tile_structure_from_merged_final <- function(merged_final_tif_dir, daily_
#' Detect mosaic mode from project structure
#'
#' Determine mosaic architecture (legacy detection function)
#'
#' NOTE: This is a legacy function kept for backward compatibility.
#' The project has moved to per-field (single-file) architecture.
#' `weekly_tile_max` is no longer created in all_dirs, so this will always return "single-file"
#'
#' Determines if project uses "tiled" (legacy) or "single-file" (per-field) mosaics
#'
#' @param project_dir Character. Project name
#' @return Character. "tiled" or "single-file" (now always "single-file")
#' @return Character. "tiled" or "single-file"
detect_mosaic_mode <- function(project_dir) {
# Per-field architecture is standard - always return "single-file"
# Legacy support: check if weekly_tile_max exists (it won't in standard setup)
# unless weekly_tile_max directory exists with content
mosaic_tiled_dir <- file.path("laravel_app", "storage", "app", project_dir, "weekly_tile_max")
if (dir.exists(mosaic_tiled_dir) && length(list.files(mosaic_tiled_dir)) > 0) {
@ -664,15 +555,11 @@ get_project_storage_path <- function(project_dir, subdir = NULL) {
return(path)
}
#' Get mosaic directory (legacy function)
#'
#' NOTE: This is a legacy helper kept for backward compatibility.
#' In the standard per-field workflow, this returns weekly_mosaic directory.
#' The "tiled" mode is no longer created (weekly_tile_max_dir was removed from all_dirs).
#' Get mosaic directory
#'
#' @param project_dir Character. Project name
#' @param mosaic_mode Character. "tiled" or "single-file" (auto-detects if "auto")
#' @return Character. Full path to mosaic directory (typically weekly_mosaic)
#' @param mosaic_mode Character. "tiled" or "single-file"
#' @return Character. Full path to mosaic directory
get_mosaic_dir <- function(project_dir, mosaic_mode = "auto") {
if (mosaic_mode == "auto") {
mosaic_mode <- detect_mosaic_mode(project_dir)
@ -724,14 +611,11 @@ check_harvest_output_exists <- function(project_dir, week_num, year_num) {
file.exists(path)
}
#' Get mosaic verification directory (legacy function)
#'
#' NOTE: This is a legacy helper kept for backward compatibility.
#' Standard workflow uses weekly_mosaic; tiled mode is no longer created.
#' Get mosaic verification directory
#'
#' @param project_dir Character. Project name
#' @param mosaic_mode Character. "tiled" or "single-file"
#' @return Character. Full path to mosaic directory for verification
#' @return Character. Full path to mosaic directory
get_mosaic_verification_dir <- function(project_dir, mosaic_mode) {
base <- file.path("laravel_app", "storage", "app", project_dir)

View file

@ -456,6 +456,10 @@ cum_ci_plot <- function(pivotName, ci_quadrant_data = CI_quadrant, plot_type = "
x_label <- switch(x_unit,
"days" = if (facet_on) "Date" else "Age of Crop (Days)",
"weeks" = "Week Number")
# Calculate dynamic max values for breaks
max_dah <- max(plot_data$DAH, na.rm = TRUE) + 20
max_week <- max(as.numeric(plot_data$week), na.rm = TRUE) + ceiling(20 / 7)
# Create plot with either facets by season or overlay by DAH/week
if (facet_on) {
@ -552,7 +556,7 @@ cum_ci_plot <- function(pivotName, ci_quadrant_data = CI_quadrant, plot_type = "
if (x_var == "DAH") {
ggplot2::scale_x_continuous(breaks = seq(0, 450, by = 50), sec.axis = ggplot2::sec_axis(~ . / 30.44, name = "Age in Months", breaks = seq(0, 14, by = 1)))
} else if (x_var == "week") {
ggplot2::scale_x_continuous(breaks = seq(0, 64, by = 4), sec.axis = ggplot2::sec_axis(~ . / 4.348, name = "Age in Months", breaks = seq(0, 14, by = 1)))
ggplot2::scale_x_continuous(breaks = seq(0, max_week, by = 4), sec.axis = ggplot2::sec_axis(~ . / 4.348, name = "Age in Months", breaks = seq(0, 14, by = 1)))
}
} +
ggplot2::theme_minimal() +
@ -605,14 +609,18 @@ cum_ci_plot <- function(pivotName, ci_quadrant_data = CI_quadrant, plot_type = "
x_label <- switch(x_unit,
"days" = if (facet_on) "Date" else "Age of Crop (Days)",
"weeks" = "Week Number")
# Choose color palette based on colorblind_friendly flag
color_scale <- if (colorblind_friendly) {
ggplot2::scale_color_brewer(type = "qual", palette = "Set2")
} else {
ggplot2::scale_color_discrete()
}
# Calculate dynamic max values for breaks
max_dah_both <- max(plot_data_both$DAH, na.rm = TRUE) + 20
max_week_both <- max(as.numeric(plot_data_both$week), na.rm = TRUE) + ceiling(20 / 7)
# Create the faceted plot
g_both <- ggplot2::ggplot(data = plot_data_both) +
# Add benchmark lines first (behind season lines)
@ -676,7 +684,7 @@ cum_ci_plot <- function(pivotName, ci_quadrant_data = CI_quadrant, plot_type = "
if (x_var == "DAH") {
ggplot2::scale_x_continuous(breaks = seq(0, 450, by = 50), sec.axis = ggplot2::sec_axis(~ . / 30.44, name = "Age in Months", breaks = seq(0, 14, by = 1)))
} else if (x_var == "week") {
ggplot2::scale_x_continuous(breaks = seq(0, 64, by = 4), sec.axis = ggplot2::sec_axis(~ . / 4.348, name = "Age in Months", breaks = seq(0, 14, by = 1)))
ggplot2::scale_x_continuous(breaks = seq(0, max_week_both, by = 4), sec.axis = ggplot2::sec_axis(~ . / 4.348, name = "Age in Months", breaks = seq(0, 14, by = 1)))
} else if (x_var == "Date") {
ggplot2::scale_x_date(breaks = "1 month", date_labels = "%b-%Y", sec.axis = ggplot2::sec_axis(~ ., name = "Age in Months", breaks = scales::breaks_pretty()))
}

View file

@ -254,6 +254,7 @@
<h2>ODK Data Viewer</h2>
<p>Upload ODK CSVs to visualise and turn field corner points into polygons</p>
<ul class="app-features">
<li>Upload ODK CSVs</li>
<li>View ODK data</li>
<li>Turn corner points into polygons</li>
<li>Download created polygons</li>
@ -261,6 +262,22 @@
<a href="./odk_viewer/" class="app-btn">Open Viewer</a>
</div>
</div>
<!-- Unique ID Generator -->
<div class="app-card">
<div class="app-icon">🆔</div>
<div class="app-content">
<h2>Unique ID generator</h2>
<p>Generate unique IDs for fields from different clients</p>
<ul class="app-features">
<li>Upload GeoJSONs</li>
<li>Identify client</li>
<li>Generate unique IDs following SmartCane structure</li>
<li>Download GeoJSON with new IDs</li>
</ul>
<a href="./uid_generator/" class="app-btn">Open Editor</a>
</div>
</div>
</div>
<footer>

View file

@ -5,6 +5,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>SmartCane Web Apps - Login</title>
<link rel="stylesheet" href="theme.css">
<link rel="icon" type="image/png" href="res/key.png">
<style>
* {
margin: 0;

BIN
webapps/res/id.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 4.5 KiB

BIN
webapps/res/key.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 8.4 KiB

View file

@ -0,0 +1,317 @@
// --- Global State -----------------------------------------------------------
// Parsed rows from the client-list Excel file ({country, name, code}).
let clientList = [];
// Raw uploaded GeoJSON awaiting ID generation.
let rawGeoJSON = null;
// Optional previously-edited GeoJSON used to continue the ID numbering.
let prevGeoJSON = null;
// Output GeoJSON with freshly assigned sc_id values (null until processed).
let generatedGeoJSON = null;
// Client code guessed from an uploaded filename; applied once the list loads.
let detectedClientCode = null;
// --- DOM Elements ------------------------------------------------------------
const clientListInput = document.getElementById('clientListInput');
const rawGeoJSONInput = document.getElementById('rawGeoJSONInput');
const prevGeoJSONInput = document.getElementById('prevGeoJSONInput');
const clientSelect = document.getElementById('clientSelect');
const processBtn = document.getElementById('processBtn');
const downloadBtn = document.getElementById('downloadBtn');
const controlsSection = document.getElementById('controlsSection');
const diffViewer = document.getElementById('diffViewer');
const rawTableContainer = document.getElementById('rawTableContainer');
const editedTableContainer = document.getElementById('editedTableContainer');
const clearAllBtn = document.getElementById('clearAllBtn');
// --- Event Listeners ---------------------------------------------------------
clientListInput.addEventListener('change', handleClientListUpload);
rawGeoJSONInput.addEventListener('change', handleRawGeoJSONUpload);
prevGeoJSONInput.addEventListener('change', handlePrevGeoJSONUpload);
processBtn.addEventListener('click', processGeoJSON);
downloadBtn.addEventListener('click', downloadGeoJSON);
clientSelect.addEventListener('change', () => {
    // If we have raw data, re-process or enable processing
    if (rawGeoJSON) {
        processBtn.disabled = false;
    }
});
// Clear All Logic: a full page reload is the simplest way to reset every
// global above plus all file inputs and rendered tables at once.
clearAllBtn.addEventListener('click', () => {
    if(confirm('Are you sure you want to clear all data and reset?')) {
        window.location.reload();
    }
});
// Helper: try to pre-select the client dropdown based on an uploaded file's
// name. Expected pattern: an uppercase alphanumeric client code followed by
// an underscore or a dot, e.g. "ESA_fields.geojson" -> "ESA".
function tryAutoSelectClient(filename) {
    if (!filename) {
        return;
    }
    // Reset any previous selection and detection before matching the new name.
    clientSelect.value = "";
    detectedClientCode = null;
    const codeMatch = /^([A-Z0-9]+)[_.]/.exec(filename);
    if (!codeMatch || !codeMatch[1]) {
        return;
    }
    const potentialCode = codeMatch[1];
    console.log(`Detected potential client code from filename: ${potentialCode}`);
    // Remember the code in case the client list is uploaded AFTER the GeoJSON.
    detectedClientCode = potentialCode;
    // Select it immediately if the dropdown already contains this code.
    const hasOption = Array.from(clientSelect.options).some(opt => opt.value === potentialCode);
    if (hasOption) {
        clientSelect.value = potentialCode;
    } else {
        console.log("Client code found in filename but not in the current list.");
    }
}
// Helper: parse the uploaded client-list Excel file (via SheetJS) and
// populate the client dropdown. Prefers a sheet named 'Client List' and
// falls back to the first sheet. Rows without an 'SC-client-Code' are dropped.
function handleClientListUpload(e) {
    const file = e.target.files[0];
    if (!file) return;
    const reader = new FileReader();
    reader.onload = function(e) {
        // FIX: the original had no error handling here — a corrupt or
        // non-Excel file threw inside onload and failed silently. Catch and
        // alert, consistent with the GeoJSON upload handlers.
        try {
            const data = new Uint8Array(e.target.result);
            const workbook = XLSX.read(data, { type: 'array' });
            // Prefer the dedicated sheet; otherwise use the first one.
            let sheetName = workbook.SheetNames.find(n => n === 'Client List') || workbook.SheetNames[0];
            const worksheet = workbook.Sheets[sheetName];
            const jsonData = XLSX.utils.sheet_to_json(worksheet);
            // Keep only rows that actually carry a client code.
            clientList = jsonData.map(row => ({
                country: row['Country'],
                name: row['Name'],
                code: row['SC-client-Code']
            })).filter(item => item.code);
            // Show count instead of filename
            document.getElementById('clientListStatus').textContent = `${clientList.length} clients uploaded`;
            populateClientDropdown();
            controlsSection.classList.remove('hidden');
        } catch (err) {
            console.error('Error parsing client list:', err);
            alert('Error parsing client list Excel file');
        }
    };
    reader.readAsArrayBuffer(file);
}
// Rebuild the client <select> from the in-memory client list and, if a client
// code was previously detected from a filename, auto-select it.
function populateClientDropdown() {
    clientSelect.innerHTML = '<option value="">-- Select Client --</option>';
    for (const client of clientList) {
        const opt = document.createElement('option');
        opt.value = client.code;
        opt.textContent = `${client.code} - ${client.name} (${client.country})`;
        clientSelect.appendChild(opt);
    }
    // Attempt auto-selection if a code was detected previously (e.g., from a
    // raw GeoJSON uploaded before the client list).
    if (detectedClientCode) {
        const exists = Array.from(clientSelect.options).some(opt => opt.value === detectedClientCode);
        if (exists) {
            clientSelect.value = detectedClientCode;
        }
    }
}
// Helper: load the raw GeoJSON that needs IDs. On success the left-hand
// table is (re)rendered and any previously generated output is discarded.
function handleRawGeoJSONUpload(e) {
    const file = e.target.files[0];
    if (!file) return;
    // Surface the chosen filename in the status line.
    document.getElementById('rawGeoJSONStatus').textContent = `Uploaded: ${file.name}`;
    // The filename may start with the client code, e.g. "ESA_fields.geojson".
    tryAutoSelectClient(file.name);
    const reader = new FileReader();
    reader.onload = (evt) => {
        try {
            rawGeoJSON = JSON.parse(evt.target.result);
            renderTable(rawGeoJSON, rawTableContainer, false);
            // A new raw file invalidates the edited view and generated output.
            editedTableContainer.innerHTML = '';
            generatedGeoJSON = null;
            downloadBtn.disabled = true;
            diffViewer.classList.remove('hidden');
            processBtn.disabled = false;
        } catch (err) {
            console.error('Error parsing GeoJSON:', err);
            alert('Error parsing Raw GeoJSON');
        }
    };
    reader.readAsText(file);
}
// Load the optional previously-edited GeoJSON whose sc_id numbering should
// be continued. Parsed into the `prevGeoJSON` global; nothing is rendered.
function handlePrevGeoJSONUpload(e) {
    const file = e.target.files[0];
    if (!file) return;
    // Surface the chosen filename in the status line.
    document.getElementById('prevGeoJSONStatus').textContent = `Uploaded: ${file.name}`;
    // A client code embedded in this filename can also drive auto-selection.
    tryAutoSelectClient(file.name);
    const reader = new FileReader();
    reader.onload = (evt) => {
        try {
            prevGeoJSON = JSON.parse(evt.target.result);
            console.log("Previous GeoJSON loaded", prevGeoJSON);
        } catch (err) {
            console.error("Error parsing Previous GeoJSON", err);
            alert("Error parsing Previous GeoJSON");
        }
    };
    reader.readAsText(file);
}
// Core logic: assign sequential SmartCane IDs (CLIENTCODE_0001, ...) to every
// feature in the raw GeoJSON that does not yet have an 'sc_id', continuing
// from the highest number found in either the previous or the raw GeoJSON.
function processGeoJSON() {
    if (!rawGeoJSON) {
        alert("Please upload Raw GeoJSON first.");
        return;
    }
    // A client must be selected so generated IDs carry the right prefix.
    const clientCode = clientSelect.value;
    if (!clientCode) {
        alert("Please select a Client first to generate the correct ID format.");
        return;
    }
    // Pull the numeric suffix out of an ID: handles plain "0001" as well as
    // prefixed "CODE_0001" (last underscore-separated segment wins).
    const extractIdNumber = (idStr) => {
        if (!idStr) return 0;
        const segments = String(idStr).split('_');
        const parsed = parseInt(segments[segments.length - 1], 10);
        return Number.isNaN(parsed) ? 0 : parsed;
    };
    // Highest numeric suffix present in a GeoJSON's features (0 if none).
    const highestIdIn = (geojson) => {
        let best = 0;
        if (geojson && geojson.features) {
            for (const feature of geojson.features) {
                if (feature.properties && feature.properties['sc_id']) {
                    best = Math.max(best, extractIdNumber(feature.properties['sc_id']));
                }
            }
        }
        return best;
    };
    // Scan both files so new IDs never collide with existing ones.
    let currentId = Math.max(highestIdIn(prevGeoJSON), highestIdIn(rawGeoJSON));
    // Deep-copy the raw input so the original stays untouched in the left pane.
    generatedGeoJSON = JSON.parse(JSON.stringify(rawGeoJSON));
    for (const feature of generatedGeoJSON.features) {
        if (!feature.properties) feature.properties = {};
        const existingId = feature.properties['sc_id'];
        // Only fill in missing/empty IDs; existing ones are preserved.
        if (existingId === undefined || existingId === null || existingId === "") {
            currentId++;
            // Format: CLIENTCODE_0001
            feature.properties['sc_id'] = `${clientCode}_${String(currentId).padStart(4, '0')}`;
            feature.isNew = true; // Flag for UI highlighting in the diff table
        }
    }
    renderTable(generatedGeoJSON, editedTableContainer, true);
    downloadBtn.disabled = false;
}
// Render a three-column (Field / Subfield / SC_ID) table of GeoJSON features
// into `container`. When `isEdited` is true, rows whose feature carries the
// transient `isNew` flag get the green "diff-row new" highlight.
function renderTable(geojson, container, isEdited) {
    if (!geojson || !geojson.features) return;
    container.innerHTML = '';
    const table = document.createElement('table');
    // Header row
    const thead = document.createElement('thead');
    const headerRow = document.createElement('tr');
    for (const label of ['Field', 'Subfield', 'SC_ID']) {
        const th = document.createElement('th');
        th.textContent = label;
        headerRow.appendChild(th);
    }
    thead.appendChild(headerRow);
    table.appendChild(thead);
    // One body row per feature
    const tbody = document.createElement('tbody');
    geojson.features.forEach(feature => {
        const props = feature.properties || {};
        const row = document.createElement('tr');
        if (isEdited && feature.isNew) {
            row.className = 'diff-row new';
        }
        for (const value of [props['field'] || '', props['subfield'] || '', props['sc_id'] || '']) {
            const td = document.createElement('td');
            td.textContent = value;
            row.appendChild(td);
        }
        tbody.appendChild(row);
    });
    table.appendChild(tbody);
    container.appendChild(table);
}
// Serialize the generated GeoJSON (minus UI-only flags) and trigger a
// browser download named CLIENTCODE_YYYY-MM-DD.geojson.
function downloadGeoJSON() {
    if (!generatedGeoJSON) return;
    // Strip the transient `isNew` UI flag from a deep copy before export.
    const cleanGeoJSON = JSON.parse(JSON.stringify(generatedGeoJSON));
    for (const feature of cleanGeoJSON.features) {
        delete feature.isNew;
    }
    const clientCode = clientSelect.value || "UNKNOWN";
    const dateStr = new Date().toISOString().split('T')[0]; // YYYY-MM-DD
    const filename = `${clientCode}_${dateStr}.geojson`;
    // Standard Blob + object-URL download pattern.
    const blob = new Blob([JSON.stringify(cleanGeoJSON, null, 2)], { type: 'application/geo+json' });
    const url = URL.createObjectURL(blob);
    const link = document.createElement('a');
    link.href = url;
    link.download = filename;
    document.body.appendChild(link);
    link.click();
    document.body.removeChild(link);
    URL.revokeObjectURL(url);
}

View file

@ -0,0 +1,260 @@
<!DOCTYPE html>
<!-- Unique ID generator page: browser-only tool for assigning SmartCane IDs
     to GeoJSON features. All logic lives in app.js; Excel parsing uses the
     SheetJS CDN build loaded in <head>. -->
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Unique ID generator</title>
    <!-- SheetJS (xlsx) is required by app.js to parse the client-list Excel file -->
    <script src="https://cdn.sheetjs.com/xlsx-0.20.2/package/dist/xlsx.full.min.js"></script>
    <link rel="icon" type="image/png" href="../res/id.png">
    <style>
        body {
            font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
            background: #faf8f3;
            min-height: 100vh;
            display: flex;
            flex-direction: column;
            margin: 0;
        }
        header {
            background: linear-gradient(135deg, #2aab95 0%, #1e8e7b 100%); /* Adjusted var defaults */
            color: white;
            padding: 15px 20px;
            box-shadow: 0 2px 8px rgba(0,0,0,0.1);
            display: flex;
            align-items: center;
            gap: 20px;
            position: sticky;
            top: 0;
            z-index: 100;
        }
        header h1 {
            font-size: 24px;
            font-weight: 600;
            flex: 1;
            margin: 0;
        }
        /* Single Container Fix */
        .container {
            flex: 1;
            max-width: 1200px;
            margin: 20px auto; /* Reduced top/bottom margin */
            width: 100%;
            padding: 0 20px;
            box-sizing: border-box;
            display: flex;
            flex-direction: column;
            gap: 15px; /* Controls distance between cards */
        }
        .card {
            background: white;
            padding: 25px;
            border-radius: 8px;
            box-shadow: 0 2px 8px rgba(0,0,0,0.1);
            border: 1px solid #e0d9d0;
            width: 100%; /* Force full width */
            box-sizing: border-box;
        }
        .card h2 {
            margin-top: 0;
            color: #333;
            border-bottom: 2px solid #2aab95;
            padding-bottom: 10px;
            margin-bottom: 15px;
        }
        /* Diff Viewer Layout: raw vs. edited GeoJSON side by side */
        .diff-grid {
            display: grid;
            grid-template-columns: 1fr 1fr;
            gap: 20px;
        }
        .diff-column {
            max-height: 600px;
            overflow-y: auto;
            border: 1px solid #eee;
            padding: 10px;
            border-radius: 4px;
        }
        /* Row highlights applied by app.js renderTable() */
        .diff-row.new { background-color: #e6ffec; }
        .diff-row.modified { background-color: #fffbdd; }
        /* File Input Styling: dashed drop zone; native input stays hidden */
        .file-input-wrapper {
            position: relative;
            display: inline-block;
            width: 100%;
            margin-top: 10px;
        }
        .file-input-label {
            display: block;
            padding: 30px;
            border: 2px dashed #2aab95;
            border-radius: 6px;
            text-align: center;
            cursor: pointer;
            transition: all 0.3s;
            background: rgba(42, 171, 149, 0.05);
        }
        .file-input-label:hover {
            border-color: #1e8e7b;
            background: rgba(37, 150, 190, 0.05);
        }
        .file-input-wrapper input[type="file"] { display: none; }
        /* Buttons */
        button {
            background: #2aab95;
            color: white;
            border: none;
            padding: 10px 20px;
            border-radius: 4px;
            cursor: pointer;
            font-size: 14px;
            transition: background 0.3s ease;
        }
        button:hover { background: #238b7d; }
        button:disabled { background: #ccc; cursor: not-allowed; }
        .header-btn {
            background: rgba(255, 255, 255, 0.2);
            border: 1px solid rgba(255, 255, 255, 0.4);
        }
        .header-btn:hover {
            background: rgba(255, 255, 255, 0.3);
        }
        /* Toggled by app.js to show/hide the controls and diff sections */
        .hidden { display: none; }
        .actions-row {
            display: flex;
            gap: 10px;
            margin-top: 15px;
            align-items: center;
            flex-wrap: wrap;
        }
        footer {
            background: white;
            padding: 20px;
            border-radius: 8px;
            margin-top: 20px;
            box-shadow: 0 2px 8px rgba(0,0,0,0.1);
            text-align: center;
            font-size: 13px;
            color: #666;
            border: 1px solid #e0d9d0;
        }
        .two-column-grid {
            display: grid;
            grid-template-columns: 1fr 1fr;
            gap: 20px;
            margin-top: 15px;
        }
        .upload-column { min-width: 0; }
        /* Table Styling */
        table { width: 100%; border-collapse: collapse; font-size: 13px; }
        th { text-align: left; background: #f4f4f4; padding: 5px; position: sticky; top: 0; }
        td { padding: 5px; border-bottom: 1px solid #eee; }
        .status-text {
            margin-top: 10px;
            color: #2aab95;
            font-weight: 500;
        }
    </style>
</head>
<body>
    <!-- NOTE(review): login gate is currently disabled; re-enable before
         exposing this tool publicly if authentication is required. -->
    <!-- <script>
        if (sessionStorage.getItem('authenticated') !== 'true') {
            window.location.href = '../login.html';
        }
    </script> -->
    <header>
        <button class="header-btn" onclick="window.location.href='../';" title="Back to main tools">&larr; Back</button>
        <h1>🆔 Unique ID generator</h1>
        <button class="header-btn" id="clearAllBtn">Clear all</button>
    </header>
    <main class="container">
        <!-- Step 1: upload the master client list (Excel) -->
        <div class="card">
            <h2><span style="font-size: 24px;">📋</span> Upload client list</h2>
            <p>Upload the master client list (SmartCane_client_list.xlsx)</p>
            <div class="file-input-wrapper">
                <label class="file-input-label" for="clientListInput">
                    <div id="fileLabelText">Drop the client list file here<br><small>or click to browse</small></div>
                </label>
                <input type="file" id="clientListInput" accept=".xlsx, .xls"/>
            </div>
            <p id="clientListStatus" class="status-text"></p>
        </div>
        <!-- Step 2: upload the raw GeoJSON (required) and optionally a
             previously edited one to continue the numbering from -->
        <div class="card">
            <h2><span style="font-size: 24px;">🗺️</span> Upload GeoJSON file(s)</h2>
            <div class="two-column-grid">
                <div class="upload-column">
                    <p><strong>Raw GeoJSON</strong></p>
                    <p>Upload the raw GeoJSON for which you wish to generate Unique IDs.</p>
                    <div class="file-input-wrapper">
                        <label class="file-input-label" for="rawGeoJSONInput">
                            <div>Drop the raw GeoJSON file here (Required)<br><small>or click to browse</small></div>
                        </label>
                        <input type="file" id="rawGeoJSONInput" accept=".geojson, .json"/>
                    </div>
                    <p id="rawGeoJSONStatus" class="status-text"></p>
                </div>
                <div class="upload-column">
                    <p><strong>Previously edited GeoJSON</strong></p>
                    <p>If you have an edited GeoJSON, upload it here to continue numbering.</p>
                    <div class="file-input-wrapper">
                        <label class="file-input-label" for="prevGeoJSONInput">
                            <div>Drop the previous GeoJSON file here (Optional)<br><small>or click to browse</small></div>
                        </label>
                        <input type="file" id="prevGeoJSONInput" accept=".geojson, .json"/>
                    </div>
                    <p id="prevGeoJSONStatus" class="status-text"></p>
                </div>
            </div>
        </div>
        <!-- Step 3: pick the client and generate IDs (revealed by app.js once
             a client list has been uploaded) -->
        <div class="controls hidden" id="controlsSection">
            <div class="card">
                <h2><span style="font-size: 24px;">⚙️</span> Client selection &amp; UID Generation</h2>
                <label for="clientSelect">Select Client:</label>
                <select id="clientSelect" style="padding: 8px; width: 100%; max-width: 300px;">
                    <option value="">-- Select Client --</option>
                </select>
                <div class="actions-row">
                    <button id="processBtn">Process / Update Diff</button>
                    <button id="downloadBtn" disabled>Download GeoJSON</button>
                </div>
            </div>
        </div>
        <!-- Step 4: side-by-side review of raw vs. generated IDs (revealed by
             app.js once a raw GeoJSON has been uploaded) -->
        <div class="diff-viewer hidden" id="diffViewer">
            <div class="card">
                <h2><span style="font-size: 24px;">🔍</span> Review Changes</h2>
                <div class="diff-grid">
                    <div class="diff-column" id="rawColumn">
                        <h3>Raw GeoJSON</h3>
                        <div id="rawTableContainer"></div>
                    </div>
                    <div class="diff-column" id="editedColumn">
                        <h3>Edited GeoJSON</h3>
                        <div id="editedTableContainer"></div>
                    </div>
                </div>
            </div>
        </div>
        <footer>
            Unique ID Generator for SmartCane clients
        </footer>
    </main>
    <script src="app.js"></script>
</body>
</html>