Merge pull request #10 from TimonWeitkamp/harvest-model-improvements

feat: Add spectral extraction and analysis scripts for SC-161 and SC-162
This commit is contained in:
Timon Weitkamp 2026-02-16 10:16:40 +01:00 committed by GitHub
commit 787a2d5b90
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
7 changed files with 1602 additions and 179 deletions

4
.gitignore vendored
View file

@@ -45,6 +45,10 @@ output/
reports/ reports/
*.docx *.docx
# Experiment Outputs (temporary plots, analysis artifacts)
python_app/harvest_detection_experiments/*/plots/
python_app/harvest_detection_experiments/*/*.ipynb_checkpoints/
# Cache Files # Cache Files
rosm.cache/ rosm.cache/
*.cache *.cache

View file

@@ -0,0 +1,258 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "d90f7f7f",
"metadata": {},
"source": [
"# Validate & Explore ESA `pivot.geojson`\n",
"\n",
"Quick inspection of the field boundary file for the ESA project:\n",
"- Geometry validity, CRS, field count\n",
"- Area/perimeter statistics\n",
"- Map visualization with field labels\n",
"- Export summary CSV for downstream use"
]
},
{
"cell_type": "markdown",
"id": "e99594bb",
"metadata": {},
"source": [
"## 1. Import Required Libraries"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "652485c7",
"metadata": {},
"outputs": [],
"source": [
"import geopandas as gpd\n",
"import pandas as pd\n",
"import numpy as np\n",
"import matplotlib.pyplot as plt\n",
"from pathlib import Path\n",
"from shapely.validation import make_valid\n",
"\n",
"# Project paths\n",
"PROJECT = \"esa\"\n",
"REPO_ROOT = Path.cwd().parents[2] # SmartCane_code\n",
"PROJECT_STORAGE = REPO_ROOT / \"laravel_app\" / \"storage\" / \"app\" / PROJECT\n",
"GEOJSON_PATH = PROJECT_STORAGE / \"pivot.geojson\"\n",
"DATA_DIR = PROJECT_STORAGE / \"Data\"\n",
"DATA_DIR.mkdir(parents=True, exist_ok=True)\n",
"\n",
"print(f\"Repo root: {REPO_ROOT}\")\n",
"print(f\"GeoJSON: {GEOJSON_PATH}\")\n",
"print(f\"Exists: {GEOJSON_PATH.exists()}\")"
]
},
{
"cell_type": "markdown",
"id": "4ff6d825",
"metadata": {},
"source": [
"## 2. Load GeoJSON & Basic Info"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "92012a8c",
"metadata": {},
"outputs": [],
"source": [
"gdf = gpd.read_file(GEOJSON_PATH)\n",
"\n",
"print(f\"Features: {len(gdf)}\")\n",
"print(f\"Geometry type: {gdf.geom_type.unique()}\")\n",
"print(f\"CRS: {gdf.crs}\")\n",
"print(f\"Bounds: {gdf.total_bounds}\")\n",
"print(f\"\\nColumns: {list(gdf.columns)}\")\n",
"print(f\"\\nAttribute table:\")\n",
"gdf.drop(columns=\"geometry\")"
]
},
{
"cell_type": "markdown",
"id": "edadb2de",
"metadata": {},
"source": [
"## 3. Validate Geometries"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "15b5e915",
"metadata": {},
"outputs": [],
"source": [
"# Check validity\n",
"validity = gdf.is_valid\n",
"empty = gdf.is_empty\n",
"\n",
"print(f\"Valid geometries: {validity.sum()}/{len(gdf)}\")\n",
"print(f\"Invalid geometries: {(~validity).sum()}\")\n",
"print(f\"Empty geometries: {empty.sum()}\")\n",
"\n",
"# Repair if needed\n",
"if not validity.all():\n",
" print(\"\\nRepairing invalid geometries...\")\n",
" gdf[\"geometry\"] = gdf[\"geometry\"].apply(make_valid)\n",
" print(f\"After repair: {gdf.is_valid.sum()}/{len(gdf)} valid\")\n",
"\n",
"# Remove empty\n",
"if empty.any():\n",
" gdf = gdf[~gdf.is_empty]\n",
" print(f\"After removing empty: {len(gdf)} features remaining\")"
]
},
{
"cell_type": "markdown",
"id": "c3f01400",
"metadata": {},
"source": [
"## 4. Field-Level Statistics (Area, Perimeter, Centroid)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "922ffdbd",
"metadata": {},
"outputs": [],
"source": [
"# Reproject to UTM for accurate area/perimeter (auto-detect UTM zone from centroid)\n",
"centroid = gdf.dissolve().centroid.iloc[0]\n",
"utm_zone = int((centroid.x + 180) / 6) + 1\n",
"hemisphere = \"north\" if centroid.y >= 0 else \"south\"\n",
"utm_epsg = 32600 + utm_zone if hemisphere == \"north\" else 32700 + utm_zone\n",
"print(f\"Auto-detected UTM zone: {utm_zone}{hemisphere[0].upper()} (EPSG:{utm_epsg})\")\n",
"\n",
"gdf_utm = gdf.to_crs(epsg=utm_epsg)\n",
"\n",
"# Compute stats\n",
"gdf_utm[\"area_m2\"] = gdf_utm.geometry.area\n",
"gdf_utm[\"area_ha\"] = gdf_utm[\"area_m2\"] / 10_000\n",
"gdf_utm[\"area_acres\"] = gdf_utm[\"area_ha\"] * 2.47105\n",
"gdf_utm[\"perimeter_m\"] = gdf_utm.geometry.length\n",
"centroids = gdf_utm.geometry.centroid\n",
"gdf_utm[\"centroid_x\"] = centroids.x\n",
"gdf_utm[\"centroid_y\"] = centroids.y\n",
"\n",
"# Summary table\n",
"stats = gdf_utm[[\"field\", \"area_ha\", \"area_acres\", \"perimeter_m\", \"centroid_x\", \"centroid_y\"]].copy()\n",
"stats = stats.sort_values(\"area_ha\", ascending=False).reset_index(drop=True)\n",
"print(f\"\\nTotal area: {stats['area_ha'].sum():.1f} ha ({stats['area_acres'].sum():.1f} acres)\")\n",
"print(f\"Fields: {len(stats)}\")\n",
"print(f\"Area range: {stats['area_ha'].min():.1f} – {stats['area_ha'].max():.1f} ha\\n\")\n",
"stats"
]
},
{
"cell_type": "markdown",
"id": "3dbf84c7",
"metadata": {},
"source": [
"## 5. Visualize Field Boundaries"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "be1be0ea",
"metadata": {},
"outputs": [],
"source": [
"fig, ax = plt.subplots(1, 1, figsize=(12, 10))\n",
"gdf_utm.plot(ax=ax, column=\"field\", legend=True, edgecolor=\"black\", linewidth=0.8,\n",
" cmap=\"Set3\", alpha=0.7, legend_kwds={\"loc\": \"upper left\", \"fontsize\": 8})\n",
"\n",
"# Add field labels at centroids\n",
"for _, row in gdf_utm.iterrows():\n",
" c = row.geometry.centroid\n",
" ax.annotate(row[\"field\"], xy=(c.x, c.y), ha=\"center\", va=\"center\",\n",
" fontsize=7, fontweight=\"bold\",\n",
" bbox=dict(boxstyle=\"round,pad=0.2\", fc=\"white\", alpha=0.7))\n",
"\n",
"ax.set_title(f\"ESA Field Boundaries ({len(gdf_utm)} fields)\", fontsize=14)\n",
"ax.set_xlabel(\"Easting (m)\")\n",
"ax.set_ylabel(\"Northing (m)\")\n",
"plt.tight_layout()\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"id": "52301688",
"metadata": {},
"source": [
"## 6. CRS Check & Comparison"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d4c712fa",
"metadata": {},
"outputs": [],
"source": [
"# Original CRS info\n",
"print(\"=== Original CRS ===\")\n",
"print(f\"CRS: {gdf.crs}\")\n",
"print(f\"Is geographic: {gdf.crs.is_geographic if gdf.crs else 'No CRS'}\")\n",
"print(f\"Is projected: {gdf.crs.is_projected if gdf.crs else 'No CRS'}\")\n",
"\n",
"if gdf.crs and gdf.crs.is_geographic:\n",
" # Compare area in geographic vs projected CRS\n",
" area_geographic = gdf.geometry.area # in degrees² (meaningless)\n",
" area_projected = gdf_utm.geometry.area # in m²\n",
" \n",
" print(f\"\\n=== Area comparison ===\")\n",
" print(f\"Geographic CRS area (degrees²): meaningless for spatial analysis\")\n",
" print(f\"Projected UTM area (EPSG:{utm_epsg}):\")\n",
" for _, row in gdf_utm.iterrows():\n",
" print(f\" {row['field']}: {row['area_ha']:.2f} ha\")\n",
" print(f\"\\nAlways use projected CRS (UTM) for area/distance calculations.\")"
]
},
{
"cell_type": "markdown",
"id": "3a21236a",
"metadata": {},
"source": [
"## 7. Export Summary Table"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "acd4b3c7",
"metadata": {},
"outputs": [],
"source": [
"# Save summary CSV to project data directory\n",
"output_csv = DATA_DIR / \"field_summary.csv\"\n",
"stats.to_csv(output_csv, index=False)\n",
"print(f\"Saved field summary to: {output_csv}\")\n",
"print(f\"\\nFinal summary:\")\n",
"print(f\" Project: {PROJECT}\")\n",
"print(f\" Fields: {len(stats)}\")\n",
"print(f\" Total area: {stats['area_ha'].sum():.1f} ha ({stats['area_acres'].sum():.1f} acres)\")\n",
"print(f\" CRS: {gdf.crs} (original) → EPSG:{utm_epsg} (projected)\")\n",
"print(f\" All valid: {gdf.is_valid.all()}\")\n",
"stats"
]
}
],
"metadata": {
"language_info": {
"name": "python"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

View file

@@ -0,0 +1,431 @@
"""
01_spectral_feature_exploration.py SC-161 Exploration
=======================================================
Explore spectral indices extracted from ESA 4-band TIFFs.
Indices (8 total from RGB+NIR):
NDVI, BSI, NDWI, CI_green, CI_red, GNDVI, SAVI, EVI2
Key questions:
1. How do indices correlate with each other? Which are redundant?
2. Does any index capture harvest signal better than CI?
3. Which indices add independent information beyond CI_green?
4. Pre/post harvest distribution shifts per index
Usage:
cd python_app/harvest_detection_experiments/angata_improvements
python 01_spectral_feature_exploration.py
"""
import matplotlib
matplotlib.use("Agg") # Non-interactive backend — save only, no window blocking
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from pathlib import Path
# =============================================================================
# CONFIG
# =============================================================================
PROJECT = "esa"
# Script lives three levels below the repo root:
# python_app/harvest_detection_experiments/<experiment>/<this file>
REPO_ROOT = Path(__file__).resolve().parents[3]
PROJECT_STORAGE = REPO_ROOT / "laravel_app" / "storage" / "app" / PROJECT
# Per-field, per-date spectral index means extracted upstream from the 4-band TIFFs.
SPECTRAL_CSV = PROJECT_STORAGE / "Data" / "extracted_ci" / "ci_data_for_python" / "spectral_indices.csv"
# Ground-truth harvest seasons (season_start / season_end per field).
# NOTE(review): the SC-162 notebook reads harvest.xlsx from the "Data"
# subfolder instead — confirm which location is canonical.
HARVEST_XLSX = PROJECT_STORAGE / "harvest.xlsx"
# Output directory for plots
OUTPUT_DIR = Path(__file__).parent / "plots"
OUTPUT_DIR.mkdir(exist_ok=True)
# All mean index columns
INDEX_COLS = [
    "mean_ndvi", "mean_bsi", "mean_ndwi", "mean_ci_green",
    "mean_ci_red", "mean_gndvi", "mean_savi", "mean_evi2",
]
# Human-readable label per index column (used in prints and plot titles).
INDEX_LABELS = {
    "mean_ndvi": "NDVI",
    "mean_bsi": "BSI",
    "mean_ndwi": "NDWI",
    "mean_ci_green": "CI_green",
    "mean_ci_red": "CI_red",
    "mean_gndvi": "GNDVI",
    "mean_savi": "SAVI",
    "mean_evi2": "EVI2",
}
# Fixed plot color per index so each index looks the same across all figures.
INDEX_COLORS = {
    "mean_ndvi": "green",
    "mean_bsi": "saddlebrown",
    "mean_ndwi": "steelblue",
    "mean_ci_green": "darkgreen",
    "mean_ci_red": "darkred",
    "mean_gndvi": "olive",
    "mean_savi": "teal",
    "mean_evi2": "purple",
}
# =============================================================================
# LOAD DATA
# =============================================================================
print("=" * 80)
print("SC-161: SPECTRAL FEATURE EXPLORATION — ESA (8 indices)")
print("=" * 80)
# Spectral indices: one row per field per acquisition date.
si = pd.read_csv(SPECTRAL_CSV, parse_dates=["Date"])
si = si[si["field"] != "00F25"]  # drop always-NaN field
print(f"\nSpectral indices: {len(si)} rows, {si['field'].nunique()} fields")
# FIX: restore the separator between min and max date — the two values were
# printed back-to-back with nothing between them.
print(f" Date range: {si['Date'].min().date()} → {si['Date'].max().date()}")
print(f" Columns: {[c for c in si.columns if c.startswith('mean_')]}")
# Per-index NaN coverage and value range, so data gaps are visible up front.
for col in INDEX_COLS:
    if col in si.columns:
        nan_pct = si[col].isna().mean() * 100
        print(f" {INDEX_LABELS[col]:10s} NaN: {nan_pct:.1f}% mean: {si[col].mean():.4f} range: [{si[col].min():.4f}, {si[col].max():.4f}]")
# Harvest data: one row per field-season; dates coerced so bad cells become NaT.
harvest = pd.read_excel(HARVEST_XLSX)
harvest["season_start"] = pd.to_datetime(harvest["season_start"], errors="coerce")
harvest["season_end"] = pd.to_datetime(harvest["season_end"], errors="coerce")
# Filter to fields we have spectral data for
our_fields = set(si["field"].unique())
harvest = harvest[harvest["field"].isin(our_fields)].copy()
# Only keep seasons with an actual end date (= confirmed harvest)
harvest_events = harvest.dropna(subset=["season_end"]).copy()
print(f"\nHarvest events (confirmed): {len(harvest_events)} across {harvest_events['field'].nunique()} fields")
# =============================================================================
# 1. FULL CORRELATION MATRIX — all 8 indices
# =============================================================================
print("\n--- Full correlation matrix (8 indices) ---")
# Restrict to rows where every available index is non-NaN so all pairwise
# correlations are computed over the same observations.
available_cols = [c for c in INDEX_COLS if c in si.columns]
valid = si.dropna(subset=available_cols)
corr_matrix = valid[available_cols].corr()
# Print in readable format (fixed-width columns aligned with the header row)
labels = [INDEX_LABELS[c] for c in available_cols]
print(f"\n{'':>12s}", " ".join(f"{l:>10s}" for l in labels))
for i, (col, label) in enumerate(zip(available_cols, labels)):
    vals = " ".join(f"{corr_matrix.iloc[i, j]:>10.3f}" for j in range(len(available_cols)))
    print(f"{label:>12s} {vals}")
# Heatmap
fig, ax = plt.subplots(figsize=(10, 8))
im = ax.imshow(corr_matrix.values, cmap="RdBu_r", vmin=-1, vmax=1, aspect="equal")
ax.set_xticks(range(len(labels)))
ax.set_xticklabels(labels, rotation=45, ha="right", fontsize=10)
ax.set_yticks(range(len(labels)))
ax.set_yticklabels(labels, fontsize=10)
ax.set_title("Spectral Index Correlation Matrix (ESA)", fontsize=14)
# Annotate cells; flip text to white on strongly-colored (|r| > 0.7) cells
for i in range(len(labels)):
    for j in range(len(labels)):
        val = corr_matrix.iloc[i, j]
        txt_color = "white" if abs(val) > 0.7 else "black"
        ax.text(j, i, f"{val:.2f}", ha="center", va="center", fontsize=9, color=txt_color)
plt.colorbar(im, ax=ax, label="Pearson r", shrink=0.8)
plt.tight_layout()
fig.savefig(OUTPUT_DIR / "correlation_matrix_all_indices.png", dpi=150, bbox_inches="tight")
plt.close()
print(f" → Saved correlation_matrix_all_indices.png")
# =============================================================================
# 2. PER-FIELD CORRELATION — CI_green vs each index
# =============================================================================
# For each of the other 7 indices, compute the per-field Pearson correlation
# with CI_green and histogram those per-field r values (one subplot per index).
print("\n--- Per-field correlation: CI_green vs each index ---")
ci_col = "mean_ci_green"
other_cols = [c for c in available_cols if c != ci_col]
fig, axes = plt.subplots(2, 4, figsize=(20, 10))
axes_flat = axes.flatten()
for i, col in enumerate(other_cols):
    ax = axes_flat[i]
    label = INDEX_LABELS[col]
    # Correlation only for fields with > 30 joint observations; the rest
    # become NaN and are dropped. Lambda is applied immediately, so closing
    # over the loop variable `col` is safe here.
    field_corrs = valid.groupby("field").apply(
        lambda g: g[ci_col].corr(g[col]) if len(g) > 30 else np.nan
    ).dropna()
    ax.hist(field_corrs, bins=20, color=INDEX_COLORS.get(col, "gray"), edgecolor="white", alpha=0.8)
    ax.axvline(field_corrs.mean(), color="red", linestyle="--", linewidth=2,
               label=f"mean={field_corrs.mean():.3f}")
    ax.set_xlabel(f"r(CI_green, {label})")
    ax.set_ylabel("Field count")
    ax.set_title(f"CI_green vs {label}")
    ax.legend(fontsize=8)
    ax.set_xlim(-1, 1)
    print(f" CI_green vs {label:8s}: mean r = {field_corrs.mean():.4f}, std = {field_corrs.std():.4f}")
# Remove unused subplot (7 indices in a 2x4 grid leaves one axis empty)
if len(other_cols) < len(axes_flat):
    for j in range(len(other_cols), len(axes_flat)):
        axes_flat[j].set_visible(False)
fig.suptitle("Per-field correlation: CI_green vs each index", fontsize=14, fontweight="bold")
plt.tight_layout()
fig.savefig(OUTPUT_DIR / "ci_green_vs_all_per_field_corr.png", dpi=150, bbox_inches="tight")
plt.close()
print(f" → Saved ci_green_vs_all_per_field_corr.png")
# =============================================================================
# 3. PRE/POST HARVEST SHIFTS — all indices
# =============================================================================
# For every index, pool observations in the 30 days before vs the 30 days
# after each confirmed harvest date, then quantify the shift with the raw
# delta of means and Cohen's d (pooled-std effect size).
print("\n--- Pre/post harvest distribution shifts (all indices) ---")
WINDOW_DAYS = 30
shift_results = {}
for col in available_cols:
    label = INDEX_LABELS[col]
    pre_vals = []
    post_vals = []
    for _, h in harvest_events.iterrows():
        field_data = si[si["field"] == h["field"]].copy()
        if field_data.empty:
            continue
        harvest_date = h["season_end"]
        # Pre window: [harvest - WINDOW_DAYS, harvest)
        pre = field_data[
            (field_data["Date"] >= harvest_date - pd.Timedelta(days=WINDOW_DAYS))
            & (field_data["Date"] < harvest_date)
        ][col].dropna()
        pre_vals.extend(pre.tolist())
        # Post window: (harvest, harvest + WINDOW_DAYS]
        post = field_data[
            (field_data["Date"] > harvest_date)
            & (field_data["Date"] <= harvest_date + pd.Timedelta(days=WINDOW_DAYS))
        ][col].dropna()
        post_vals.extend(post.tolist())
    pre_arr = np.array(pre_vals)
    post_arr = np.array(post_vals)
    # FIX: mean() of an empty array is NaN (with a RuntimeWarning) and would
    # silently poison the sorted summary and plots — skip indices that have
    # no observations inside the harvest windows.
    if pre_arr.size == 0 or post_arr.size == 0:
        print(f" {label}: no observations within ±{WINDOW_DAYS}d of harvest — skipped")
        continue
    delta = post_arr.mean() - pre_arr.mean()
    # Effect size (Cohen's d), pooled population std of the two groups
    pooled_std = np.sqrt((pre_arr.std() ** 2 + post_arr.std() ** 2) / 2)
    cohens_d = delta / pooled_std if pooled_std > 0 else 0
    shift_results[col] = {
        "label": label,
        "pre_mean": pre_arr.mean(),
        "post_mean": post_arr.mean(),
        "delta": delta,
        "abs_delta": abs(delta),
        "cohens_d": cohens_d,
        "pre_n": len(pre_arr),
        "post_n": len(post_arr),
    }
# Print sorted by absolute Cohen's d (effect size)
print(f"\n{'Index':>12s} {'Pre':>8s} {'Post':>8s} {'Delta':>8s} {'Cohen d':>8s} {'n_pre':>6s} {'n_post':>6s}")
for col in sorted(shift_results.keys(), key=lambda c: abs(shift_results[c]["cohens_d"]), reverse=True):
    r = shift_results[col]
    print(f"{r['label']:>12s} {r['pre_mean']:>8.4f} {r['post_mean']:>8.4f} {r['delta']:>+8.4f} {r['cohens_d']:>+8.3f} {r['pre_n']:>6d} {r['post_n']:>6d}")
# Bar chart of effect sizes: raw deltas (left) and |Cohen's d| (right)
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6))
sorted_cols = sorted(shift_results.keys(), key=lambda c: abs(shift_results[c]["cohens_d"]), reverse=True)
labels_sorted = [shift_results[c]["label"] for c in sorted_cols]
deltas = [shift_results[c]["delta"] for c in sorted_cols]
cohens_ds = [shift_results[c]["cohens_d"] for c in sorted_cols]
colors = [INDEX_COLORS.get(c, "gray") for c in sorted_cols]
# Raw delta
ax1.barh(labels_sorted, deltas, color=colors, alpha=0.8, edgecolor="white")
ax1.set_xlabel("Post - Pre harvest mean")
ax1.set_title("Raw shift at harvest (±30 days)")
ax1.axvline(0, color="black", linewidth=0.5)
ax1.grid(True, alpha=0.3, axis="x")
# Cohen's d with conventional small/medium/large thresholds marked
ax2.barh(labels_sorted, [abs(d) for d in cohens_ds], color=colors, alpha=0.8, edgecolor="white")
ax2.set_xlabel("|Cohen's d| (effect size)")
ax2.set_title("Harvest signal strength (effect size)")
ax2.axvline(0.2, color="gray", linestyle=":", label="small (0.2)")
ax2.axvline(0.5, color="gray", linestyle="--", label="medium (0.5)")
ax2.axvline(0.8, color="gray", linestyle="-", label="large (0.8)")
ax2.legend(fontsize=8)
ax2.grid(True, alpha=0.3, axis="x")
plt.tight_layout()
fig.savefig(OUTPUT_DIR / "harvest_signal_all_indices.png", dpi=150, bbox_inches="tight")
plt.close()
print(f" → Saved harvest_signal_all_indices.png")
# =============================================================================
# 4. PRE/POST DISTRIBUTIONS — top 4 by harvest signal
# =============================================================================
# NOTE(review): this re-runs the same windowing loop as section 3 — a shared
# helper would remove the duplication.
print("\n--- Pre/post harvest distributions (top 4 indices by effect size) ---")
top4_cols = sorted(shift_results.keys(), key=lambda c: abs(shift_results[c]["cohens_d"]), reverse=True)[:4]
fig, axes = plt.subplots(1, 4, figsize=(20, 5))
for ax, col in zip(axes, top4_cols):
    label = INDEX_LABELS[col]
    pre_vals = []
    post_vals = []
    # Pool observations across all confirmed harvest events for this index
    for _, h in harvest_events.iterrows():
        field_data = si[si["field"] == h["field"]].copy()
        if field_data.empty:
            continue
        harvest_date = h["season_end"]
        # Pre window: [harvest - WINDOW_DAYS, harvest)
        pre = field_data[
            (field_data["Date"] >= harvest_date - pd.Timedelta(days=WINDOW_DAYS))
            & (field_data["Date"] < harvest_date)
        ][col].dropna()
        pre_vals.extend(pre.tolist())
        # Post window: (harvest, harvest + WINDOW_DAYS]
        post = field_data[
            (field_data["Date"] > harvest_date)
            & (field_data["Date"] <= harvest_date + pd.Timedelta(days=WINDOW_DAYS))
        ][col].dropna()
        post_vals.extend(post.tolist())
    # Overlaid density histograms: green = pre-harvest, brown = post-harvest
    ax.hist(pre_vals, bins=40, alpha=0.6, color="green", label=f"Pre (n={len(pre_vals)})", density=True)
    ax.hist(post_vals, bins=40, alpha=0.6, color="brown", label=f"Post (n={len(post_vals)})", density=True)
    d = shift_results[col]["cohens_d"]
    ax.set_title(f"{label} (d={d:+.2f})")
    ax.set_xlabel(label)
    ax.set_ylabel("Density")
    ax.legend(fontsize=8)
    ax.grid(True, alpha=0.3)
fig.suptitle(f"Pre/Post Harvest ±{WINDOW_DAYS}d — Top 4 by effect size", fontsize=13, fontweight="bold")
plt.tight_layout()
fig.savefig(OUTPUT_DIR / "pre_post_harvest_top4.png", dpi=150, bbox_inches="tight")
plt.close()
print(f" → Saved pre_post_harvest_top4.png")
# =============================================================================
# 5. TIMESERIES — top field, all 8 indices
# =============================================================================
print("\n--- Timeseries: all indices for top harvest-rich fields ---")
# Fields with the most confirmed harvest events give the best visual check.
top_fields = harvest_events.groupby("field").size().sort_values(ascending=False).head(3).index.tolist()
print(f"Top fields: {top_fields}")
# One 4x2 figure per field: raw series + 14-day rolling mean per index, with
# confirmed harvest dates marked as dashed red vertical lines.
for field_id in top_fields:
    field_data = si[si["field"] == field_id].sort_values("Date").copy()
    field_harvests = harvest_events[harvest_events["field"] == field_id]
    fig, axes = plt.subplots(4, 2, figsize=(20, 16), sharex=True)
    fig.suptitle(f"Field {field_id} — All Spectral Indices", fontsize=14, fontweight="bold")
    for ax, col in zip(axes.flatten(), available_cols):
        label = INDEX_LABELS[col]
        color = INDEX_COLORS.get(col, "gray")
        vals = field_data.dropna(subset=[col])
        # Raw observations (thin, faint line)
        ax.plot(vals["Date"], vals[col], color=color, linewidth=0.6, alpha=0.7)
        # 14-day rolling mean (time-based window requires the Date index)
        if len(vals) > 14:
            rolling = vals.set_index("Date")[col].rolling("14D", min_periods=3).mean()
            ax.plot(rolling.index, rolling.values, color=color, linewidth=2, alpha=0.8)
        # Harvest dates
        for _, h in field_harvests.iterrows():
            ax.axvline(h["season_end"], color="red", linestyle="--", alpha=0.6, linewidth=1)
        ax.set_ylabel(label, fontsize=10, fontweight="bold")
        ax.grid(True, alpha=0.3)
    # Year-only tick labels on the bottom row (all axes share x)
    axes[-1, 0].xaxis.set_major_formatter(mdates.DateFormatter("%Y"))
    axes[-1, 1].xaxis.set_major_formatter(mdates.DateFormatter("%Y"))
    plt.tight_layout()
    fig.savefig(OUTPUT_DIR / f"all_indices_{field_id}.png", dpi=150, bbox_inches="tight")
    plt.close()
    print(f" → Saved all_indices_{field_id}.png")
# =============================================================================
# 6. REDUNDANCY ANALYSIS — which indices add information beyond CI_green?
# =============================================================================
# Regress each index on CI_green with a degree-1 least-squares fit; the share
# of variance NOT explained by CI_green ("residual %") is the index's unique
# information. R² close to 1 means the index is redundant with CI_green.
print("\n--- Redundancy analysis: residual variance after regressing vs CI_green ---")
from numpy.polynomial import polynomial as P
residual_variance = {}
for col in available_cols:
    if col == ci_col:
        continue
    pair = valid[[ci_col, col]].dropna()
    # Skip indices with too few joint observations for a stable fit
    if len(pair) < 100:
        continue
    predictor = pair[ci_col].values
    response = pair[col].values
    # Ordinary least squares: response ≈ c0 + c1 * predictor
    coeffs = P.polyfit(predictor, response, deg=1)
    fitted = P.polyval(predictor, coeffs)
    resid_var = np.var(response - fitted)
    total_var = np.var(response)
    r_squared = 1 - resid_var / total_var
    residual_variance[col] = {
        "label": INDEX_LABELS[col],
        "r_squared": r_squared,
        "residual_pct": (1 - r_squared) * 100,
    }
# Report, most-unique first
print(f"\n{'Index':>12s} {'R² vs CI':>10s} {'Residual %':>12s} {'Interpretation'}")
ranked = sorted(residual_variance, key=lambda c: residual_variance[c]["residual_pct"], reverse=True)
for col in ranked:
    info = residual_variance[col]
    if info["residual_pct"] > 20:
        interp = "UNIQUE signal"
    elif info["residual_pct"] > 5:
        interp = "Some unique"
    else:
        interp = "Redundant"
    print(f"{info['label']:>12s} {info['r_squared']:>10.4f} {info['residual_pct']:>11.1f}% {interp}")
# =============================================================================
# 7. SUMMARY
# =============================================================================
# Console-only recap of sections 3 and 6: ranks indices by harvest effect
# size and by unique variance relative to CI_green, then prints the pick.
print("\n" + "=" * 80)
print("SUMMARY — SC-161 Expanded Index Analysis")
print("=" * 80)
# Best harvest signal = largest |Cohen's d| from section 3
best_col = sorted(shift_results.keys(), key=lambda c: abs(shift_results[c]["cohens_d"]), reverse=True)[0]
best = shift_results[best_col]
# Most unique signal (highest residual variance vs CI_green, section 6)
most_unique = sorted(residual_variance.keys(), key=lambda c: residual_variance[c]["residual_pct"], reverse=True)
# FIX: restore the "→" between "strongest" and "weakest" (separator was lost).
print(f"""
INDICES COMPUTED: {', '.join(INDEX_LABELS[c] for c in available_cols)}
DATA: {si['field'].nunique()} fields, {si['Date'].nunique()} dates
HARVEST SIGNAL (strongest → weakest by Cohen's d):
""")
for col in sorted(shift_results.keys(), key=lambda c: abs(shift_results[c]["cohens_d"]), reverse=True):
    r = shift_results[col]
    print(f" {r['label']:>12s}: Cohen's d = {r['cohens_d']:+.3f} (Δ = {r['delta']:+.4f})")
print(f"""
REDUNDANCY vs CI_green (unique information):
""")
for col in most_unique:
    r = residual_variance[col]
    print(f" {r['label']:>12s}: {r['residual_pct']:.1f}% unique variance (R² = {r['r_squared']:.3f})")
print(f"""
RECOMMENDATION:
Best harvest signal: {best['label']} (d={best['cohens_d']:+.3f})
Most unique vs CI: {residual_variance[most_unique[0]]['label']} ({residual_variance[most_unique[0]]['residual_pct']:.1f}% independent)
Plots saved to: {OUTPUT_DIR}
""")

View file

@@ -0,0 +1,465 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "e9cdc8ee",
"metadata": {},
"source": [
"# SC-162: Season-Length Normalization Analysis\n",
"## Compare ESA vs Angata Spectral Indices (CI_green, SAVI)\n",
"\n",
"Multi-year data analysis focusing on:\n",
"- **Peak timing** (at what season-age % do peaks occur?)\n",
"- **Amplitude** (how high/low?)\n",
"- **Phase** (lag between indices)\n",
"- **Shape** (growth curve pattern)\n",
"\n",
"Note: Cannot compare absolute DOY across multiple years, but CAN compare normalized season age (0-1 scale) and pattern shapes."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "59943da7",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"import matplotlib.pyplot as plt\n",
"import matplotlib.dates as mdates\n",
"from pathlib import Path\n",
"from scipy import stats\n",
"import warnings\n",
"warnings.filterwarnings('ignore')\n",
"\n",
"# Configuration\n",
"REPO_ROOT = Path.cwd().parent.parent.parent.parent\n",
"print(f\"Repo root: {REPO_ROOT}\")\n",
"\n",
"# Project-specific config\n",
"PROJECTS = {\n",
" 'esa': {\n",
" 'expected_season_length': 365,\n",
" 'storage_path': REPO_ROOT / 'laravel_app' / 'storage' / 'app' / 'esa',\n",
" },\n",
" 'angata': {\n",
" 'expected_season_length': 540,\n",
" 'storage_path': REPO_ROOT / 'laravel_app' / 'storage' / 'app' / 'angata',\n",
" }\n",
"}\n",
"\n",
"# Load data for both projects\n",
"data = {}\n",
"harvest_data = {}\n",
"\n",
"for project, config in PROJECTS.items():\n",
" print(f\"\\n{'='*60}\")\n",
" print(f\"Loading {project.upper()}\")\n",
" print(f\"{'='*60}\")\n",
" \n",
" # Spectral indices CSV\n",
" csv_path = config['storage_path'] / 'Data' / 'extracted_ci' / 'ci_data_for_python' / 'spectral_indices.csv'\n",
" df = pd.read_csv(csv_path, parse_dates=['Date'])\n",
" df = df[df['field'] != '00F25'] # Remove always-NaN field if present\n",
" print(f\"Spectral data: {len(df)} rows, {df['field'].nunique()} fields\")\n",
" print(f\"Date range: {df['Date'].min().date()} → {df['Date'].max().date()}\")\n",
" \n",
" # Harvest dates\n",
" harvest_path = config['storage_path'] / 'Data' / 'harvest.xlsx'\n",
" harvest_df = pd.read_excel(harvest_path)\n",
" harvest_df['season_start'] = pd.to_datetime(harvest_df['season_start'], errors='coerce')\n",
" harvest_df['season_end'] = pd.to_datetime(harvest_df['season_end'], errors='coerce')\n",
" \n",
" # Filter to fields we have spectral data for\n",
" our_fields = set(df['field'].unique())\n",
" harvest_df = harvest_df[harvest_df['field'].isin(our_fields)].copy()\n",
" harvest_events = harvest_df.dropna(subset=['season_end']).copy()\n",
" print(f\"Harvest events: {len(harvest_events)} across {harvest_events['field'].nunique()} fields\")\n",
" \n",
" data[project] = df\n",
" harvest_data[project] = harvest_events"
]
},
{
"cell_type": "markdown",
"id": "260b405b",
"metadata": {},
"source": [
"## Step 1: Compute Season Age for Each Observation\n",
"\n",
"For each field-date pair, calculate days since season start and normalize to 0-1 scale."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "55a0f94f",
"metadata": {},
"outputs": [],
"source": [
"def compute_season_age(spec_df, harvest_df, project_season_length):\n",
" \"\"\"\n",
" Compute normalized season age (0-1) for each row.\n",
" \n",
" For each field, use the earliest season_start from harvest data as the baseline.\n",
" If no season_start available, use the earliest date in spectral data.\n",
" \"\"\"\n",
" spec_copy = spec_df.copy()\n",
" spec_copy['season_age'] = np.nan\n",
" spec_copy['days_since_start'] = np.nan\n",
" \n",
" for field_id in spec_copy['field'].unique():\n",
" # Get season start for this field\n",
" field_harvest = harvest_df[harvest_df['field'] == field_id]\n",
" \n",
" if not field_harvest.empty and not field_harvest['season_start'].isna().all():\n",
" season_start = field_harvest['season_start'].min()\n",
" else:\n",
" # Fallback: use first observation date\n",
" field_data = spec_copy[spec_copy['field'] == field_id]\n",
" season_start = field_data['Date'].min()\n",
" \n",
" # Compute for all rows of this field\n",
" field_mask = spec_copy['field'] == field_id\n",
" days_since = (spec_copy.loc[field_mask, 'Date'] - season_start).dt.days\n",
" spec_copy.loc[field_mask, 'days_since_start'] = days_since\n",
" spec_copy.loc[field_mask, 'season_age'] = days_since / project_season_length\n",
" \n",
" return spec_copy\n",
"\n",
"# Compute for both projects\n",
"for project, config in PROJECTS.items():\n",
" data[project] = compute_season_age(\n",
" data[project],\n",
" harvest_data[project],\n",
" config['expected_season_length']\n",
" )\n",
" print(f\"{project.upper()} season_age range: [{data[project]['season_age'].min():.3f}, {data[project]['season_age'].max():.3f}]\")"
]
},
{
"cell_type": "markdown",
"id": "b946c36c",
"metadata": {},
"source": [
"## Step 2: Plot CI_green + SAVI Trajectories by Project\n",
"\n",
"Multi-year overlay to compare:\n",
"- Peak timing (at what season-age %?)\n",
"- Amplitude (range of values)\n",
"- Phase (timing relative to each other)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c8de5a03",
"metadata": {},
"outputs": [],
"source": [
"# Create figure: 2 projects × 2 indices = 4 subplots\n",
"fig, axes = plt.subplots(2, 2, figsize=(18, 12))\n",
"\n",
"indices = ['mean_ci_green', 'mean_savi']\n",
"index_labels = {'mean_ci_green': 'CI_green (Chlorophyll)', 'mean_savi': 'SAVI (Soil-Adjusted)'}\n",
"\n",
"for row, project in enumerate(['esa', 'angata']):\n",
" df = data[project]\n",
" season_len = PROJECTS[project]['expected_season_length']\n",
" \n",
" for col, idx in enumerate(indices):\n",
" ax = axes[row, col]\n",
" \n",
" # Plot all field timeseries (normalized season age)\n",
" for field in df['field'].unique():\n",
" field_data = df[df['field'] == field].dropna(subset=[idx, 'season_age']).sort_values('season_age')\n",
" ax.plot(field_data['season_age'], field_data[idx], alpha=0.15, color='steelblue', linewidth=0.8)\n",
" \n",
" # Overlay mean trajectory with rolling average\n",
" binned = df.dropna(subset=[idx, 'season_age']).copy()\n",
" binned['season_age_bin'] = pd.cut(binned['season_age'], bins=20)\n",
" mean_trajectory = binned.groupby('season_age_bin')[idx].mean()\n",
" bin_centers = [interval.mid for interval in mean_trajectory.index]\n",
" ax.plot(bin_centers, mean_trajectory.values, color='red', linewidth=3, label='Average', zorder=10)\n",
" \n",
" ax.set_xlabel('Season Age (0 = start, 1 = expected end)', fontsize=11)\n",
" ax.set_ylabel(index_labels[idx], fontsize=11)\n",
" ax.set_title(f\"{project.upper()} - {index_labels[idx]}\\n(Season length: {season_len}d)\", fontsize=12, fontweight='bold')\n",
" ax.set_xlim(0, 1.2) # Allow slight overgrowth\n",
" ax.grid(True, alpha=0.3)\n",
" ax.legend(fontsize=9)\n",
"\n",
"plt.tight_layout()\n",
"plt.savefig('season_normalization_trajectories.png', dpi=150, bbox_inches='tight')\n",
"plt.show()\n",
"print(\"\\nSaved: season_normalization_trajectories.png\")"
]
},
{
"cell_type": "markdown",
"id": "f60a0dc5",
"metadata": {},
"source": [
"## Step 3: Peak Analysis - Where do maxima occur?"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6e149843",
"metadata": {},
"outputs": [],
"source": [
"def find_peaks_per_field(df, index_col, season_age_col='season_age'):\n",
" \"\"\"\n",
" For each field, find the peak (maximum) in the normalized season age.\n",
" Return statistics about where peaks occur.\n",
" \"\"\"\n",
" peaks = []\n",
" \n",
" for field in df['field'].unique():\n",
" field_data = df[df['field'] == field].dropna(subset=[index_col, season_age_col]).sort_values(season_age_col)\n",
" \n",
" if len(field_data) > 10: # Need enough data\n",
" max_idx = field_data[index_col].idxmax()\n",
" peak_season_age = field_data.loc[max_idx, season_age_col]\n",
" peak_value = field_data.loc[max_idx, index_col]\n",
" peaks.append({\n",
" 'field': field,\n",
" 'peak_season_age': peak_season_age,\n",
" 'peak_value': peak_value\n",
" })\n",
" \n",
" return pd.DataFrame(peaks)\n",
"\n",
"print(\"\\n\" + \"=\"*70)\n",
"print(\"PEAK TIMING ANALYSIS (at what season-age do indices peak?)\")\n",
"print(\"=\"*70)\n",
"\n",
"for project in ['esa', 'angata']:\n",
" print(f\"\\n{project.upper()}:\")\n",
" print(\"-\" * 70)\n",
" \n",
" for idx, label in [('mean_ci_green', 'CI_green'), ('mean_savi', 'SAVI')]:\n",
" peaks_df = find_peaks_per_field(data[project], idx, 'season_age')\n",
" \n",
" print(f\"\\n {label}:\")\n",
" print(f\" Fields analyzed: {len(peaks_df)}\")\n",
" print(f\" Peak season_age: mean={peaks_df['peak_season_age'].mean():.3f}, std={peaks_df['peak_season_age'].std():.3f}\")\n",
" print(f\" Peak occurs at: {peaks_df['peak_season_age'].mean()*100:.1f}% through season\")\n",
" print(f\" Peak value range: [{peaks_df['peak_value'].min():.4f}, {peaks_df['peak_value'].max():.4f}]\")"
]
},
{
"cell_type": "markdown",
"id": "1d6e97e4",
"metadata": {},
"source": [
"## Step 4: Amplitude Comparison - How strong are the signals?"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "75c72340",
"metadata": {},
"outputs": [],
"source": [
"print(\"\\n\" + \"=\"*70)\n",
"print(\"AMPLITUDE ANALYSIS (value ranges per project)\")\n",
"print(\"=\"*70)\n",
"\n",
"for project in ['esa', 'angata']:\n",
" print(f\"\\n{project.upper()}:\")\n",
" print(\"-\" * 70)\n",
" \n",
" df = data[project]\n",
" \n",
" for idx, label in [('mean_ci_green', 'CI_green'), ('mean_savi', 'SAVI')]:\n",
" valid = df[idx].dropna()\n",
" amplitude = valid.max() - valid.min()\n",
" \n",
" print(f\"\\n {label}:\")\n",
" print(f\" Mean: {valid.mean():.4f}\")\n",
" print(f\" Std: {valid.std():.4f}\")\n",
" print(f\" Min: {valid.min():.4f}\")\n",
" print(f\" Max: {valid.max():.4f}\")\n",
" print(f\" Amplitude (max-min): {amplitude:.4f}\")"
]
},
{
"cell_type": "markdown",
"id": "8c2e3fc8",
"metadata": {},
"source": [
"## Step 5: Phase Analysis - Lead/lag between CI_green and SAVI"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "867d39e9",
"metadata": {},
"outputs": [],
"source": [
"def compute_cross_correlation(df, idx1, idx2, max_lag=0.2):\n",
" \"\"\"\n",
" Compute cross-correlation between two indices to find phase lag.\n",
" \"\"\"\n",
" # Aggregate by season_age bins\n",
" binned = df.dropna(subset=[idx1, idx2, 'season_age']).copy()\n",
" binned['bin'] = pd.cut(binned['season_age'], bins=30)\n",
" \n",
" agg = binned.groupby('bin')[[idx1, idx2]].mean()\n",
" agg = agg.dropna()\n",
" \n",
" if len(agg) < 10:\n",
" return np.nan, np.nan\n",
" \n",
" # Normalize\n",
" x = (agg[idx1] - agg[idx1].mean()) / agg[idx1].std()\n",
" y = (agg[idx2] - agg[idx2].mean()) / agg[idx2].std()\n",
" \n",
" # Cross-correlation\n",
" corr = np.correlate(x.values, y.values, mode='full')\n",
" lags = np.arange(-len(x)+1, len(x))\n",
" \n",
" best_lag_idx = np.argmax(np.abs(corr))\n",
" best_lag = lags[best_lag_idx]\n",
" max_corr = corr[best_lag_idx] / (len(x) - 1)\n",
" \n",
" return best_lag, max_corr\n",
"\n",
"print(\"\\n\" + \"=\"*70)\n",
"print(\"PHASE ANALYSIS (CI_green vs SAVI lag)\")\n",
"print(\"=\"*70)\n",
"\n",
"for project in ['esa', 'angata']:\n",
" df = data[project]\n",
" lag, corr = compute_cross_correlation(df, 'mean_ci_green', 'mean_savi')\n",
" \n",
" print(f\"\\n{project.upper()}:\")\n",
" if not np.isnan(lag):\n",
" print(f\" CI_green → SAVI lag: {lag:.2f} bins (~{(lag/30):.3f} of season)\")\n",
" print(f\" Max cross-correlation: {corr:.3f}\")\n",
" print(f\" Interpretation: {'SAVI slightly leads CI_green' if lag < 0 else 'CI_green slightly leads SAVI' if lag > 0 else 'Synchronized'}\")\n",
" else:\n",
" print(f\" Insufficient data for lag analysis\")"
]
},
{
"cell_type": "markdown",
"id": "5277beb9",
"metadata": {},
"source": [
"## Step 6: Growth Curve Shape - Do they follow similar patterns?"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "24685c7d",
"metadata": {},
"outputs": [],
"source": [
"# Compare growth curves directly: normalized to 0-1 on both axes\n",
"fig, axes = plt.subplots(1, 2, figsize=(16, 6))\n",
"\n",
"for col, idx in enumerate(['mean_ci_green', 'mean_savi']):\n",
" ax = axes[col]\n",
" \n",
" for project, color in [('esa', 'blue'), ('angata', 'orange')]:\n",
" df = data[project]\n",
" binned = df.dropna(subset=[idx, 'season_age']).copy()\n",
" binned['bin'] = pd.cut(binned['season_age'], bins=25)\n",
" \n",
" # Mean trajectory per bin\n",
" traj = binned.groupby('bin')[idx].agg(['mean', 'std', 'count'])\n",
" traj = traj[traj['count'] > 5] # Only bins with N>5\n",
" bin_centers = [interval.mid for interval in traj.index]\n",
" \n",
" # Normalize to 0-1 for visual comparison\n",
" normalized_vals = (traj['mean'] - traj['mean'].min()) / (traj['mean'].max() - traj['mean'].min())\n",
" \n",
" ax.plot(bin_centers, normalized_vals.values, color=color, linewidth=2.5, \n",
" marker='o', markersize=6, label=f\"{project.upper()}\", zorder=10)\n",
" ax.fill_between(bin_centers, \n",
" (normalized_vals - traj['std']/traj['mean'].std()).values,\n",
" (normalized_vals + traj['std']/traj['mean'].std()).values,\n",
" alpha=0.2, color=color)\n",
" \n",
" ax.set_xlabel('Season Age (0-1)', fontsize=12)\n",
" ax.set_ylabel('Normalized Index (0-1)', fontsize=12)\n",
" ax.set_title(f\"Growth Curve Shape: {['CI_green', 'SAVI'][col]}\", fontsize=13, fontweight='bold')\n",
" ax.set_xlim(0, 1.1)\n",
" ax.set_ylim(-0.1, 1.1)\n",
" ax.grid(True, alpha=0.3)\n",
" ax.legend(fontsize=11, loc='best')\n",
"\n",
"plt.tight_layout()\n",
"plt.savefig('growth_curve_comparison.png', dpi=150, bbox_inches='tight')\n",
"plt.show()\n",
"print(\"\\nSaved: growth_curve_comparison.png\")"
]
},
{
"cell_type": "markdown",
"id": "27b5e61a",
"metadata": {},
"source": [
"## Summary: Does Season-Age Normalization Work?"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9252cf10",
"metadata": {},
"outputs": [],
"source": [
"print(\"\\n\" + \"=\"*70)\n",
"print(\"SC-162 FINDINGS: Season-Age Normalization Validation\")\n",
"print(\"=\"*70)\n",
"\n",
"print(\"\"\"\n",
"▶ KEY QUESTION: Do ESA and Angata follow similar growth patterns when normalized to season age?\n",
"\n",
"If YES (patterns align on 0-1 scale):\n",
" ✅ Season-age normalization is valid\n",
" ✅ Can train model on ESA, apply to Angata with confidence\n",
" ✅ Use proportional imminent window (7-8% of season length)\n",
"\n",
"If NO (patterns diverge):\n",
" ❌ Need project-specific models\n",
" ❌ Investigate why patterns differ (climate, variety, soil, etc.)\n",
" ❌ Consider multivariate approach (season_age + other features)\n",
"\n",
"OBSERVATIONS TO CHECK ABOVE:\n",
"1. Peak timing: Do both peak at ~same season-age?\n",
"2. Growth curve shape: Do normalized curves look similar?\n",
"3. Amplitude: Is the relative range comparable?\n",
"4. Phase: Is lag between CI_green and SAVI similar across projects?\n",
"\"\"\")\n",
"\n",
"print(\"\\nNEXT STEPS:\")\n",
"print(\"1. If patterns align → Proceed with feature engineering (Phase 5)\")\n",
"print(\"2. If patterns differ → Debug and adjust project configs\")\n",
"print(\"3. Test both fixed (28d) and proportional imminent windows on ESA\")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "pytorch_gpu",
"language": "python",
"name": "python3"
},
"language_info": {
"name": "python",
"version": "3.11.14"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

View file

@ -0,0 +1,199 @@
"""
run_spectral_extraction.py Batch extract spectral indices from all TIFFs
==========================================================================
Part of SC-161: Extract spectral indices from 4-band TIFFs (BSI, NDVI, NDWI)
Loops over all dated TIF files in a project's TIF folder, computes per-field
NDVI, BSI, NDWI and saves a single CSV with the same field/date structure
as ci_data_for_python.csv (can be joined on field + Date).
Usage:
cd python_app/harvest_detection_experiments/angata_improvements
python run_spectral_extraction.py --project esa
python run_spectral_extraction.py --project esa --tif-folder merged_final_tif
python run_spectral_extraction.py --project esa --start 2024-01-01 --end 2024-12-31
"""
import argparse
import sys
import time
from datetime import datetime
from pathlib import Path
import geopandas as gpd
import pandas as pd
# Add parent dirs to path so we can import the module
sys.path.insert(0, str(Path(__file__).parent))
from spectral_features import extract_field_spectral_indices
def find_tif_dates(tif_folder: Path) -> list[tuple[str, Path]]:
    """
    Collect all TIF files named YYYY-MM-DD.tif in `tif_folder`.

    Files whose stem does not parse as an ISO date are ignored.
    Returns a filename-sorted list of (date_str, path) pairs.
    """
    dated = []
    for candidate in sorted(tif_folder.glob("*.tif")):
        name = candidate.stem  # e.g. "2024-01-15"
        try:
            datetime.strptime(name, "%Y-%m-%d")
        except ValueError:
            # Not a dated TIF (e.g. a mosaic or scratch file) — skip it.
            continue
        dated.append((name, candidate))
    return dated
def main():
    """CLI entry point for SC-161 batch spectral-index extraction.

    Flow: parse args -> resolve project storage paths -> validate inputs ->
    loop over dated TIFs via extract_field_spectral_indices() -> concatenate
    and save one CSV with one row per (field, Date), joinable against
    ci_data_for_python.csv.
    """
    parser = argparse.ArgumentParser(
        description="Extract spectral indices (NDVI, BSI, NDWI) from 4-band TIFFs per field"
    )
    parser.add_argument("--project", required=True, help="Project name (e.g. esa, angata)")
    parser.add_argument(
        "--tif-folder", default="merged_final_tif",
        help="TIF subfolder name within project storage (default: merged_final_tif)"
    )
    parser.add_argument("--start", default=None, help="Start date filter (YYYY-MM-DD)")
    parser.add_argument("--end", default=None, help="End date filter (YYYY-MM-DD)")
    parser.add_argument(
        "--output", default=None,
        help="Output CSV path. Default: {project_storage}/Data/extracted_ci/ci_data_for_python/spectral_indices.csv"
    )
    parser.add_argument("--field-id-col", default="field", help="Field ID column in GeoJSON")
    args = parser.parse_args()
    # ==========================================================================
    # RESOLVE PATHS
    # ==========================================================================
    # Navigate from this script to the repo root
    # __file__ = angata_improvements/run_spectral_extraction.py
    # parents: [0]=angata_improvements, [1]=harvest_detection_experiments, [2]=python_app, [3]=SmartCane_code
    # NOTE(review): this breaks if the script is moved to a different depth — verify on relocation.
    repo_root = Path(__file__).resolve().parents[3]
    project_storage = repo_root / "laravel_app" / "storage" / "app" / args.project
    tif_folder = project_storage / args.tif_folder
    geojson_path = project_storage / "pivot.geojson"
    if args.output:
        output_csv = Path(args.output)
    else:
        # Default output lives next to the R pipeline's CI exports.
        output_dir = project_storage / "Data" / "extracted_ci" / "ci_data_for_python"
        output_dir.mkdir(parents=True, exist_ok=True)
        output_csv = output_dir / "spectral_indices.csv"
    # ==========================================================================
    # VALIDATION
    # ==========================================================================
    print("=" * 80)
    print(f"SPECTRAL INDEX EXTRACTION — {args.project.upper()}")
    print("=" * 80)
    if not tif_folder.exists():
        print(f"ERROR: TIF folder not found: {tif_folder}")
        sys.exit(1)
    if not geojson_path.exists():
        print(f"ERROR: Field boundaries not found: {geojson_path}")
        sys.exit(1)
    # Load field boundaries once; the same GeoDataFrame is reused for every date.
    print(f"\nLoading field boundaries: {geojson_path}")
    gdf = gpd.read_file(geojson_path)
    n_fields = len(gdf)
    print(f" Fields: {n_fields}")
    print(f" Field IDs: {', '.join(gdf[args.field_id_col].astype(str).tolist())}")
    # Find all dated TIFs
    all_tifs = find_tif_dates(tif_folder)
    print(f"\nTIF files found: {len(all_tifs)}")
    # Apply date filters — lexicographic comparison is correct for YYYY-MM-DD strings.
    if args.start:
        all_tifs = [(d, p) for d, p in all_tifs if d >= args.start]
    if args.end:
        all_tifs = [(d, p) for d, p in all_tifs if d <= args.end]
    print(f"TIF files after filtering: {len(all_tifs)}")
    if not all_tifs:
        print("ERROR: No TIF files to process.")
        sys.exit(1)
    print(f"Date range: {all_tifs[0][0]} → {all_tifs[-1][0]}")
    print(f"Output: {output_csv}")
    print()
    # ==========================================================================
    # EXTRACTION LOOP
    # ==========================================================================
    all_rows = []
    errors = 0
    t0 = time.time()
    for i, (date_str, tif_path) in enumerate(all_tifs):
        try:
            df = extract_field_spectral_indices(
                tif_path, gdf, field_id_col=args.field_id_col
            )
            # Insert Date as the second column so the CSV joins cleanly on field + Date.
            df.insert(1, "Date", date_str)
            all_rows.append(df)
        except Exception as e:
            # Best-effort batch: keep going on per-file failures; count is reported below.
            errors += 1
            print(f" [{i+1}/{len(all_tifs)}] ERROR {date_str}: {e}")
            continue
        # Progress every 100 files
        if (i + 1) % 100 == 0 or (i + 1) == len(all_tifs):
            elapsed = time.time() - t0
            rate = (i + 1) / elapsed
            remaining = (len(all_tifs) - i - 1) / rate if rate > 0 else 0
            print(
                f" [{i+1}/{len(all_tifs)}] {date_str} "
                f"({rate:.1f} files/s, ~{remaining:.0f}s remaining)"
            )
    elapsed_total = time.time() - t0
    # ==========================================================================
    # COMBINE & SAVE
    # ==========================================================================
    if not all_rows:
        print("ERROR: No data extracted.")
        sys.exit(1)
    result = pd.concat(all_rows, ignore_index=True)
    # Sort by field, date for consistency with CI data
    result = result.sort_values(["field", "Date"]).reset_index(drop=True)
    # Save
    output_csv.parent.mkdir(parents=True, exist_ok=True)
    result.to_csv(output_csv, index=False)
    # ==========================================================================
    # SUMMARY
    # ==========================================================================
    print()
    print("=" * 80)
    print("EXTRACTION COMPLETE")
    print("=" * 80)
    print(f" Dates processed: {len(all_tifs)}")
    print(f" Errors: {errors}")
    print(f" Total rows: {len(result)}")
    print(f" Fields: {result['field'].nunique()}")
    print(f" Date range: {result['Date'].min()} → {result['Date'].max()}")
    print(f" Time: {elapsed_total:.1f}s ({len(all_tifs)/elapsed_total:.1f} files/s)")
    print()
    # Quick stats
    for col in ["mean_ndvi", "mean_bsi", "mean_ndwi"]:
        vals = result[col].dropna()
        if len(vals) > 0:
            print(f" {col}: [{vals.min():.4f}, {vals.max():.4f}] (mean={vals.mean():.4f})")
    nan_pct = result[["mean_ndvi", "mean_bsi", "mean_ndwi"]].isna().mean() * 100
    print(f"\n NaN rates: NDVI={nan_pct['mean_ndvi']:.1f}%, BSI={nan_pct['mean_bsi']:.1f}%, NDWI={nan_pct['mean_ndwi']:.1f}%")
    print(f"\n Output: {output_csv}")
if __name__ == "__main__":
    main()

View file

@ -0,0 +1,241 @@
"""
spectral_features.py Extract per-field spectral indices from 4-band GeoTIFFs
================================================================================
Part of SC-161: Extract spectral indices from 4-band TIFFs (BSI, NDVI, NDWI)
Band order in TIFFs: [R, G, B, NIR] (uint16)
Spectral Indices:
NDVI = (NIR - R) / (NIR + R) vegetation density
BSI = ((R + NIR) - (G + B)) / ((R + NIR) + (G + B)) bare soil detection
NDWI = (G - NIR) / (G + NIR) moisture content
CI_green = NIR / Green - 1 chlorophyll index (= SmartCane CI)
CI_red = NIR / Red - 1 chlorophyll index (red-based)
GNDVI = (NIR - G) / (NIR + G) green NDVI (less saturation)
SAVI = 1.5 * (NIR - R) / (NIR + R + 0.5) soil-adjusted vegetation
EVI2 = 2.5 * (NIR - R) / (NIR + 2.4*R + 1) enhanced vegetation (2-band)
"""
import numpy as np
import pandas as pd
import geopandas as gpd
import rasterio
from rasterio.mask import mask as rio_mask
from pathlib import Path
from typing import Optional
# =============================================================================
# SPECTRAL INDEX CALCULATIONS
# =============================================================================
def _safe_normalized_diff(a: np.ndarray, b: np.ndarray) -> np.ndarray:
"""Compute (a - b) / (a + b) with safe division, masking invalid pixels."""
with np.errstate(divide="ignore", invalid="ignore"):
result = (a - b) / (a + b)
result[~np.isfinite(result)] = np.nan
return result
def compute_ndvi(red: np.ndarray, nir: np.ndarray) -> np.ndarray:
    """NDVI = (NIR - R) / (NIR + R) — vegetation density; invalid pixels -> NaN."""
    with np.errstate(divide="ignore", invalid="ignore"):
        ndvi = (nir - red) / (nir + red)
    ndvi[~np.isfinite(ndvi)] = np.nan
    return ndvi
def compute_bsi(red: np.ndarray, green: np.ndarray, blue: np.ndarray, nir: np.ndarray) -> np.ndarray:
    """BSI = ((R + NIR) - (G + B)) / ((R + NIR) + (G + B)) — bare-soil signal."""
    soil_term = red + nir
    vegetation_term = green + blue
    with np.errstate(divide="ignore", invalid="ignore"):
        bsi = (soil_term - vegetation_term) / (soil_term + vegetation_term)
    bsi[~np.isfinite(bsi)] = np.nan
    return bsi
def compute_ndwi(green: np.ndarray, nir: np.ndarray) -> np.ndarray:
    """NDWI = (G - NIR) / (G + NIR) — moisture content; invalid pixels -> NaN."""
    with np.errstate(divide="ignore", invalid="ignore"):
        ndwi = (green - nir) / (green + nir)
    ndwi[~np.isfinite(ndwi)] = np.nan
    return ndwi
def compute_ci_green(green: np.ndarray, nir: np.ndarray) -> np.ndarray:
    """CI_green = NIR / Green - 1 (matches the SmartCane pipeline CI)."""
    with np.errstate(divide="ignore", invalid="ignore"):
        ratio = nir / green
    ci = ratio - 1.0
    ci[~np.isfinite(ci)] = np.nan
    return ci
def compute_ci_red(red: np.ndarray, nir: np.ndarray) -> np.ndarray:
    """CI_red = NIR / Red - 1 (more sensitive at low chlorophyll)."""
    with np.errstate(divide="ignore", invalid="ignore"):
        ratio = nir / red
    ci = ratio - 1.0
    ci[~np.isfinite(ci)] = np.nan
    return ci
def compute_gndvi(green: np.ndarray, nir: np.ndarray) -> np.ndarray:
    """GNDVI = (NIR - G) / (NIR + G) — green NDVI, saturates less than NDVI."""
    with np.errstate(divide="ignore", invalid="ignore"):
        gndvi = (nir - green) / (nir + green)
    gndvi[~np.isfinite(gndvi)] = np.nan
    return gndvi
def compute_savi(red: np.ndarray, nir: np.ndarray, L: float = 0.5) -> np.ndarray:
    """SAVI = (1 + L) * (NIR - R) / (NIR + R + L); L=0.5 is the typical soil factor."""
    scaled_diff = (1.0 + L) * (nir - red)
    with np.errstate(divide="ignore", invalid="ignore"):
        savi = scaled_diff / (nir + red + L)
    savi[~np.isfinite(savi)] = np.nan
    return savi
def compute_evi2(red: np.ndarray, nir: np.ndarray) -> np.ndarray:
    """EVI2 = 2.5 * (NIR - R) / (NIR + 2.4*R + 1) — two-band enhanced vegetation."""
    scaled_diff = 2.5 * (nir - red)
    with np.errstate(divide="ignore", invalid="ignore"):
        evi2 = scaled_diff / (nir + 2.4 * red + 1.0)
    evi2[~np.isfinite(evi2)] = np.nan
    return evi2
# =============================================================================
# PER-FIELD EXTRACTION
# =============================================================================
def extract_field_spectral_indices(
    tif_path: str | Path,
    field_boundaries_gdf: gpd.GeoDataFrame,
    field_id_col: str = "field",
    nodata_threshold: float = 0.0,
    min_valid_fraction: float = 0.1,
) -> pd.DataFrame:
    """
    Extract mean/std spectral indices per field from a single 4-band TIF.

    Band order is taken as [R, G, B, NIR] per the module header; bands are
    cast to float64 before index math so uint16 inputs cannot overflow.
    Fields that do not overlap the raster, or whose valid-pixel fraction is
    below `min_valid_fraction`, get an all-NaN row (counts still reported).

    Parameters
    ----------
    tif_path : path to 4-band GeoTIFF (R, G, B, NIR uint16)
    field_boundaries_gdf : GeoDataFrame with field polygons
    field_id_col : column name for field identifier
    nodata_threshold : pixels with ALL bands <= this value are treated as nodata
    min_valid_fraction : minimum fraction of valid pixels required (else NaN)

    Returns
    -------
    DataFrame with columns:
        field, mean_ndvi, mean_bsi, mean_ndwi, mean_ci_green, mean_ci_red,
        mean_gndvi, mean_savi, mean_evi2,
        std_ndvi, std_bsi, std_ndwi, std_ci_green, std_ci_red,
        std_gndvi, std_savi, std_evi2,
        valid_pixel_count, total_pixel_count
    """
    tif_path = Path(tif_path)
    rows = []
    with rasterio.open(tif_path) as src:
        tif_crs = src.crs
        # Reproject field boundaries to match TIF CRS if needed.
        # Work on a copy so the caller's GeoDataFrame is never mutated.
        gdf = field_boundaries_gdf.copy()
        if gdf.crs is not None and not gdf.crs.equals(tif_crs):
            gdf = gdf.to_crs(tif_crs)
        elif gdf.crs is None:
            # Assume EPSG:4326 if not set — TODO confirm this matches the GeoJSON source
            gdf = gdf.set_crs("EPSG:4326").to_crs(tif_crs)
        for _, row_geom in gdf.iterrows():
            field_id = row_geom[field_id_col]
            geom = [row_geom.geometry.__geo_interface__]
            try:
                # crop=True keeps the window small; all_touched=True includes
                # boundary pixels; outside-polygon pixels are filled with 0.
                out_image, _ = rio_mask(src, geom, crop=True, all_touched=True, nodata=0)
            except Exception:
                # Field doesn't overlap with this TIF
                rows.append(_make_nan_row(field_id))
                continue
            # out_image shape: (bands, height, width) — convert to float
            bands = out_image.astype(np.float64)
            red = bands[0]
            green = bands[1]
            blue = bands[2]
            nir = bands[3]
            # Mask: valid pixels have at least one band > nodata_threshold
            # (all-zero pixels are the rio_mask fill and true nodata).
            valid_mask = np.any(bands > nodata_threshold, axis=0)
            total_pixels = valid_mask.size
            valid_pixels = int(valid_mask.sum())
            if valid_pixels < max(1, int(total_pixels * min_valid_fraction)):
                # Too little coverage (cloud mask, partial scene) — emit NaNs
                # but keep the pixel counts for downstream QA.
                rows.append(_make_nan_row(field_id, valid_pixels, total_pixels))
                continue
            # Compute indices (only on valid pixels)
            ndvi = compute_ndvi(red, nir)
            bsi = compute_bsi(red, green, blue, nir)
            ndwi = compute_ndwi(green, nir)
            ci_green = compute_ci_green(green, nir)
            ci_red = compute_ci_red(red, nir)
            gndvi = compute_gndvi(green, nir)
            savi = compute_savi(red, nir)
            evi2 = compute_evi2(red, nir)
            # Apply mask — flattens each index to its valid pixels only.
            ndvi_valid = ndvi[valid_mask]
            bsi_valid = bsi[valid_mask]
            ndwi_valid = ndwi[valid_mask]
            ci_green_valid = ci_green[valid_mask]
            ci_red_valid = ci_red[valid_mask]
            gndvi_valid = gndvi[valid_mask]
            savi_valid = savi[valid_mask]
            evi2_valid = evi2[valid_mask]
            # nan-aware stats: division-by-zero pixels inside the mask stay NaN
            # and are excluded from the aggregates.
            rows.append({
                "field": field_id,
                "mean_ndvi": np.nanmean(ndvi_valid),
                "mean_bsi": np.nanmean(bsi_valid),
                "mean_ndwi": np.nanmean(ndwi_valid),
                "mean_ci_green": np.nanmean(ci_green_valid),
                "mean_ci_red": np.nanmean(ci_red_valid),
                "mean_gndvi": np.nanmean(gndvi_valid),
                "mean_savi": np.nanmean(savi_valid),
                "mean_evi2": np.nanmean(evi2_valid),
                "std_ndvi": np.nanstd(ndvi_valid),
                "std_bsi": np.nanstd(bsi_valid),
                "std_ndwi": np.nanstd(ndwi_valid),
                "std_ci_green": np.nanstd(ci_green_valid),
                "std_ci_red": np.nanstd(ci_red_valid),
                "std_gndvi": np.nanstd(gndvi_valid),
                "std_savi": np.nanstd(savi_valid),
                "std_evi2": np.nanstd(evi2_valid),
                "valid_pixel_count": valid_pixels,
                "total_pixel_count": total_pixels,
            })
    return pd.DataFrame(rows)
def _make_nan_row(
field_id: str,
valid_pixels: int = 0,
total_pixels: int = 0,
) -> dict:
"""Return a row with NaN values for a field that couldn't be processed."""
return {
"field": field_id,
"mean_ndvi": np.nan,
"mean_bsi": np.nan,
"mean_ndwi": np.nan,
"mean_ci_green": np.nan,
"mean_ci_red": np.nan,
"mean_gndvi": np.nan,
"mean_savi": np.nan,
"mean_evi2": np.nan,
"std_ndvi": np.nan,
"std_bsi": np.nan,
"std_ndwi": np.nan,
"std_ci_green": np.nan,
"std_ci_red": np.nan,
"std_gndvi": np.nan,
"std_savi": np.nan,
"std_evi2": np.nan,
"valid_pixel_count": valid_pixels,
"total_pixel_count": total_pixels,
}

View file

@ -146,7 +146,6 @@ setup_project_directories <- function(project_dir, data_source = "merged_tif") {
# TIER 2: PER-FIELD TIFFS (Script 10 output) # TIER 2: PER-FIELD TIFFS (Script 10 output)
field_tiles_dir <- here(laravel_storage_dir, "field_tiles") field_tiles_dir <- here(laravel_storage_dir, "field_tiles")
field_tiles_ci_dir <- here(laravel_storage_dir, "field_tiles_CI") field_tiles_ci_dir <- here(laravel_storage_dir, "field_tiles_CI")
daily_tiles_split_dir <- here(laravel_storage_dir, "daily_tiles_split")
# SUPPORT TIER: DATA DIRECTORY (define early for use in later tiers) # SUPPORT TIER: DATA DIRECTORY (define early for use in later tiers)
data_dir <- here(laravel_storage_dir, "Data") data_dir <- here(laravel_storage_dir, "Data")
@ -163,7 +162,7 @@ setup_project_directories <- function(project_dir, data_source = "merged_tif") {
# TIER 5: MOSAICS (Script 40 output) # TIER 5: MOSAICS (Script 40 output)
weekly_mosaic_dir <- here(laravel_storage_dir, "weekly_mosaic") weekly_mosaic_dir <- here(laravel_storage_dir, "weekly_mosaic")
weekly_tile_max_dir <- here(laravel_storage_dir, "weekly_tile_max") field_tiles_ci_dir <- here(weekly_mosaic_dir, "field_tiles_CI")
# TIER 6: KPI & REPORTING (Scripts 80/90/91 output) # TIER 6: KPI & REPORTING (Scripts 80/90/91 output)
reports_dir <- here(laravel_storage_dir, "reports") reports_dir <- here(laravel_storage_dir, "reports")
@ -178,10 +177,10 @@ setup_project_directories <- function(project_dir, data_source = "merged_tif") {
# Create all directories # Create all directories
all_dirs <- c( all_dirs <- c(
merged_tif_folder, field_tiles_dir, field_tiles_ci_dir, daily_tiles_split_dir, merged_tif_folder, field_tiles_dir, field_tiles_ci_dir,
extracted_ci_base_dir, daily_ci_vals_dir, cumulative_ci_vals_dir, ci_for_python_dir, extracted_ci_base_dir, daily_ci_vals_dir, cumulative_ci_vals_dir, ci_for_python_dir,
growth_model_interpolated_dir, growth_model_interpolated_dir,
weekly_mosaic_dir, weekly_tile_max_dir, weekly_mosaic_dir, field_tiles_ci_dir,
reports_dir, kpi_reports_dir, kpi_field_stats_dir, kpi_field_analysis_dir, reports_dir, kpi_reports_dir, kpi_field_stats_dir, kpi_field_analysis_dir,
data_dir, vrt_dir, harvest_dir, log_dir data_dir, vrt_dir, harvest_dir, log_dir
) )
@ -204,7 +203,6 @@ setup_project_directories <- function(project_dir, data_source = "merged_tif") {
# TIER 2: Per-field TIFFs # TIER 2: Per-field TIFFs
field_tiles_dir = field_tiles_dir, field_tiles_dir = field_tiles_dir,
field_tiles_ci_dir = field_tiles_ci_dir, field_tiles_ci_dir = field_tiles_ci_dir,
daily_tiles_split_dir = daily_tiles_split_dir,
# TIER 3: CI extraction # TIER 3: CI extraction
extracted_ci_base_dir = extracted_ci_base_dir, extracted_ci_base_dir = extracted_ci_base_dir,
@ -217,7 +215,7 @@ setup_project_directories <- function(project_dir, data_source = "merged_tif") {
# TIER 5: Mosaics # TIER 5: Mosaics
weekly_mosaic_dir = weekly_mosaic_dir, weekly_mosaic_dir = weekly_mosaic_dir,
weekly_tile_max_dir = weekly_tile_max_dir, field_tiles_ci_dir = field_tiles_ci_dir,
# TIER 6: KPI & reporting # TIER 6: KPI & reporting
reports_dir = reports_dir, reports_dir = reports_dir,
@ -508,171 +506,12 @@ setup_logging <- function(log_dir) {
)) ))
} }
# ==============================================================================
# SECTION 6B: DATA SOURCE DETECTION
# ==============================================================================
#' Detect data source for project
#'
#' Always resolves to "merged_tif" so every project reads from the same
#' canonical directory.
#'
#' @param project_dir Character. Project name (currently unused)
#' @return Character. "merged_tif"
detect_data_source <- function(project_dir) {
  "merged_tif"
}
#' Detect tile structure from merged TIF directory
#'
#' Checks whether a project's merged TIF directory uses tiled output by
#' looking for, in priority order:
#'   1. A tiling_config.json metadata file written by script 10 (most reliable)
#'   2. Grid subdirectories (5x5, 10x10, etc.)
#'   3. Tile-named files (*_XX.tif pattern)
#'
#' @param merged_final_tif_dir Character. Path to merged TIF directory
#' @param daily_tiles_split_dir Character. Optional path to tiles directory
#' @return List with has_tiles (logical), detected_tiles, total_files, source,
#'   and (when detected via a grid) grid_size / grid_path
detect_tile_structure_from_merged_final <- function(merged_final_tif_dir, daily_tiles_split_dir = NULL) {
  # PRIORITY 1: tiling_config.json metadata file from script 10
  if (!is.null(daily_tiles_split_dir) && dir.exists(daily_tiles_split_dir)) {
    config_files <- list.files(daily_tiles_split_dir,
                               pattern = "tiling_config\\.json$",
                               recursive = TRUE,
                               full.names = TRUE)
    if (length(config_files) > 0) {
      # Several configs may exist; trust the most recently modified one.
      config_file <- config_files[which.max(file.info(config_files)$mtime)]
      tryCatch({
        config_json <- jsonlite::read_json(config_file)
        return(list(
          has_tiles = config_json$has_tiles %||% TRUE,
          detected_tiles = character(),
          total_files = 0,
          source = "tiling_config.json",
          grid_size = config_json$grid_size %||% "unknown"
        ))
      }, error = function(e) {
        # Fall through to file-based detection on a corrupt config.
        warning("Error reading tiling_config.json: ", e$message)
      })
    }
  }

  # PRIORITY 2: File-based detection (fallback)
  if (!dir.exists(merged_final_tif_dir)) {
    return(list(
      has_tiles = FALSE,
      detected_tiles = character(),
      total_files = 0,
      source = "directory_not_found"
    ))
  }

  # Check for grid-size subdirectories (5x5, 10x10, etc.)
  grid_subfolders <- list.dirs(merged_final_tif_dir, full.names = FALSE, recursive = FALSE)
  grid_patterns <- grep("^\\d+x\\d+$", grid_subfolders, value = TRUE)
  if (length(grid_patterns) > 0) {
    grid_size <- grid_patterns[1]
    grid_dir <- file.path(merged_final_tif_dir, grid_size)
    grid_tifs <- list.files(grid_dir, pattern = "\\.tif$", recursive = TRUE)
    # FIX: previous code used `[1:3]`, which injected NA entries when fewer
    # than 3 TIFs existed and misreported total_files as 3; report a sample
    # of up to 3 plus the true count instead.
    return(list(
      has_tiles = TRUE,
      detected_tiles = utils::head(grid_tifs, 3L),
      total_files = length(grid_tifs),
      source = "grid_subdirectory_detection",
      grid_size = grid_size,
      grid_path = grid_dir
    ))
  }

  # Check for tile-named files (*_XX.tif pattern)
  tif_files <- list.files(merged_final_tif_dir, pattern = "\\.tif$", full.names = FALSE)
  if (length(tif_files) == 0) {
    return(list(
      has_tiles = FALSE,
      detected_tiles = character(),
      total_files = 0,
      source = "no_files_found"
    ))
  }
  tile_pattern <- "_(\\d{2})\\.tif$"
  tile_files <- tif_files[grepl(tile_pattern, tif_files)]
  list(
    has_tiles = length(tile_files) > 0,
    detected_tiles = tile_files,
    total_files = length(tif_files),
    source = "file_pattern_detection"
  )
}
# ============================================================================== # ==============================================================================
# SECTION 7: MOSAIC & KPI VERIFICATION HELPERS # SECTION 7: MOSAIC & KPI VERIFICATION HELPERS
# ============================================================================== # ==============================================================================
# Centralized helper functions for run_full_pipeline.R to avoid hardcoding paths # Centralized helper functions for run_full_pipeline.R to avoid hardcoding paths
#' Detect mosaic mode from project structure
#'
#' Per-field ("single-file") mosaics are the standard; "tiled" is only
#' reported when a populated legacy weekly_tile_max directory exists.
#'
#' @param project_dir Character. Project name
#' @return Character. "tiled" or "single-file"
detect_mosaic_mode <- function(project_dir) {
  legacy_tiles_dir <- file.path("laravel_app", "storage", "app", project_dir, "weekly_tile_max")
  has_legacy_tiles <- dir.exists(legacy_tiles_dir) && length(list.files(legacy_tiles_dir)) > 0
  if (has_legacy_tiles) "tiled" else "single-file"
}
#' Detect grid size from tile directory structure
#'
#' The per-field architecture has no grid-based tiling, so this always
#' reports "unknown".
#'
#' @param project_dir Character. Project name (currently unused)
#' @return Character. "unknown"
detect_grid_size <- function(project_dir) {
  "unknown"
}
#' Get project storage path
#'
#' Builds the Laravel storage path for a project, optionally descending
#' into a subdirectory.
#'
#' @param project_dir Character. Project name
#' @param subdir Character. Optional subdirectory (default NULL)
#' @return Character. Full path
get_project_storage_path <- function(project_dir, subdir = NULL) {
  base <- file.path("laravel_app", "storage", "app", project_dir)
  if (is.null(subdir)) base else file.path(base, subdir)
}
#' Get mosaic directory
#'
#' Resolves the mosaic directory for a project; with mode "auto" the mode
#' is first determined by detect_mosaic_mode().
#'
#' @param project_dir Character. Project name
#' @param mosaic_mode Character. "auto", "tiled" or "single-file"
#' @return Character. Full path to mosaic directory
get_mosaic_dir <- function(project_dir, mosaic_mode = "auto") {
  if (identical(mosaic_mode, "auto")) {
    mosaic_mode <- detect_mosaic_mode(project_dir)
  }
  subdir <- if (identical(mosaic_mode, "tiled")) "weekly_tile_max" else "weekly_mosaic"
  get_project_storage_path(project_dir, subdir)
}
#' Get KPI directory based on client type #' Get KPI directory based on client type
#' #'
@ -713,20 +552,6 @@ check_harvest_output_exists <- function(project_dir, week_num, year_num) {
file.exists(path) file.exists(path)
} }
#' Get mosaic verification directory
#'
#' Maps a mosaic mode to its storage directory for verification checks.
#'
#' @param project_dir Character. Project name
#' @param mosaic_mode Character. "tiled" or "single-file"
#' @return Character. Full path to mosaic directory
get_mosaic_verification_dir <- function(project_dir, mosaic_mode) {
  subdir <- if (identical(mosaic_mode, "tiled")) "weekly_tile_max" else "weekly_mosaic"
  file.path("laravel_app", "storage", "app", project_dir, subdir)
}
#' Check if mosaic files exist for a specific week #' Check if mosaic files exist for a specific week
#' #'