From d81f1a4e5a29c2a7efb15faa4bb352adebb76b7d Mon Sep 17 00:00:00 2001 From: Timon Date: Thu, 12 Feb 2026 08:47:30 +0100 Subject: [PATCH 1/5] Enhance cum_ci_plot function with dynamic max values for x-axis breaks --- r_app/report_utils.R | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/r_app/report_utils.R b/r_app/report_utils.R index e7e7888..02c84cd 100644 --- a/r_app/report_utils.R +++ b/r_app/report_utils.R @@ -441,6 +441,10 @@ cum_ci_plot <- function(pivotName, ci_quadrant_data = CI_quadrant, plot_type = " x_label <- switch(x_unit, "days" = if (facet_on) "Date" else "Age of Crop (Days)", "weeks" = "Week Number") + + # Calculate dynamic max values for breaks + max_doy <- max(plot_data$DOY, na.rm = TRUE) + 20 + max_week <- max(as.numeric(plot_data$week), na.rm = TRUE) + ceiling(20 / 7) # Create plot with either facets by season or overlay by DOY/week if (facet_on) { @@ -512,9 +516,9 @@ cum_ci_plot <- function(pivotName, ci_quadrant_data = CI_quadrant, plot_type = " color_scale + { if (x_var == "DOY") { - ggplot2::scale_x_continuous(breaks = seq(0, 450, by = 50), sec.axis = ggplot2::sec_axis(~ . / 30.44, name = "Age in Months", breaks = seq(0, 14, by = 1))) + ggplot2::scale_x_continuous(breaks = seq(0, max_doy, by = 50), sec.axis = ggplot2::sec_axis(~ . / 30.44, name = "Age in Months", breaks = seq(0, 14, by = 1))) } else if (x_var == "week") { - ggplot2::scale_x_continuous(breaks = seq(0, 64, by = 4), sec.axis = ggplot2::sec_axis(~ . / 4.348, name = "Age in Months", breaks = seq(0, 14, by = 1))) + ggplot2::scale_x_continuous(breaks = seq(0, max_week, by = 4), sec.axis = ggplot2::sec_axis(~ . / 4.348, name = "Age in Months", breaks = seq(0, 14, by = 1))) } } + ggplot2::theme_minimal() + @@ -565,14 +569,18 @@ cum_ci_plot <- function(pivotName, ci_quadrant_data = CI_quadrant, plot_type = " x_label <- switch(x_unit, "days" = if (facet_on) "Date" else "Age of Crop (Days)", "weeks" = "Week Number") - + # Choose color palette based on colorblind_friendly flag color_scale <- if (colorblind_friendly) { ggplot2::scale_color_brewer(type = "qual", palette = "Set2") } else { ggplot2::scale_color_discrete() } - + + # Calculate dynamic max values for breaks + max_doy_both <- max(plot_data_both$DOY, na.rm = TRUE) + 20 + max_week_both <- max(as.numeric(plot_data_both$week), na.rm = TRUE) + ceiling(20 / 7) + # Create the faceted plot g_both <- ggplot2::ggplot(data = plot_data_both) + # Add benchmark lines first (behind season lines) @@ -620,9 +628,9 @@ cum_ci_plot <- function(pivotName, ci_quadrant_data = CI_quadrant, plot_type = " color_scale + { if (x_var == "DOY") { - ggplot2::scale_x_continuous(breaks = seq(0, 450, by = 50), sec.axis = ggplot2::sec_axis(~ . / 30.44, name = "Age in Months", breaks = seq(0, 14, by = 1))) + ggplot2::scale_x_continuous(breaks = seq(0, max_doy_both, by = 50), sec.axis = ggplot2::sec_axis(~ . / 30.44, name = "Age in Months", breaks = seq(0, 14, by = 1))) } else if (x_var == "week") { - ggplot2::scale_x_continuous(breaks = seq(0, 64, by = 4), sec.axis = ggplot2::sec_axis(~ . / 4.348, name = "Age in Months", breaks = seq(0, 14, by = 1))) + ggplot2::scale_x_continuous(breaks = seq(0, max_week_both, by = 4), sec.axis = ggplot2::sec_axis(~ . / 4.348, name = "Age in Months", breaks = seq(0, 14, by = 1))) } else if (x_var == "Date") { ggplot2::scale_x_date(breaks = "1 month", date_labels = "%b-%Y", sec.axis = ggplot2::sec_axis(~ ., name = "Age in Months", breaks = scales::breaks_pretty())) } From d075302efa238af467ea9100359cc9bfcc0a0a8d Mon Sep 17 00:00:00 2001 From: Timon Date: Mon, 16 Feb 2026 10:11:43 +0100 Subject: [PATCH 2/5] feat: Add spectral extraction and analysis scripts for SC-161 and SC-162 - Implemented `run_spectral_extraction.py` to batch extract spectral indices (NDVI, BSI, NDWI) from 4-band TIFF files, saving results in a structured CSV format. - Created `spectral_features.py` for calculating various spectral indices, including NDVI, BSI, NDWI, CI_green, CI_red, GNDVI, SAVI, and EVI2. - Added Jupyter notebook `02_season_normalization_analysis.ipynb` for analyzing season-length normalization, comparing ESA and Angata spectral indices, and visualizing growth patterns. - Included functions for computing season age, plotting trajectories, and performing peak and amplitude analysis. --- .gitignore | 4 + .../00_validate_pivot_geojson.ipynb | 258 ++++++++++ .../01_spectral_feature_exploration.py | 431 ++++++++++++++++ .../02_season_normalization_analysis.ipynb | 465 ++++++++++++++++++ .../run_spectral_extraction.py | 199 ++++++++ .../angata_improvements/spectral_features.py | 241 +++++++++ r_app/parameters_project.R | 183 +------ 7 files changed, 1602 insertions(+), 179 deletions(-) create mode 100644 python_app/harvest_detection_experiments/angata_improvements/00_validate_pivot_geojson.ipynb create mode 100644 python_app/harvest_detection_experiments/angata_improvements/01_spectral_feature_exploration.py create mode 100644 python_app/harvest_detection_experiments/angata_improvements/02_season_normalization_analysis.ipynb create mode 100644 python_app/harvest_detection_experiments/angata_improvements/run_spectral_extraction.py create mode 100644 python_app/harvest_detection_experiments/angata_improvements/spectral_features.py diff --git a/.gitignore b/.gitignore index ba19ced..04de0ff 100644 --- a/.gitignore +++ b/.gitignore @@ -45,6 +45,10 @@ output/ reports/ *.docx +# Experiment Outputs (temporary plots, analysis artifacts) +python_app/harvest_detection_experiments/*/plots/ +python_app/harvest_detection_experiments/*/*.ipynb_checkpoints/ + # Cache Files rosm.cache/ *.cache diff --git a/python_app/harvest_detection_experiments/angata_improvements/00_validate_pivot_geojson.ipynb b/python_app/harvest_detection_experiments/angata_improvements/00_validate_pivot_geojson.ipynb new file mode 100644 index 0000000..b15edbe --- /dev/null +++ b/python_app/harvest_detection_experiments/angata_improvements/00_validate_pivot_geojson.ipynb @@ -0,0 +1,258 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "d90f7f7f", + "metadata": {}, + "source": [ + "# Validate & Explore ESA `pivot.geojson`\n", + "\n", + "Quick inspection of the field boundary file for the ESA project:\n", + "- Geometry validity, CRS, field count\n", + "- Area/perimeter statistics\n", + "- Map visualization with field labels\n", + "- Export summary CSV for downstream use" + ] + }, + { + "cell_type": "markdown", + "id": "e99594bb", + "metadata": {}, + "source": [ + "## 1. Import Required Libraries" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "652485c7", + "metadata": {}, + "outputs": [], + "source": [ + "import geopandas as gpd\n", + "import pandas as pd\n", + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "from pathlib import Path\n", + "from shapely.validation import make_valid\n", + "\n", + "# Project paths\n", + "PROJECT = \"esa\"\n", + "REPO_ROOT = Path.cwd().parents[2] # SmartCane_code\n", + "PROJECT_STORAGE = REPO_ROOT / \"laravel_app\" / \"storage\" / \"app\" / PROJECT\n", + "GEOJSON_PATH = PROJECT_STORAGE / \"pivot.geojson\"\n", + "DATA_DIR = PROJECT_STORAGE / \"Data\"\n", + "DATA_DIR.mkdir(parents=True, exist_ok=True)\n", + "\n", + "print(f\"Repo root: {REPO_ROOT}\")\n", + "print(f\"GeoJSON: {GEOJSON_PATH}\")\n", + "print(f\"Exists: {GEOJSON_PATH.exists()}\")" + ] + }, + { + "cell_type": "markdown", + "id": "4ff6d825", + "metadata": {}, + "source": [ + "## 2. Load GeoJSON & Basic Info" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "92012a8c", + "metadata": {}, + "outputs": [], + "source": [ + "gdf = gpd.read_file(GEOJSON_PATH)\n", + "\n", + "print(f\"Features: {len(gdf)}\")\n", + "print(f\"Geometry type: {gdf.geom_type.unique()}\")\n", + "print(f\"CRS: {gdf.crs}\")\n", + "print(f\"Bounds: {gdf.total_bounds}\")\n", + "print(f\"\\nColumns: {list(gdf.columns)}\")\n", + "print(f\"\\nAttribute table:\")\n", + "gdf.drop(columns=\"geometry\")" + ] + }, + { + "cell_type": "markdown", + "id": "edadb2de", + "metadata": {}, + "source": [ + "## 3. Validate Geometries" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "15b5e915", + "metadata": {}, + "outputs": [], + "source": [ + "# Check validity\n", + "validity = gdf.is_valid\n", + "empty = gdf.is_empty\n", + "\n", + "print(f\"Valid geometries: {validity.sum()}/{len(gdf)}\")\n", + "print(f\"Invalid geometries: {(~validity).sum()}\")\n", + "print(f\"Empty geometries: {empty.sum()}\")\n", + "\n", + "# Repair if needed\n", + "if not validity.all():\n", + " print(\"\\nRepairing invalid geometries...\")\n", + " gdf[\"geometry\"] = gdf[\"geometry\"].apply(make_valid)\n", + " print(f\"After repair: {gdf.is_valid.sum()}/{len(gdf)} valid\")\n", + "\n", + "# Remove empty\n", + "if empty.any():\n", + " gdf = gdf[~gdf.is_empty]\n", + " print(f\"After removing empty: {len(gdf)} features remaining\")" + ] + }, + { + "cell_type": "markdown", + "id": "c3f01400", + "metadata": {}, + "source": [ + "## 4. Field-Level Statistics (Area, Perimeter, Centroid)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "922ffdbd", + "metadata": {}, + "outputs": [], + "source": [ + "# Reproject to UTM for accurate area/perimeter (auto-detect UTM zone from centroid)\n", + "centroid = gdf.dissolve().centroid.iloc[0]\n", + "utm_zone = int((centroid.x + 180) / 6) + 1\n", + "hemisphere = \"north\" if centroid.y >= 0 else \"south\"\n", + "utm_epsg = 32600 + utm_zone if hemisphere == \"north\" else 32700 + utm_zone\n", + "print(f\"Auto-detected UTM zone: {utm_zone}{hemisphere[0].upper()} (EPSG:{utm_epsg})\")\n", + "\n", + "gdf_utm = gdf.to_crs(epsg=utm_epsg)\n", + "\n", + "# Compute stats\n", + "gdf_utm[\"area_m2\"] = gdf_utm.geometry.area\n", + "gdf_utm[\"area_ha\"] = gdf_utm[\"area_m2\"] / 10_000\n", + "gdf_utm[\"area_acres\"] = gdf_utm[\"area_ha\"] * 2.47105\n", + "gdf_utm[\"perimeter_m\"] = gdf_utm.geometry.length\n", + "centroids = gdf_utm.geometry.centroid\n", + "gdf_utm[\"centroid_x\"] = centroids.x\n", + "gdf_utm[\"centroid_y\"] = centroids.y\n", + "\n", + "# Summary table\n", + "stats = gdf_utm[[\"field\", \"area_ha\", \"area_acres\", \"perimeter_m\", \"centroid_x\", \"centroid_y\"]].copy()\n", + "stats = stats.sort_values(\"area_ha\", ascending=False).reset_index(drop=True)\n", + "print(f\"\\nTotal area: {stats['area_ha'].sum():.1f} ha ({stats['area_acres'].sum():.1f} acres)\")\n", + "print(f\"Fields: {len(stats)}\")\n", + "print(f\"Area range: {stats['area_ha'].min():.1f} – {stats['area_ha'].max():.1f} ha\\n\")\n", + "stats" + ] + }, + { + "cell_type": "markdown", + "id": "3dbf84c7", + "metadata": {}, + "source": [ + "## 5. Visualize Field Boundaries" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "be1be0ea", + "metadata": {}, + "outputs": [], + "source": [ + "fig, ax = plt.subplots(1, 1, figsize=(12, 10))\n", + "gdf_utm.plot(ax=ax, column=\"field\", legend=True, edgecolor=\"black\", linewidth=0.8,\n", + " cmap=\"Set3\", alpha=0.7, legend_kwds={\"loc\": \"upper left\", \"fontsize\": 8})\n", + "\n", + "# Add field labels at centroids\n", + "for _, row in gdf_utm.iterrows():\n", + " c = row.geometry.centroid\n", + " ax.annotate(row[\"field\"], xy=(c.x, c.y), ha=\"center\", va=\"center\",\n", + " fontsize=7, fontweight=\"bold\",\n", + " bbox=dict(boxstyle=\"round,pad=0.2\", fc=\"white\", alpha=0.7))\n", + "\n", + "ax.set_title(f\"ESA Field Boundaries ({len(gdf_utm)} fields)\", fontsize=14)\n", + "ax.set_xlabel(\"Easting (m)\")\n", + "ax.set_ylabel(\"Northing (m)\")\n", + "plt.tight_layout()\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "id": "52301688", + "metadata": {}, + "source": [ + "## 6. CRS Check & Comparison" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d4c712fa", + "metadata": {}, + "outputs": [], + "source": [ + "# Original CRS info\n", + "print(\"=== Original CRS ===\")\n", + "print(f\"CRS: {gdf.crs}\")\n", + "print(f\"Is geographic: {gdf.crs.is_geographic if gdf.crs else 'No CRS'}\")\n", + "print(f\"Is projected: {gdf.crs.is_projected if gdf.crs else 'No CRS'}\")\n", + "\n", + "if gdf.crs and gdf.crs.is_geographic:\n", + " # Compare area in geographic vs projected CRS\n", + " area_geographic = gdf.geometry.area # in degrees² (meaningless)\n", + " area_projected = gdf_utm.geometry.area # in m²\n", + " \n", + " print(f\"\\n=== Area comparison ===\")\n", + " print(f\"Geographic CRS area (degrees²): meaningless for spatial analysis\")\n", + " print(f\"Projected UTM area (EPSG:{utm_epsg}):\")\n", + " for _, row in gdf_utm.iterrows():\n", + " print(f\" {row['field']}: {row['area_ha']:.2f} ha\")\n", + " print(f\"\\nAlways use projected CRS (UTM) for area/distance calculations.\")" + ] + }, + { + "cell_type": "markdown", + "id": "3a21236a", + "metadata": {}, + "source": [ + "## 7. Export Summary Table" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "acd4b3c7", + "metadata": {}, + "outputs": [], + "source": [ + "# Save summary CSV to project data directory\n", + "output_csv = DATA_DIR / \"field_summary.csv\"\n", + "stats.to_csv(output_csv, index=False)\n", + "print(f\"Saved field summary to: {output_csv}\")\n", + "print(f\"\\nFinal summary:\")\n", + "print(f\" Project: {PROJECT}\")\n", + "print(f\" Fields: {len(stats)}\")\n", + "print(f\" Total area: {stats['area_ha'].sum():.1f} ha ({stats['area_acres'].sum():.1f} acres)\")\n", + "print(f\" CRS: {gdf.crs} (original) → EPSG:{utm_epsg} (projected)\")\n", + "print(f\" All valid: {gdf.is_valid.all()}\")\n", + "stats" + ] + } + ], + "metadata": { + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/python_app/harvest_detection_experiments/angata_improvements/01_spectral_feature_exploration.py b/python_app/harvest_detection_experiments/angata_improvements/01_spectral_feature_exploration.py new file mode 100644 index 0000000..32456bf --- /dev/null +++ b/python_app/harvest_detection_experiments/angata_improvements/01_spectral_feature_exploration.py @@ -0,0 +1,431 @@ +""" +01_spectral_feature_exploration.py — SC-161 Exploration +======================================================= +Explore spectral indices extracted from ESA 4-band TIFFs. + +Indices (8 total from RGB+NIR): + NDVI, BSI, NDWI, CI_green, CI_red, GNDVI, SAVI, EVI2 + +Key questions: +1. How do indices correlate with each other? Which are redundant? +2. Does any index capture harvest signal better than CI? +3. Which indices add independent information beyond CI_green? +4. Pre/post harvest distribution shifts per index + +Usage: + cd python_app/harvest_detection_experiments/angata_improvements + python 01_spectral_feature_exploration.py +""" + +import matplotlib +matplotlib.use("Agg") # Non-interactive backend — save only, no window blocking + +import pandas as pd +import numpy as np +import matplotlib.pyplot as plt +import matplotlib.dates as mdates +from pathlib import Path + +# ============================================================================= +# CONFIG +# ============================================================================= +PROJECT = "esa" +REPO_ROOT = Path(__file__).resolve().parents[3] +PROJECT_STORAGE = REPO_ROOT / "laravel_app" / "storage" / "app" / PROJECT + +SPECTRAL_CSV = PROJECT_STORAGE / "Data" / "extracted_ci" / "ci_data_for_python" / "spectral_indices.csv" +HARVEST_XLSX = PROJECT_STORAGE / "harvest.xlsx" + +# Output directory for plots +OUTPUT_DIR = Path(__file__).parent / "plots" +OUTPUT_DIR.mkdir(exist_ok=True) + +# All mean index columns +INDEX_COLS = [ + "mean_ndvi", "mean_bsi", "mean_ndwi", "mean_ci_green", + "mean_ci_red", "mean_gndvi", "mean_savi", "mean_evi2", +] +INDEX_LABELS = { + "mean_ndvi": "NDVI", + "mean_bsi": "BSI", + "mean_ndwi": "NDWI", + "mean_ci_green": "CI_green", + "mean_ci_red": "CI_red", + "mean_gndvi": "GNDVI", + "mean_savi": "SAVI", + "mean_evi2": "EVI2", +} +INDEX_COLORS = { + "mean_ndvi": "green", + "mean_bsi": "saddlebrown", + "mean_ndwi": "steelblue", + "mean_ci_green": "darkgreen", + "mean_ci_red": "darkred", + "mean_gndvi": "olive", + "mean_savi": "teal", + "mean_evi2": "purple", +} + +# ============================================================================= +# LOAD DATA +# ============================================================================= +print("=" * 80) +print("SC-161: SPECTRAL FEATURE EXPLORATION — ESA (8 indices)") +print("=" * 80) + +# Spectral indices +si = pd.read_csv(SPECTRAL_CSV, parse_dates=["Date"]) +si = si[si["field"] != "00F25"] # drop always-NaN field +print(f"\nSpectral indices: {len(si)} rows, {si['field'].nunique()} fields") +print(f" Date range: {si['Date'].min().date()} → {si['Date'].max().date()}") +print(f" Columns: {[c for c in si.columns if c.startswith('mean_')]}") +for col in INDEX_COLS: + if col in si.columns: + nan_pct = si[col].isna().mean() * 100 + print(f" {INDEX_LABELS[col]:10s} NaN: {nan_pct:.1f}% mean: {si[col].mean():.4f} range: [{si[col].min():.4f}, {si[col].max():.4f}]") + +# Harvest data +harvest = pd.read_excel(HARVEST_XLSX) +harvest["season_start"] = pd.to_datetime(harvest["season_start"], errors="coerce") +harvest["season_end"] = pd.to_datetime(harvest["season_end"], errors="coerce") + +# Filter to fields we have spectral data for +our_fields = set(si["field"].unique()) +harvest = harvest[harvest["field"].isin(our_fields)].copy() +# Only keep seasons with an actual end date (= confirmed harvest) +harvest_events = harvest.dropna(subset=["season_end"]).copy() +print(f"\nHarvest events (confirmed): {len(harvest_events)} across {harvest_events['field'].nunique()} fields") + + +# ============================================================================= +# 1. FULL CORRELATION MATRIX — all 8 indices +# ============================================================================= +print("\n--- Full correlation matrix (8 indices) ---") + +available_cols = [c for c in INDEX_COLS if c in si.columns] +valid = si.dropna(subset=available_cols) +corr_matrix = valid[available_cols].corr() + +# Print in readable format +labels = [INDEX_LABELS[c] for c in available_cols] +print(f"\n{'':>12s}", " ".join(f"{l:>10s}" for l in labels)) +for i, (col, label) in enumerate(zip(available_cols, labels)): + vals = " ".join(f"{corr_matrix.iloc[i, j]:>10.3f}" for j in range(len(available_cols))) + print(f"{label:>12s} {vals}") + +# Heatmap +fig, ax = plt.subplots(figsize=(10, 8)) +im = ax.imshow(corr_matrix.values, cmap="RdBu_r", vmin=-1, vmax=1, aspect="equal") +ax.set_xticks(range(len(labels))) +ax.set_xticklabels(labels, rotation=45, ha="right", fontsize=10) +ax.set_yticks(range(len(labels))) +ax.set_yticklabels(labels, fontsize=10) +ax.set_title("Spectral Index Correlation Matrix (ESA)", fontsize=14) + +# Annotate cells +for i in range(len(labels)): + for j in range(len(labels)): + val = corr_matrix.iloc[i, j] + txt_color = "white" if abs(val) > 0.7 else "black" + ax.text(j, i, f"{val:.2f}", ha="center", va="center", fontsize=9, color=txt_color) + +plt.colorbar(im, ax=ax, label="Pearson r", shrink=0.8) +plt.tight_layout() +fig.savefig(OUTPUT_DIR / "correlation_matrix_all_indices.png", dpi=150, bbox_inches="tight") +plt.close() +print(f" → Saved correlation_matrix_all_indices.png") + + +# ============================================================================= +# 2. PER-FIELD CORRELATION — CI_green vs each index +# ============================================================================= +print("\n--- Per-field correlation: CI_green vs each index ---") + +ci_col = "mean_ci_green" +other_cols = [c for c in available_cols if c != ci_col] + +fig, axes = plt.subplots(2, 4, figsize=(20, 10)) +axes_flat = axes.flatten() + +for i, col in enumerate(other_cols): + ax = axes_flat[i] + label = INDEX_LABELS[col] + + field_corrs = valid.groupby("field").apply( + lambda g: g[ci_col].corr(g[col]) if len(g) > 30 else np.nan + ).dropna() + + ax.hist(field_corrs, bins=20, color=INDEX_COLORS.get(col, "gray"), edgecolor="white", alpha=0.8) + ax.axvline(field_corrs.mean(), color="red", linestyle="--", linewidth=2, + label=f"mean={field_corrs.mean():.3f}") + ax.set_xlabel(f"r(CI_green, {label})") + ax.set_ylabel("Field count") + ax.set_title(f"CI_green vs {label}") + ax.legend(fontsize=8) + ax.set_xlim(-1, 1) + + print(f" CI_green vs {label:8s}: mean r = {field_corrs.mean():.4f}, std = {field_corrs.std():.4f}") + +# Remove unused subplot +if len(other_cols) < len(axes_flat): + for j in range(len(other_cols), len(axes_flat)): + axes_flat[j].set_visible(False) + +fig.suptitle("Per-field correlation: CI_green vs each index", fontsize=14, fontweight="bold") +plt.tight_layout() +fig.savefig(OUTPUT_DIR / "ci_green_vs_all_per_field_corr.png", dpi=150, bbox_inches="tight") +plt.close() +print(f" → Saved ci_green_vs_all_per_field_corr.png") + + +# ============================================================================= +# 3. PRE/POST HARVEST SHIFTS — all indices +# ============================================================================= +print("\n--- Pre/post harvest distribution shifts (all indices) ---") + +WINDOW_DAYS = 30 + +shift_results = {} + +for col in available_cols: + label = INDEX_LABELS[col] + pre_vals = [] + post_vals = [] + + for _, h in harvest_events.iterrows(): + field_data = si[si["field"] == h["field"]].copy() + if field_data.empty: + continue + harvest_date = h["season_end"] + + pre = field_data[ + (field_data["Date"] >= harvest_date - pd.Timedelta(days=WINDOW_DAYS)) + & (field_data["Date"] < harvest_date) + ][col].dropna() + pre_vals.extend(pre.tolist()) + + post = field_data[ + (field_data["Date"] > harvest_date) + & (field_data["Date"] <= harvest_date + pd.Timedelta(days=WINDOW_DAYS)) + ][col].dropna() + post_vals.extend(post.tolist()) + + pre_arr = np.array(pre_vals) + post_arr = np.array(post_vals) + delta = post_arr.mean() - pre_arr.mean() + # Effect size (Cohen's d) + pooled_std = np.sqrt((pre_arr.std() ** 2 + post_arr.std() ** 2) / 2) + cohens_d = delta / pooled_std if pooled_std > 0 else 0 + + shift_results[col] = { + "label": label, + "pre_mean": pre_arr.mean(), + "post_mean": post_arr.mean(), + "delta": delta, + "abs_delta": abs(delta), + "cohens_d": cohens_d, + "pre_n": len(pre_arr), + "post_n": len(post_arr), + } + +# Print sorted by absolute Cohen's d (effect size) +print(f"\n{'Index':>12s} {'Pre':>8s} {'Post':>8s} {'Delta':>8s} {'Cohen d':>8s} {'n_pre':>6s} {'n_post':>6s}") +for col in sorted(shift_results.keys(), key=lambda c: abs(shift_results[c]["cohens_d"]), reverse=True): + r = shift_results[col] + print(f"{r['label']:>12s} {r['pre_mean']:>8.4f} {r['post_mean']:>8.4f} {r['delta']:>+8.4f} {r['cohens_d']:>+8.3f} {r['pre_n']:>6d} {r['post_n']:>6d}") + +# Bar chart of effect sizes +fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6)) + +sorted_cols = sorted(shift_results.keys(), key=lambda c: abs(shift_results[c]["cohens_d"]), reverse=True) +labels_sorted = [shift_results[c]["label"] for c in sorted_cols] +deltas = [shift_results[c]["delta"] for c in sorted_cols] +cohens_ds = [shift_results[c]["cohens_d"] for c in sorted_cols] +colors = [INDEX_COLORS.get(c, "gray") for c in sorted_cols] + +# Raw delta +ax1.barh(labels_sorted, deltas, color=colors, alpha=0.8, edgecolor="white") +ax1.set_xlabel("Post - Pre harvest mean") +ax1.set_title("Raw shift at harvest (±30 days)") +ax1.axvline(0, color="black", linewidth=0.5) +ax1.grid(True, alpha=0.3, axis="x") + +# Cohen's d +ax2.barh(labels_sorted, [abs(d) for d in cohens_ds], color=colors, alpha=0.8, edgecolor="white") +ax2.set_xlabel("|Cohen's d| (effect size)") +ax2.set_title("Harvest signal strength (effect size)") +ax2.axvline(0.2, color="gray", linestyle=":", label="small (0.2)") +ax2.axvline(0.5, color="gray", linestyle="--", label="medium (0.5)") +ax2.axvline(0.8, color="gray", linestyle="-", label="large (0.8)") +ax2.legend(fontsize=8) +ax2.grid(True, alpha=0.3, axis="x") + +plt.tight_layout() +fig.savefig(OUTPUT_DIR / "harvest_signal_all_indices.png", dpi=150, bbox_inches="tight") +plt.close() +print(f" → Saved harvest_signal_all_indices.png") + + +# ============================================================================= +# 4. PRE/POST DISTRIBUTIONS — top 4 by harvest signal +# ============================================================================= +print("\n--- Pre/post harvest distributions (top 4 indices by effect size) ---") + +top4_cols = sorted(shift_results.keys(), key=lambda c: abs(shift_results[c]["cohens_d"]), reverse=True)[:4] + +fig, axes = plt.subplots(1, 4, figsize=(20, 5)) + +for ax, col in zip(axes, top4_cols): + label = INDEX_LABELS[col] + pre_vals = [] + post_vals = [] + for _, h in harvest_events.iterrows(): + field_data = si[si["field"] == h["field"]].copy() + if field_data.empty: + continue + harvest_date = h["season_end"] + pre = field_data[ + (field_data["Date"] >= harvest_date - pd.Timedelta(days=WINDOW_DAYS)) + & (field_data["Date"] < harvest_date) + ][col].dropna() + pre_vals.extend(pre.tolist()) + post = field_data[ + (field_data["Date"] > harvest_date) + & (field_data["Date"] <= harvest_date + pd.Timedelta(days=WINDOW_DAYS)) + ][col].dropna() + post_vals.extend(post.tolist()) + + ax.hist(pre_vals, bins=40, alpha=0.6, color="green", label=f"Pre (n={len(pre_vals)})", density=True) + ax.hist(post_vals, bins=40, alpha=0.6, color="brown", label=f"Post (n={len(post_vals)})", density=True) + d = shift_results[col]["cohens_d"] + ax.set_title(f"{label} (d={d:+.2f})") + ax.set_xlabel(label) + ax.set_ylabel("Density") + ax.legend(fontsize=8) + ax.grid(True, alpha=0.3) + +fig.suptitle(f"Pre/Post Harvest ±{WINDOW_DAYS}d — Top 4 by effect size", fontsize=13, fontweight="bold") +plt.tight_layout() +fig.savefig(OUTPUT_DIR / "pre_post_harvest_top4.png", dpi=150, bbox_inches="tight") +plt.close() +print(f" → Saved pre_post_harvest_top4.png") + + +# ============================================================================= +# 5. TIMESERIES — top field, all 8 indices +# ============================================================================= +print("\n--- Timeseries: all indices for top harvest-rich fields ---") + +top_fields = harvest_events.groupby("field").size().sort_values(ascending=False).head(3).index.tolist() +print(f"Top fields: {top_fields}") + +for field_id in top_fields: + field_data = si[si["field"] == field_id].sort_values("Date").copy() + field_harvests = harvest_events[harvest_events["field"] == field_id] + + fig, axes = plt.subplots(4, 2, figsize=(20, 16), sharex=True) + fig.suptitle(f"Field {field_id} — All Spectral Indices", fontsize=14, fontweight="bold") + + for ax, col in zip(axes.flatten(), available_cols): + label = INDEX_LABELS[col] + color = INDEX_COLORS.get(col, "gray") + vals = field_data.dropna(subset=[col]) + + ax.plot(vals["Date"], vals[col], color=color, linewidth=0.6, alpha=0.7) + + # 14-day rolling mean + if len(vals) > 14: + rolling = vals.set_index("Date")[col].rolling("14D", min_periods=3).mean() + ax.plot(rolling.index, rolling.values, color=color, linewidth=2, alpha=0.8) + + # Harvest dates + for _, h in field_harvests.iterrows(): + ax.axvline(h["season_end"], color="red", linestyle="--", alpha=0.6, linewidth=1) + + ax.set_ylabel(label, fontsize=10, fontweight="bold") + ax.grid(True, alpha=0.3) + + axes[-1, 0].xaxis.set_major_formatter(mdates.DateFormatter("%Y")) + axes[-1, 1].xaxis.set_major_formatter(mdates.DateFormatter("%Y")) + + plt.tight_layout() + fig.savefig(OUTPUT_DIR / f"all_indices_{field_id}.png", dpi=150, bbox_inches="tight") + plt.close() + print(f" → Saved all_indices_{field_id}.png") + + +# ============================================================================= +# 6. REDUNDANCY ANALYSIS — which indices add information beyond CI_green? +# ============================================================================= +print("\n--- Redundancy analysis: residual variance after regressing vs CI_green ---") + +from numpy.polynomial import polynomial as P + +residual_variance = {} + +for col in [c for c in available_cols if c != ci_col]: + label = INDEX_LABELS[col] + pair = valid[[ci_col, col]].dropna() + if len(pair) < 100: + continue + + x, y = pair[ci_col].values, pair[col].values + # Fit linear regression CI_green → index + coeffs = P.polyfit(x, y, deg=1) + y_pred = P.polyval(x, coeffs) + residuals = y - y_pred + total_var = np.var(y) + resid_var = np.var(residuals) + r_squared = 1 - resid_var / total_var + + residual_variance[col] = { + "label": label, + "r_squared": r_squared, + "residual_pct": (1 - r_squared) * 100, + } + +print(f"\n{'Index':>12s} {'R² vs CI':>10s} {'Residual %':>12s} {'Interpretation'}") +for col in sorted(residual_variance.keys(), key=lambda c: residual_variance[c]["residual_pct"], reverse=True): + r = residual_variance[col] + interp = "UNIQUE signal" if r["residual_pct"] > 20 else "Some unique" if r["residual_pct"] > 5 else "Redundant" + print(f"{r['label']:>12s} {r['r_squared']:>10.4f} {r['residual_pct']:>11.1f}% {interp}") + + +# ============================================================================= +# 7. SUMMARY +# ============================================================================= +print("\n" + "=" * 80) +print("SUMMARY — SC-161 Expanded Index Analysis") +print("=" * 80) + +# Best harvest signal +best_col = sorted(shift_results.keys(), key=lambda c: abs(shift_results[c]["cohens_d"]), reverse=True)[0] +best = shift_results[best_col] + +# Most unique signal (highest residual variance vs CI_green) +most_unique = sorted(residual_variance.keys(), key=lambda c: residual_variance[c]["residual_pct"], reverse=True) + +print(f""" +INDICES COMPUTED: {', '.join(INDEX_LABELS[c] for c in available_cols)} +DATA: {si['field'].nunique()} fields, {si['Date'].nunique()} dates + +HARVEST SIGNAL (strongest → weakest by Cohen's d): +""") +for col in sorted(shift_results.keys(), key=lambda c: abs(shift_results[c]["cohens_d"]), reverse=True): + r = shift_results[col] + print(f" {r['label']:>12s}: Cohen's d = {r['cohens_d']:+.3f} (Δ = {r['delta']:+.4f})") + +print(f""" +REDUNDANCY vs CI_green (unique information): +""") +for col in most_unique: + r = residual_variance[col] + print(f" {r['label']:>12s}: {r['residual_pct']:.1f}% unique variance (R² = {r['r_squared']:.3f})") + +print(f""" +RECOMMENDATION: + Best harvest signal: {best['label']} (d={best['cohens_d']:+.3f}) + Most unique vs CI: {residual_variance[most_unique[0]]['label']} ({residual_variance[most_unique[0]]['residual_pct']:.1f}% independent) + +Plots saved to: {OUTPUT_DIR} +""") diff --git a/python_app/harvest_detection_experiments/angata_improvements/02_season_normalization_analysis.ipynb b/python_app/harvest_detection_experiments/angata_improvements/02_season_normalization_analysis.ipynb new file mode 100644 index 0000000..b1c48a0 --- /dev/null +++ b/python_app/harvest_detection_experiments/angata_improvements/02_season_normalization_analysis.ipynb @@ -0,0 +1,465 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "e9cdc8ee", + "metadata": {}, + "source": [ + "# SC-162: Season-Length Normalization Analysis\n", + "## Compare ESA vs Angata Spectral Indices (CI_green, SAVI)\n", + "\n", + "Multi-year data analysis focusing on:\n", + "- **Peak timing** (at what season-age % do peaks occur?)\n", + "- **Amplitude** (how high/low?)\n", + "- **Phase** (lag between indices)\n", + "- **Shape** (growth curve pattern)\n", + "\n", + "Note: Cannot compare absolute DOY across multiple years, but CAN compare normalized season age (0-1 scale) and pattern shapes." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "59943da7", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "import matplotlib.dates as mdates\n", + "from pathlib import Path\n", + "from scipy import stats\n", + "import warnings\n", + "warnings.filterwarnings('ignore')\n", + "\n", + "# Configuration\n", + "REPO_ROOT = Path.cwd().parent.parent.parent.parent\n", + "print(f\"Repo root: {REPO_ROOT}\")\n", + "\n", + "# Project-specific config\n", + "PROJECTS = {\n", + " 'esa': {\n", + " 'expected_season_length': 365,\n", + " 'storage_path': REPO_ROOT / 'laravel_app' / 'storage' / 'app' / 'esa',\n", + " },\n", + " 'angata': {\n", + " 'expected_season_length': 540,\n", + " 'storage_path': REPO_ROOT / 'laravel_app' / 'storage' / 'app' / 'angata',\n", + " }\n", + "}\n", + "\n", + "# Load data for both projects\n", + "data = {}\n", + "harvest_data = {}\n", + "\n", + "for project, config in PROJECTS.items():\n", + " print(f\"\\n{'='*60}\")\n", + " print(f\"Loading {project.upper()}\")\n", + " print(f\"{'='*60}\")\n", + " \n", + " # Spectral indices CSV\n", + " csv_path = config['storage_path'] / 'Data' / 'extracted_ci' / 'ci_data_for_python' / 'spectral_indices.csv'\n", + " df = pd.read_csv(csv_path, parse_dates=['Date'])\n", + " df = df[df['field'] != '00F25'] # Remove always-NaN field if present\n", + " print(f\"Spectral data: {len(df)} rows, {df['field'].nunique()} fields\")\n", + " print(f\"Date range: {df['Date'].min().date()} → {df['Date'].max().date()}\")\n", + " \n", + " # Harvest dates\n", + " harvest_path = config['storage_path'] / 'Data' / 'harvest.xlsx'\n", + " harvest_df = pd.read_excel(harvest_path)\n", + " harvest_df['season_start'] = pd.to_datetime(harvest_df['season_start'], errors='coerce')\n", + " harvest_df['season_end'] = pd.to_datetime(harvest_df['season_end'], errors='coerce')\n", + " \n", + " # Filter to fields we have spectral data for\n", + " our_fields = set(df['field'].unique())\n", + " harvest_df = harvest_df[harvest_df['field'].isin(our_fields)].copy()\n", + " harvest_events = harvest_df.dropna(subset=['season_end']).copy()\n", + " print(f\"Harvest events: {len(harvest_events)} across {harvest_events['field'].nunique()} fields\")\n", + " \n", + " data[project] = df\n", + " harvest_data[project] = harvest_events" + ] + }, + { + "cell_type": "markdown", + "id": "260b405b", + "metadata": {}, + "source": [ + "## Step 1: Compute Season Age for Each Observation\n", + "\n", + "For each field-date pair, calculate days since season start and normalize to 0-1 scale." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "55a0f94f", + "metadata": {}, + "outputs": [], + "source": [ + "def compute_season_age(spec_df, harvest_df, project_season_length):\n", + " \"\"\"\n", + " Compute normalized season age (0-1) for each row.\n", + " \n", + " For each field, use the earliest season_start from harvest data as the baseline.\n", + " If no season_start available, use the earliest date in spectral data.\n", + " \"\"\"\n", + " spec_copy = spec_df.copy()\n", + " spec_copy['season_age'] = np.nan\n", + " spec_copy['days_since_start'] = np.nan\n", + " \n", + " for field_id in spec_copy['field'].unique():\n", + " # Get season start for this field\n", + " field_harvest = harvest_df[harvest_df['field'] == field_id]\n", + " \n", + " if not field_harvest.empty and not field_harvest['season_start'].isna().all():\n", + " season_start = field_harvest['season_start'].min()\n", + " else:\n", + " # Fallback: use first observation date\n", + " field_data = spec_copy[spec_copy['field'] == field_id]\n", + " season_start = field_data['Date'].min()\n", + " \n", + " # Compute for all rows of this field\n", + " field_mask = spec_copy['field'] == field_id\n", + " days_since = (spec_copy.loc[field_mask, 'Date'] - season_start).dt.days\n", + " spec_copy.loc[field_mask, 'days_since_start'] = days_since\n", + " spec_copy.loc[field_mask, 'season_age'] = days_since / project_season_length\n", + " \n", + " return spec_copy\n", + "\n", + "# Compute for both projects\n", + "for project, config in PROJECTS.items():\n", + " data[project] = compute_season_age(\n", + " data[project],\n", + " harvest_data[project],\n", + " config['expected_season_length']\n", + " )\n", + " print(f\"{project.upper()} season_age range: [{data[project]['season_age'].min():.3f}, {data[project]['season_age'].max():.3f}]\")" + ] + }, + { + "cell_type": "markdown", + "id": "b946c36c", + "metadata": {}, + "source": [ + "## Step 2: Plot CI_green + SAVI Trajectories by Project\n", + "\n", + "Multi-year overlay to compare:\n", + "- Peak timing (at what season-age %?)\n", + "- Amplitude (range of values)\n", + "- Phase (timing relative to each other)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c8de5a03", + "metadata": {}, + "outputs": [], + "source": [ + "# Create figure: 2 projects × 2 indices = 4 subplots\n", + "fig, axes = plt.subplots(2, 2, figsize=(18, 12))\n", + "\n", + "indices = ['mean_ci_green', 'mean_savi']\n", + "index_labels = {'mean_ci_green': 'CI_green (Chlorophyll)', 'mean_savi': 'SAVI (Soil-Adjusted)'}\n", + "\n", + "for row, project in enumerate(['esa', 'angata']):\n", + " df = data[project]\n", + " season_len = PROJECTS[project]['expected_season_length']\n", + " \n", + " for col, idx in enumerate(indices):\n", + " ax = axes[row, col]\n", + " \n", + " # Plot all field timeseries (normalized season age)\n", + " for field in df['field'].unique():\n", + " field_data = df[df['field'] == field].dropna(subset=[idx, 'season_age']).sort_values('season_age')\n", + " ax.plot(field_data['season_age'], field_data[idx], alpha=0.15, color='steelblue', linewidth=0.8)\n", + " \n", + " # Overlay mean trajectory with rolling average\n", + " binned = df.dropna(subset=[idx, 'season_age']).copy()\n", + " binned['season_age_bin'] = pd.cut(binned['season_age'], bins=20)\n", + " mean_trajectory = binned.groupby('season_age_bin')[idx].mean()\n", + " bin_centers = [interval.mid for interval in mean_trajectory.index]\n", + " ax.plot(bin_centers, mean_trajectory.values, color='red', linewidth=3, label='Average', zorder=10)\n", + " \n", + " ax.set_xlabel('Season Age (0 = start, 1 = expected end)', fontsize=11)\n", + " ax.set_ylabel(index_labels[idx], fontsize=11)\n", + " ax.set_title(f\"{project.upper()} - {index_labels[idx]}\\n(Season length: {season_len}d)\", fontsize=12, fontweight='bold')\n", + " ax.set_xlim(0, 1.2) # Allow slight overgrowth\n", + " ax.grid(True, alpha=0.3)\n", + " ax.legend(fontsize=9)\n", + "\n", + "plt.tight_layout()\n", + "plt.savefig('season_normalization_trajectories.png', dpi=150, bbox_inches='tight')\n", + "plt.show()\n", + "print(\"\\nSaved: season_normalization_trajectories.png\")" + ] + }, + { + "cell_type": "markdown", + "id": "f60a0dc5", + "metadata": {}, + "source": [ + "## Step 3: Peak Analysis - Where do maxima occur?" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6e149843", + "metadata": {}, + "outputs": [], + "source": [ + "def find_peaks_per_field(df, index_col, season_age_col='season_age'):\n", + " \"\"\"\n", + " For each field, find the peak (maximum) in the normalized season age.\n", + " Return statistics about where peaks occur.\n", + " \"\"\"\n", + " peaks = []\n", + " \n", + " for field in df['field'].unique():\n", + " field_data = df[df['field'] == field].dropna(subset=[index_col, season_age_col]).sort_values(season_age_col)\n", + " \n", + " if len(field_data) > 10: # Need enough data\n", + " max_idx = field_data[index_col].idxmax()\n", + " peak_season_age = field_data.loc[max_idx, season_age_col]\n", + " peak_value = field_data.loc[max_idx, index_col]\n", + " peaks.append({\n", + " 'field': field,\n", + " 'peak_season_age': peak_season_age,\n", + " 'peak_value': peak_value\n", + " })\n", + " \n", + " return pd.DataFrame(peaks)\n", + "\n", + "print(\"\\n\" + \"=\"*70)\n", + "print(\"PEAK TIMING ANALYSIS (at what season-age do indices peak?)\")\n", + "print(\"=\"*70)\n", + "\n", + "for project in ['esa', 'angata']:\n", + " print(f\"\\n{project.upper()}:\")\n", + " print(\"-\" * 70)\n", + " \n", + " for idx, label in [('mean_ci_green', 'CI_green'), ('mean_savi', 'SAVI')]:\n", + " peaks_df = find_peaks_per_field(data[project], idx, 'season_age')\n", + " \n", + " print(f\"\\n {label}:\")\n", + " print(f\" Fields analyzed: {len(peaks_df)}\")\n", + " print(f\" Peak season_age: mean={peaks_df['peak_season_age'].mean():.3f}, std={peaks_df['peak_season_age'].std():.3f}\")\n", + " print(f\" Peak occurs at: {peaks_df['peak_season_age'].mean()*100:.1f}% through season\")\n", + " print(f\" Peak value range: [{peaks_df['peak_value'].min():.4f}, {peaks_df['peak_value'].max():.4f}]\")" + ] + }, + { + "cell_type": "markdown", + "id": "1d6e97e4", + "metadata": {}, + "source": [ + "## Step 4: Amplitude Comparison - How strong are the signals?" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "75c72340", + "metadata": {}, + "outputs": [], + "source": [ + "print(\"\\n\" + \"=\"*70)\n", + "print(\"AMPLITUDE ANALYSIS (value ranges per project)\")\n", + "print(\"=\"*70)\n", + "\n", + "for project in ['esa', 'angata']:\n", + " print(f\"\\n{project.upper()}:\")\n", + " print(\"-\" * 70)\n", + " \n", + " df = data[project]\n", + " \n", + " for idx, label in [('mean_ci_green', 'CI_green'), ('mean_savi', 'SAVI')]:\n", + " valid = df[idx].dropna()\n", + " amplitude = valid.max() - valid.min()\n", + " \n", + " print(f\"\\n {label}:\")\n", + " print(f\" Mean: {valid.mean():.4f}\")\n", + " print(f\" Std: {valid.std():.4f}\")\n", + " print(f\" Min: {valid.min():.4f}\")\n", + " print(f\" Max: {valid.max():.4f}\")\n", + " print(f\" Amplitude (max-min): {amplitude:.4f}\")" + ] + }, + { + "cell_type": "markdown", + "id": "8c2e3fc8", + "metadata": {}, + "source": [ + "## Step 5: Phase Analysis - Lead/lag between CI_green and SAVI" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "867d39e9", + "metadata": {}, + "outputs": [], + "source": [ + "def compute_cross_correlation(df, idx1, idx2, max_lag=0.2):\n", + " \"\"\"\n", + " Compute cross-correlation between two indices to find phase lag.\n", + " \"\"\"\n", + " # Aggregate by season_age bins\n", + " binned = df.dropna(subset=[idx1, idx2, 'season_age']).copy()\n", + " binned['bin'] = pd.cut(binned['season_age'], bins=30)\n", + " \n", + " agg = binned.groupby('bin')[[idx1, idx2]].mean()\n", + " agg = agg.dropna()\n", + " \n", + " if len(agg) < 10:\n", + " return np.nan, np.nan\n", + " \n", + " # Normalize\n", + " x = (agg[idx1] - agg[idx1].mean()) / agg[idx1].std()\n", + " y = (agg[idx2] - agg[idx2].mean()) / agg[idx2].std()\n", + " \n", + " # Cross-correlation\n", + " corr = np.correlate(x.values, y.values, mode='full')\n", + " lags = np.arange(-len(x)+1, len(x))\n", + " \n", + " best_lag_idx = np.argmax(np.abs(corr))\n", + " best_lag = lags[best_lag_idx]\n", + " max_corr = corr[best_lag_idx] / (len(x) - 1)\n", + " \n", + " return best_lag, max_corr\n", + "\n", + "print(\"\\n\" + \"=\"*70)\n", + "print(\"PHASE ANALYSIS (CI_green vs SAVI lag)\")\n", + "print(\"=\"*70)\n", + "\n", + "for project in ['esa', 'angata']:\n", + " df = data[project]\n", + " lag, corr = compute_cross_correlation(df, 'mean_ci_green', 'mean_savi')\n", + " \n", + " print(f\"\\n{project.upper()}:\")\n", + " if not np.isnan(lag):\n", + " print(f\" CI_green → SAVI lag: {lag:.2f} bins (~{(lag/30):.3f} of season)\")\n", + " print(f\" Max cross-correlation: {corr:.3f}\")\n", + " print(f\" Interpretation: {'SAVI slightly leads CI_green' if lag < 0 else 'CI_green slightly leads SAVI' if lag > 0 else 'Synchronized'}\")\n", + " else:\n", + " print(f\" Insufficient data for lag analysis\")" + ] + }, + { + "cell_type": "markdown", + "id": "5277beb9", + "metadata": {}, + "source": [ + "## Step 6: Growth Curve Shape - Do they follow similar patterns?" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "24685c7d", + "metadata": {}, + "outputs": [], + "source": [ + "# Compare growth curves directly: normalized to 0-1 on both axes\n", + "fig, axes = plt.subplots(1, 2, figsize=(16, 6))\n", + "\n", + "for col, idx in enumerate(['mean_ci_green', 'mean_savi']):\n", + " ax = axes[col]\n", + " \n", + " for project, color in [('esa', 'blue'), ('angata', 'orange')]:\n", + " df = data[project]\n", + " binned = df.dropna(subset=[idx, 'season_age']).copy()\n", + " binned['bin'] = pd.cut(binned['season_age'], bins=25)\n", + " \n", + " # Mean trajectory per bin\n", + " traj = binned.groupby('bin')[idx].agg(['mean', 'std', 'count'])\n", + " traj = traj[traj['count'] > 5] # Only bins with N>5\n", + " bin_centers = [interval.mid for interval in traj.index]\n", + " \n", + " # Normalize to 0-1 for visual comparison\n", + " normalized_vals = (traj['mean'] - traj['mean'].min()) / (traj['mean'].max() - traj['mean'].min())\n", + " \n", + " ax.plot(bin_centers, normalized_vals.values, color=color, linewidth=2.5, \n", + " marker='o', markersize=6, label=f\"{project.upper()}\", zorder=10)\n", + " ax.fill_between(bin_centers, \n", + " (normalized_vals - traj['std']/traj['mean'].std()).values,\n", + " (normalized_vals + traj['std']/traj['mean'].std()).values,\n", + " alpha=0.2, color=color)\n", + " \n", + " ax.set_xlabel('Season Age (0-1)', fontsize=12)\n", + " ax.set_ylabel('Normalized Index (0-1)', fontsize=12)\n", + " ax.set_title(f\"Growth Curve Shape: {['CI_green', 'SAVI'][col]}\", fontsize=13, fontweight='bold')\n", + " ax.set_xlim(0, 1.1)\n", + " ax.set_ylim(-0.1, 1.1)\n", + " ax.grid(True, alpha=0.3)\n", + " ax.legend(fontsize=11, loc='best')\n", + "\n", + "plt.tight_layout()\n", + "plt.savefig('growth_curve_comparison.png', dpi=150, bbox_inches='tight')\n", + "plt.show()\n", + "print(\"\\nSaved: growth_curve_comparison.png\")" + ] + }, + { + "cell_type": "markdown", + "id": "27b5e61a", + "metadata": {}, + "source": [ + "## Summary: Does Season-Age Normalization Work?" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9252cf10", + "metadata": {}, + "outputs": [], + "source": [ + "print(\"\\n\" + \"=\"*70)\n", + "print(\"SC-162 FINDINGS: Season-Age Normalization Validation\")\n", + "print(\"=\"*70)\n", + "\n", + "print(\"\"\"\n", + "▶ KEY QUESTION: Do ESA and Angata follow similar growth patterns when normalized to season age?\n", + "\n", + "If YES (patterns align on 0-1 scale):\n", + " ✅ Season-age normalization is valid\n", + " ✅ Can train model on ESA, apply to Angata with confidence\n", + " ✅ Use proportional imminent window (7-8% of season length)\n", + "\n", + "If NO (patterns diverge):\n", + " ❌ Need project-specific models\n", + " ❌ Investigate why patterns differ (climate, variety, soil, etc.)\n", + " ❌ Consider multivariate approach (season_age + other features)\n", + "\n", + "OBSERVATIONS TO CHECK ABOVE:\n", + "1. Peak timing: Do both peak at ~same season-age?\n", + "2. Growth curve shape: Do normalized curves look similar?\n", + "3. Amplitude: Is the relative range comparable?\n", + "4. Phase: Is lag between CI_green and SAVI similar across projects?\n", + "\"\"\")\n", + "\n", + "print(\"\\nNEXT STEPS:\")\n", + "print(\"1. If patterns align → Proceed with feature engineering (Phase 5)\")\n", + "print(\"2. If patterns differ → Debug and adjust project configs\")\n", + "print(\"3. Test both fixed (28d) and proportional imminent windows on ESA\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "pytorch_gpu", + "language": "python", + "name": "python3" + }, + "language_info": { + "name": "python", + "version": "3.11.14" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/python_app/harvest_detection_experiments/angata_improvements/run_spectral_extraction.py b/python_app/harvest_detection_experiments/angata_improvements/run_spectral_extraction.py new file mode 100644 index 0000000..16a0d94 --- /dev/null +++ b/python_app/harvest_detection_experiments/angata_improvements/run_spectral_extraction.py @@ -0,0 +1,199 @@ +""" +run_spectral_extraction.py — Batch extract spectral indices from all TIFFs +========================================================================== +Part of SC-161: Extract spectral indices from 4-band TIFFs (BSI, NDVI, NDWI) + +Loops over all dated TIF files in a project's TIF folder, computes per-field +NDVI, BSI, NDWI and saves a single CSV with the same field/date structure +as ci_data_for_python.csv (can be joined on field + Date). + +Usage: + cd python_app/harvest_detection_experiments/angata_improvements + python run_spectral_extraction.py --project esa + python run_spectral_extraction.py --project esa --tif-folder merged_final_tif + python run_spectral_extraction.py --project esa --start 2024-01-01 --end 2024-12-31 +""" + +import argparse +import sys +import time +from datetime import datetime +from pathlib import Path + +import geopandas as gpd +import pandas as pd + +# Add parent dirs to path so we can import the module +sys.path.insert(0, str(Path(__file__).parent)) +from spectral_features import extract_field_spectral_indices + + +def find_tif_dates(tif_folder: Path) -> list[tuple[str, Path]]: + """ + Find all dated TIF files (YYYY-MM-DD.tif) in a folder. + Returns sorted list of (date_str, path) tuples. + """ + tifs = [] + for f in sorted(tif_folder.glob("*.tif")): + stem = f.stem # e.g. "2024-01-15" + try: + datetime.strptime(stem, "%Y-%m-%d") + tifs.append((stem, f)) + except ValueError: + continue # skip non-date-named TIFs + return tifs + + +def main(): + parser = argparse.ArgumentParser( + description="Extract spectral indices (NDVI, BSI, NDWI) from 4-band TIFFs per field" + ) + parser.add_argument("--project", required=True, help="Project name (e.g. esa, angata)") + parser.add_argument( + "--tif-folder", default="merged_final_tif", + help="TIF subfolder name within project storage (default: merged_final_tif)" + ) + parser.add_argument("--start", default=None, help="Start date filter (YYYY-MM-DD)") + parser.add_argument("--end", default=None, help="End date filter (YYYY-MM-DD)") + parser.add_argument( + "--output", default=None, + help="Output CSV path. Default: {project_storage}/Data/extracted_ci/ci_data_for_python/spectral_indices.csv" + ) + parser.add_argument("--field-id-col", default="field", help="Field ID column in GeoJSON") + args = parser.parse_args() + + # ========================================================================== + # RESOLVE PATHS + # ========================================================================== + # Navigate from this script to the repo root + # __file__ = angata_improvements/run_spectral_extraction.py + # parents: [0]=angata_improvements, [1]=harvest_detection_experiments, [2]=python_app, [3]=SmartCane_code + repo_root = Path(__file__).resolve().parents[3] + project_storage = repo_root / "laravel_app" / "storage" / "app" / args.project + + tif_folder = project_storage / args.tif_folder + geojson_path = project_storage / "pivot.geojson" + + if args.output: + output_csv = Path(args.output) + else: + output_dir = project_storage / "Data" / "extracted_ci" / "ci_data_for_python" + output_dir.mkdir(parents=True, exist_ok=True) + output_csv = output_dir / "spectral_indices.csv" + + # ========================================================================== + # VALIDATION + # ========================================================================== + print("=" * 80) + print(f"SPECTRAL INDEX EXTRACTION — {args.project.upper()}") + print("=" * 80) + + if not tif_folder.exists(): + print(f"ERROR: TIF folder not found: {tif_folder}") + sys.exit(1) + + if not geojson_path.exists(): + print(f"ERROR: Field boundaries not found: {geojson_path}") + sys.exit(1) + + # Load field boundaries once + print(f"\nLoading field boundaries: {geojson_path}") + gdf = gpd.read_file(geojson_path) + n_fields = len(gdf) + print(f" Fields: {n_fields}") + print(f" Field IDs: {', '.join(gdf[args.field_id_col].astype(str).tolist())}") + + # Find all dated TIFs + all_tifs = find_tif_dates(tif_folder) + print(f"\nTIF files found: {len(all_tifs)}") + + # Apply date filters + if args.start: + all_tifs = [(d, p) for d, p in all_tifs if d >= args.start] + if args.end: + all_tifs = [(d, p) for d, p in all_tifs if d <= args.end] + print(f"TIF files after filtering: {len(all_tifs)}") + + if not all_tifs: + print("ERROR: No TIF files to process.") + sys.exit(1) + + print(f"Date range: {all_tifs[0][0]} → {all_tifs[-1][0]}") + print(f"Output: {output_csv}") + print() + + # ========================================================================== + # EXTRACTION LOOP + # ========================================================================== + all_rows = [] + errors = 0 + t0 = time.time() + + for i, (date_str, tif_path) in enumerate(all_tifs): + try: + df = extract_field_spectral_indices( + tif_path, gdf, field_id_col=args.field_id_col + ) + df.insert(1, "Date", date_str) + all_rows.append(df) + except Exception as e: + errors += 1 + print(f" [{i+1}/{len(all_tifs)}] ERROR {date_str}: {e}") + continue + + # Progress every 100 files + if (i + 1) % 100 == 0 or (i + 1) == len(all_tifs): + elapsed = time.time() - t0 + rate = (i + 1) / elapsed + remaining = (len(all_tifs) - i - 1) / rate if rate > 0 else 0 + print( + f" [{i+1}/{len(all_tifs)}] {date_str} " + f"({rate:.1f} files/s, ~{remaining:.0f}s remaining)" + ) + + elapsed_total = time.time() - t0 + + # ========================================================================== + # COMBINE & SAVE + # ========================================================================== + if not all_rows: + print("ERROR: No data extracted.") + sys.exit(1) + + result = pd.concat(all_rows, ignore_index=True) + + # Sort by field, date for consistency with CI data + result = result.sort_values(["field", "Date"]).reset_index(drop=True) + + # Save + output_csv.parent.mkdir(parents=True, exist_ok=True) + result.to_csv(output_csv, index=False) + + # ========================================================================== + # SUMMARY + # ========================================================================== + print() + print("=" * 80) + print("EXTRACTION COMPLETE") + print("=" * 80) + print(f" Dates processed: {len(all_tifs)}") + print(f" Errors: {errors}") + print(f" Total rows: {len(result)}") + print(f" Fields: {result['field'].nunique()}") + print(f" Date range: {result['Date'].min()} → {result['Date'].max()}") + print(f" Time: {elapsed_total:.1f}s ({len(all_tifs)/elapsed_total:.1f} files/s)") + print() + + # Quick stats + for col in ["mean_ndvi", "mean_bsi", "mean_ndwi"]: + vals = result[col].dropna() + if len(vals) > 0: + print(f" {col}: [{vals.min():.4f}, {vals.max():.4f}] (mean={vals.mean():.4f})") + + nan_pct = result[["mean_ndvi", "mean_bsi", "mean_ndwi"]].isna().mean() * 100 + print(f"\n NaN rates: NDVI={nan_pct['mean_ndvi']:.1f}%, BSI={nan_pct['mean_bsi']:.1f}%, NDWI={nan_pct['mean_ndwi']:.1f}%") + print(f"\n Output: {output_csv}") + + +if __name__ == "__main__": + main() diff --git a/python_app/harvest_detection_experiments/angata_improvements/spectral_features.py b/python_app/harvest_detection_experiments/angata_improvements/spectral_features.py new file mode 100644 index 0000000..9d287b7 --- /dev/null +++ b/python_app/harvest_detection_experiments/angata_improvements/spectral_features.py @@ -0,0 +1,241 @@ +""" +spectral_features.py — Extract per-field spectral indices from 4-band GeoTIFFs +================================================================================ +Part of SC-161: Extract spectral indices from 4-band TIFFs (BSI, NDVI, NDWI) + +Band order in TIFFs: [R, G, B, NIR] (uint16) + +Spectral Indices: + NDVI = (NIR - R) / (NIR + R) — vegetation density + BSI = ((R + NIR) - (G + B)) / ((R + NIR) + (G + B)) — bare soil detection + NDWI = (G - NIR) / (G + NIR) — moisture content + CI_green = NIR / Green - 1 — chlorophyll index (= SmartCane CI) + CI_red = NIR / Red - 1 — chlorophyll index (red-based) + GNDVI = (NIR - G) / (NIR + G) — green NDVI (less saturation) + SAVI = 1.5 * (NIR - R) / (NIR + R + 0.5) — soil-adjusted vegetation + EVI2 = 2.5 * (NIR - R) / (NIR + 2.4*R + 1) — enhanced vegetation (2-band) +""" + +import numpy as np +import pandas as pd +import geopandas as gpd +import rasterio +from rasterio.mask import mask as rio_mask +from pathlib import Path +from typing import Optional + + +# ============================================================================= +# SPECTRAL INDEX CALCULATIONS +# ============================================================================= + +def _safe_normalized_diff(a: np.ndarray, b: np.ndarray) -> np.ndarray: + """Compute (a - b) / (a + b) with safe division, masking invalid pixels.""" + with np.errstate(divide="ignore", invalid="ignore"): + result = (a - b) / (a + b) + result[~np.isfinite(result)] = np.nan + return result + + +def compute_ndvi(red: np.ndarray, nir: np.ndarray) -> np.ndarray: + """NDVI = (NIR - R) / (NIR + R)""" + return _safe_normalized_diff(nir, red) + + +def compute_bsi(red: np.ndarray, green: np.ndarray, blue: np.ndarray, nir: np.ndarray) -> np.ndarray: + """BSI = ((R + NIR) - (G + B)) / ((R + NIR) + (G + B))""" + numerator = (red + nir) - (green + blue) + denominator = (red + nir) + (green + blue) + with np.errstate(divide="ignore", invalid="ignore"): + result = numerator / denominator + result[~np.isfinite(result)] = np.nan + return result + + +def compute_ndwi(green: np.ndarray, nir: np.ndarray) -> np.ndarray: + """NDWI = (G - NIR) / (G + NIR)""" + return _safe_normalized_diff(green, nir) + + +def compute_ci_green(green: np.ndarray, nir: np.ndarray) -> np.ndarray: + """CI_green = NIR / Green - 1 (= SmartCane pipeline CI)""" + with np.errstate(divide="ignore", invalid="ignore"): + result = nir / green - 1.0 + result[~np.isfinite(result)] = np.nan + return result + + +def compute_ci_red(red: np.ndarray, nir: np.ndarray) -> np.ndarray: + """CI_red = NIR / Red - 1 (more sensitive at low chlorophyll)""" + with np.errstate(divide="ignore", invalid="ignore"): + result = nir / red - 1.0 + result[~np.isfinite(result)] = np.nan + return result + + +def compute_gndvi(green: np.ndarray, nir: np.ndarray) -> np.ndarray: + """GNDVI = (NIR - G) / (NIR + G)""" + return _safe_normalized_diff(nir, green) + + +def compute_savi(red: np.ndarray, nir: np.ndarray, L: float = 0.5) -> np.ndarray: + """SAVI = (1 + L) * (NIR - R) / (NIR + R + L), L=0.5 typical""" + with np.errstate(divide="ignore", invalid="ignore"): + result = (1.0 + L) * (nir - red) / (nir + red + L) + result[~np.isfinite(result)] = np.nan + return result + + +def compute_evi2(red: np.ndarray, nir: np.ndarray) -> np.ndarray: + """EVI2 = 2.5 * (NIR - R) / (NIR + 2.4*R + 1)""" + with np.errstate(divide="ignore", invalid="ignore"): + result = 2.5 * (nir - red) / (nir + 2.4 * red + 1.0) + result[~np.isfinite(result)] = np.nan + return result + + +# ============================================================================= +# PER-FIELD EXTRACTION +# ============================================================================= + +def extract_field_spectral_indices( + tif_path: str | Path, + field_boundaries_gdf: gpd.GeoDataFrame, + field_id_col: str = "field", + nodata_threshold: float = 0.0, + min_valid_fraction: float = 0.1, +) -> pd.DataFrame: + """ + Extract mean/std spectral indices per field from a single 4-band TIF. + + Parameters + ---------- + tif_path : path to 4-band GeoTIFF (R, G, B, NIR — uint16) + field_boundaries_gdf : GeoDataFrame with field polygons + field_id_col : column name for field identifier + nodata_threshold : pixels with ALL bands <= this value are treated as nodata + min_valid_fraction : minimum fraction of valid pixels required (else NaN) + + Returns + ------- + DataFrame with columns: + field, mean_ndvi, mean_bsi, mean_ndwi, mean_ci_green, mean_ci_red, + mean_gndvi, mean_savi, mean_evi2, + std_ndvi, std_bsi, std_ndwi, std_ci_green, std_ci_red, + std_gndvi, std_savi, std_evi2, + valid_pixel_count, total_pixel_count + """ + tif_path = Path(tif_path) + rows = [] + + with rasterio.open(tif_path) as src: + tif_crs = src.crs + + # Reproject field boundaries to match TIF CRS if needed + gdf = field_boundaries_gdf.copy() + if gdf.crs is not None and not gdf.crs.equals(tif_crs): + gdf = gdf.to_crs(tif_crs) + elif gdf.crs is None: + # Assume EPSG:4326 if not set + gdf = gdf.set_crs("EPSG:4326").to_crs(tif_crs) + + for _, row_geom in gdf.iterrows(): + field_id = row_geom[field_id_col] + geom = [row_geom.geometry.__geo_interface__] + + try: + out_image, _ = rio_mask(src, geom, crop=True, all_touched=True, nodata=0) + except Exception: + # Field doesn't overlap with this TIF + rows.append(_make_nan_row(field_id)) + continue + + # out_image shape: (bands, height, width) — convert to float + bands = out_image.astype(np.float64) + + red = bands[0] + green = bands[1] + blue = bands[2] + nir = bands[3] + + # Mask: valid pixels have at least one band > nodata_threshold + valid_mask = np.any(bands > nodata_threshold, axis=0) + total_pixels = valid_mask.size + valid_pixels = int(valid_mask.sum()) + + if valid_pixels < max(1, int(total_pixels * min_valid_fraction)): + rows.append(_make_nan_row(field_id, valid_pixels, total_pixels)) + continue + + # Compute indices (only on valid pixels) + ndvi = compute_ndvi(red, nir) + bsi = compute_bsi(red, green, blue, nir) + ndwi = compute_ndwi(green, nir) + ci_green = compute_ci_green(green, nir) + ci_red = compute_ci_red(red, nir) + gndvi = compute_gndvi(green, nir) + savi = compute_savi(red, nir) + evi2 = compute_evi2(red, nir) + + # Apply mask + ndvi_valid = ndvi[valid_mask] + bsi_valid = bsi[valid_mask] + ndwi_valid = ndwi[valid_mask] + ci_green_valid = ci_green[valid_mask] + ci_red_valid = ci_red[valid_mask] + gndvi_valid = gndvi[valid_mask] + savi_valid = savi[valid_mask] + evi2_valid = evi2[valid_mask] + + rows.append({ + "field": field_id, + "mean_ndvi": np.nanmean(ndvi_valid), + "mean_bsi": np.nanmean(bsi_valid), + "mean_ndwi": np.nanmean(ndwi_valid), + "mean_ci_green": np.nanmean(ci_green_valid), + "mean_ci_red": np.nanmean(ci_red_valid), + "mean_gndvi": np.nanmean(gndvi_valid), + "mean_savi": np.nanmean(savi_valid), + "mean_evi2": np.nanmean(evi2_valid), + "std_ndvi": np.nanstd(ndvi_valid), + "std_bsi": np.nanstd(bsi_valid), + "std_ndwi": np.nanstd(ndwi_valid), + "std_ci_green": np.nanstd(ci_green_valid), + "std_ci_red": np.nanstd(ci_red_valid), + "std_gndvi": np.nanstd(gndvi_valid), + "std_savi": np.nanstd(savi_valid), + "std_evi2": np.nanstd(evi2_valid), + "valid_pixel_count": valid_pixels, + "total_pixel_count": total_pixels, + }) + + return pd.DataFrame(rows) + + +def _make_nan_row( + field_id: str, + valid_pixels: int = 0, + total_pixels: int = 0, +) -> dict: + """Return a row with NaN values for a field that couldn't be processed.""" + return { + "field": field_id, + "mean_ndvi": np.nan, + "mean_bsi": np.nan, + "mean_ndwi": np.nan, + "mean_ci_green": np.nan, + "mean_ci_red": np.nan, + "mean_gndvi": np.nan, + "mean_savi": np.nan, + "mean_evi2": np.nan, + "std_ndvi": np.nan, + "std_bsi": np.nan, + "std_ndwi": np.nan, + "std_ci_green": np.nan, + "std_ci_red": np.nan, + "std_gndvi": np.nan, + "std_savi": np.nan, + "std_evi2": np.nan, + "valid_pixel_count": valid_pixels, + "total_pixel_count": total_pixels, + } diff --git a/r_app/parameters_project.R b/r_app/parameters_project.R index 6e657a5..6c3565f 100644 --- a/r_app/parameters_project.R +++ b/r_app/parameters_project.R @@ -146,7 +146,6 @@ setup_project_directories <- function(project_dir, data_source = "merged_tif") { # TIER 2: PER-FIELD TIFFS (Script 10 output) field_tiles_dir <- here(laravel_storage_dir, "field_tiles") field_tiles_ci_dir <- here(laravel_storage_dir, "field_tiles_CI") - daily_tiles_split_dir <- here(laravel_storage_dir, "daily_tiles_split") # SUPPORT TIER: DATA DIRECTORY (define early for use in later tiers) data_dir <- here(laravel_storage_dir, "Data") @@ -163,7 +162,7 @@ setup_project_directories <- function(project_dir, data_source = "merged_tif") { # TIER 5: MOSAICS (Script 40 output) weekly_mosaic_dir <- here(laravel_storage_dir, "weekly_mosaic") - weekly_tile_max_dir <- here(laravel_storage_dir, "weekly_tile_max") + field_tiles_ci_dir <- here(weekly_mosaic_dir, "field_tiles_CI") # TIER 6: KPI & REPORTING (Scripts 80/90/91 output) reports_dir <- here(laravel_storage_dir, "reports") @@ -178,10 +177,10 @@ setup_project_directories <- function(project_dir, data_source = "merged_tif") { # Create all directories all_dirs <- c( - merged_tif_folder, field_tiles_dir, field_tiles_ci_dir, daily_tiles_split_dir, + merged_tif_folder, field_tiles_dir, field_tiles_ci_dir, extracted_ci_base_dir, daily_ci_vals_dir, cumulative_ci_vals_dir, ci_for_python_dir, growth_model_interpolated_dir, - weekly_mosaic_dir, weekly_tile_max_dir, + weekly_mosaic_dir, field_tiles_ci_dir, reports_dir, kpi_reports_dir, kpi_field_stats_dir, kpi_field_analysis_dir, data_dir, vrt_dir, harvest_dir, log_dir ) @@ -204,7 +203,6 @@ setup_project_directories <- function(project_dir, data_source = "merged_tif") { # TIER 2: Per-field TIFFs field_tiles_dir = field_tiles_dir, field_tiles_ci_dir = field_tiles_ci_dir, - daily_tiles_split_dir = daily_tiles_split_dir, # TIER 3: CI extraction extracted_ci_base_dir = extracted_ci_base_dir, @@ -217,7 +215,7 @@ setup_project_directories <- function(project_dir, data_source = "merged_tif") { # TIER 5: Mosaics weekly_mosaic_dir = weekly_mosaic_dir, - weekly_tile_max_dir = weekly_tile_max_dir, + field_tiles_ci_dir = field_tiles_ci_dir, # TIER 6: KPI & reporting reports_dir = reports_dir, @@ -508,171 +506,12 @@ setup_logging <- function(log_dir) { )) } -# ============================================================================== -# SECTION 6B: DATA SOURCE DETECTION -# ============================================================================== - -#' Detect data source for project -#' -#' Returns the data source directory (always "merged_tif" for consistency) -#' -#' @param project_dir Character. Project name -#' @return Character. "merged_tif" -detect_data_source <- function(project_dir) { - return("merged_tif") -} - -#' Detect tile structure from merged TIF directory -#' -#' Checks if tiles exist by looking for: -#' 1. tiling_config.json metadata file (most reliable) -#' 2. Grid subdirectories (5x5, 10x10, etc.) -#' 3. Tile-named files (*_XX.tif pattern) -#' -#' @param merged_final_tif_dir Character. Path to merged TIF directory -#' @param daily_tiles_split_dir Character. Optional path to tiles directory -#' @return List with has_tiles (logical), detected_tiles, total_files, source, grid_size -detect_tile_structure_from_merged_final <- function(merged_final_tif_dir, daily_tiles_split_dir = NULL) { - # PRIORITY 1: Check for tiling_config.json metadata file from script 10 - if (!is.null(daily_tiles_split_dir) && dir.exists(daily_tiles_split_dir)) { - config_files <- list.files(daily_tiles_split_dir, - pattern = "tiling_config\\.json$", - recursive = TRUE, - full.names = TRUE) - - if (length(config_files) > 0) { - config_file <- config_files[which.max(file.info(config_files)$mtime)] - - tryCatch({ - config_json <- jsonlite::read_json(config_file) - return(list( - has_tiles = config_json$has_tiles %||% TRUE, - detected_tiles = character(), - total_files = 0, - source = "tiling_config.json", - grid_size = config_json$grid_size %||% "unknown" - )) - }, error = function(e) { - warning("Error reading tiling_config.json: ", e$message) - }) - } - } - - # PRIORITY 2: File-based detection (fallback) - if (!dir.exists(merged_final_tif_dir)) { - return(list( - has_tiles = FALSE, - detected_tiles = character(), - total_files = 0, - source = "directory_not_found" - )) - } - - # Check for grid-size subdirectories (5x5, 10x10, etc.) - grid_subfolders <- list.dirs(merged_final_tif_dir, full.names = FALSE, recursive = FALSE) - grid_patterns <- grep("^\\d+x\\d+$", grid_subfolders, value = TRUE) - - if (length(grid_patterns) > 0) { - grid_size <- grid_patterns[1] - grid_dir <- file.path(merged_final_tif_dir, grid_size) - sample_tiles <- list.files(grid_dir, pattern = "\\.tif$", recursive = TRUE)[1:3] - - return(list( - has_tiles = TRUE, - detected_tiles = sample_tiles, - total_files = length(sample_tiles), - source = "grid_subdirectory_detection", - grid_size = grid_size, - grid_path = grid_dir - )) - } - - # Check for tile-named files (*_XX.tif pattern) - tif_files <- list.files(merged_final_tif_dir, pattern = "\\.tif$", full.names = FALSE) - - if (length(tif_files) == 0) { - return(list( - has_tiles = FALSE, - detected_tiles = character(), - total_files = 0, - source = "no_files_found" - )) - } - - tile_pattern <- "_(\\d{2})\\.tif$" - tile_files <- tif_files[grepl(tile_pattern, tif_files)] - has_tiles <- length(tile_files) > 0 - - return(list( - has_tiles = has_tiles, - detected_tiles = tile_files, - total_files = length(tif_files), - source = "file_pattern_detection" - )) -} - # ============================================================================== # SECTION 7: MOSAIC & KPI VERIFICATION HELPERS # ============================================================================== # Centralized helper functions for run_full_pipeline.R to avoid hardcoding paths -#' Detect mosaic mode from project structure -#' -#' Determines if project uses "tiled" (legacy) or "single-file" (per-field) mosaics -#' -#' @param project_dir Character. Project name -#' @return Character. "tiled" or "single-file" -detect_mosaic_mode <- function(project_dir) { - # Per-field architecture is standard - always return "single-file" - # unless weekly_tile_max directory exists with content - mosaic_tiled_dir <- file.path("laravel_app", "storage", "app", project_dir, "weekly_tile_max") - - if (dir.exists(mosaic_tiled_dir) && length(list.files(mosaic_tiled_dir)) > 0) { - return("tiled") - } - return("single-file") -} -#' Detect grid size from tile directory structure -#' -#' For per-field architecture, returns "unknown" (grid-based tiling is legacy) -#' -#' @param project_dir Character. Project name -#' @return Character. Grid size ("unknown" for per-field) -detect_grid_size <- function(project_dir) { - # Per-field architecture doesn't use grid-based organization - return("unknown") -} - -#' Get project storage path -#' -#' @param project_dir Character. Project name -#' @param subdir Character. Optional subdirectory (default NULL) -#' @return Character. Full path -get_project_storage_path <- function(project_dir, subdir = NULL) { - path <- file.path("laravel_app", "storage", "app", project_dir) - if (!is.null(subdir)) { - path <- file.path(path, subdir) - } - return(path) -} - -#' Get mosaic directory -#' -#' @param project_dir Character. Project name -#' @param mosaic_mode Character. "tiled" or "single-file" -#' @return Character. Full path to mosaic directory -get_mosaic_dir <- function(project_dir, mosaic_mode = "auto") { - if (mosaic_mode == "auto") { - mosaic_mode <- detect_mosaic_mode(project_dir) - } - - if (mosaic_mode == "tiled") { - get_project_storage_path(project_dir, "weekly_tile_max") - } else { - get_project_storage_path(project_dir, "weekly_mosaic") - } -} #' Get KPI directory based on client type #' @@ -713,20 +552,6 @@ check_harvest_output_exists <- function(project_dir, week_num, year_num) { file.exists(path) } -#' Get mosaic verification directory -#' -#' @param project_dir Character. Project name -#' @param mosaic_mode Character. "tiled" or "single-file" -#' @return Character. Full path to mosaic directory -get_mosaic_verification_dir <- function(project_dir, mosaic_mode) { - base <- file.path("laravel_app", "storage", "app", project_dir) - - if (mosaic_mode == "tiled") { - file.path(base, "weekly_tile_max") - } else { - file.path(base, "weekly_mosaic") - } -} #' Check if mosaic files exist for a specific week #' From 8b02f90f7b1ac37d444ac16491778fda4f9c2c8a Mon Sep 17 00:00:00 2001 From: Nik Verweel Date: Tue, 17 Feb 2026 16:20:22 +0100 Subject: [PATCH 3/5] Create UID generator webapp --- webapps/index.html | 17 ++ webapps/res/id.png | Bin 0 -> 4590 bytes webapps/uid_generator/app.js | 310 +++++++++++++++++++++++++++++++ webapps/uid_generator/index.html | 260 ++++++++++++++++++++++++++ 4 files changed, 587 insertions(+) create mode 100644 webapps/res/id.png create mode 100644 webapps/uid_generator/app.js create mode 100644 webapps/uid_generator/index.html diff --git a/webapps/index.html b/webapps/index.html index a13ec78..d74f831 100644 --- a/webapps/index.html +++ b/webapps/index.html @@ -254,6 +254,7 @@

ODK Data Viewer

Upload ODK CSVs to visualise and turn field corner points into polygons

    +
  • Upload ODK CSVs
  • View ODK data
  • Turn corner points into polygons
  • Download created polygons
  • @@ -261,6 +262,22 @@ Open Viewer + + +
    +
    🆔
    +
    +

    Unique ID generator

    +

    Generate unique IDs for fields from different clients

    +
      +
    • Upload GeoJSONs
    • +
    • Identify client
    • +
    • Generate unique IDs following SmartCane structure
    • +
    • Download GeoJSON with new IDs
    • +
    + Open Editor +
    +