#!/usr/bin/env python3
"""
Script 23: Convert Harvest Format from Production to Standard
==============================================================

Converts harvest_production_export.xlsx (output from script 22) to the standard
harvest.xlsx format used by R scripts 24+.

INPUT:
  - harvest_production_export.xlsx (from script 22)
    Columns: field, season (numeric), season_start_date, season_end_date,
             phase2_harvest_date
    Contains detected harvests only

OUTPUT:
  - harvest.xlsx (standard format for R pipeline)
    Columns: field, sub_field, year, season, season_start, season_end, age,
             sub_area, tonnage_ha

LOGIC:
  1. For each field, take its distinct harvest detections in chronological
     order (rows without a phase2_harvest_date are skipped with a warning).
  2. Create one row per completed season (a season that ends with a harvest).
  3. season_start = first date found in the CI data (fallback: 2024-09-25 when
     the CI CSV is missing or unreadable) for the first season, then the
     previous harvest + 1 day.
  4. season_end = phase2_harvest_date (refined harvest date from script 22).
  5. year = extracted from the season_start date.
  6. season = "Data{year} : {field}" format.
  7. sub_field = field (same as field).
  8. age, sub_area, tonnage_ha = left empty (filled by R scripts later or other
     data sources).
  9. If the CI data extends more than 30 days past a field's last harvest, one
     extra row with an empty season_end is added for the still-open season.

Date Format: YYYY-MM-DD
"""

import os
import sys
import traceback
from datetime import datetime, timedelta
from pathlib import Path

import numpy as np
import pandas as pd

# Storage root, resolved relative to the current working directory (same
# convention as script 22) -- presumably the script is launched from python_app/.
STORAGE_ROOT = Path("../laravel_app/storage/app")

# Season start assumed for the first season when the CI data cannot be read.
FALLBACK_CI_START = datetime(2024, 9, 25)

# All dates written to harvest.xlsx use this format.
DATE_FORMAT = "%Y-%m-%d"

# An open (incomplete) season is only emitted when the CI data extends more
# than this many days past the last detected harvest.
INCOMPLETE_SEASON_MIN_DAYS = 30

# Column order of the standard harvest.xlsx format.
OUTPUT_COLUMNS = [
    "field",
    "sub_field",
    "year",
    "season",
    "season_start",
    "season_end",
    "age",
    "sub_area",
    "tonnage_ha",
]


def get_ci_date_range(project_dir):
    """
    Get the date range of CI data to establish season_start for first season.

    The first column of ci_data_for_python.csv is assumed to hold the dates.
    If the CSV is missing, unreadable, or contains no valid date, the fallback
    range (2024-09-25, now) is returned instead of raising.

    Parameters:
    -----------
    project_dir : str
        Project name (sub-directory of the storage root).

    Returns: (min_date, max_date) as datetime objects (pandas Timestamps when
    read from the CSV)
    """
    ci_csv_path = (
        STORAGE_ROOT
        / project_dir
        / "Data"
        / "extracted_ci"
        / "ci_data_for_python"
        / "ci_data_for_python.csv"
    )

    if not ci_csv_path.exists():
        # Fallback: assume data starts 2024-09-25 (typical for projects)
        print(f"[WARNING] CI data CSV not found at {ci_csv_path}, assuming CI starts 2024-09-25")
        return FALLBACK_CI_START, datetime.now()

    try:
        # Only the first column is needed; select it by position in one read.
        dates = pd.to_datetime(pd.read_csv(ci_csv_path, usecols=[0]).iloc[:, 0])
        min_date = dates.min()
        max_date = dates.max()

        # An empty or all-blank date column yields NaT, which would crash the
        # caller on strftime(); treat it like any other unreadable file.
        if pd.isna(min_date) or pd.isna(max_date):
            raise ValueError("no valid dates found in first column")
    except Exception as e:
        # Best effort by design: any read/parse problem degrades to the fallback.
        print(f"[WARNING] Error reading CI date range: {e}, using fallback dates")
        return FALLBACK_CI_START, datetime.now()

    print(f"[INFO] CI data date range: {min_date.date()} to {max_date.date()}")
    return min_date, max_date


def _season_row(field_id, season_start, season_end):
    """Build one output row; season_start/season_end are YYYY-MM-DD strings."""
    year = pd.to_datetime(season_start).year
    return {
        "field": field_id,
        "sub_field": field_id,  # Same as field
        "year": year,
        "season": f"Data{year} : {field_id}",
        "season_start": season_start,
        "season_end": season_end,  # "" while the season is still active
        "age": "",  # Empty - will be calculated in R
        "sub_area": "",  # Empty - will be populated from other data
        "tonnage_ha": "",  # Empty - will be populated from other data
    }


def build_season_rows(field_id, harvest_dates, first_season_start, ci_max_date):
    """
    Build the harvest.xlsx rows for a single field.

    Parameters:
    -----------
    field_id : str
        Field identifier (also used as sub_field).
    harvest_dates : list of pandas.Timestamp
        Distinct, non-null harvest dates in ascending order.
    first_season_start : str
        Start of the first season, formatted YYYY-MM-DD.
    ci_max_date : datetime
        Last date covered by the CI data.

    Returns: list of row dicts -- one per harvest (completed season), plus one
    row with an empty season_end when the CI data extends more than
    INCOMPLETE_SEASON_MIN_DAYS days past the last harvest.
    """
    rows = []
    season_start = first_season_start

    for harvest in harvest_dates:
        rows.append(_season_row(field_id, season_start, harvest.strftime(DATE_FORMAT)))
        # Next season starts the day after this harvest
        season_start = (harvest + timedelta(days=1)).strftime(DATE_FORMAT)

    if harvest_dates:
        days_after_last = (ci_max_date - harvest_dates[-1]).days
        if days_after_last > INCOMPLETE_SEASON_MIN_DAYS:
            rows.append(_season_row(field_id, season_start, ""))
            print(f"[INFO] Added incomplete season starting {season_start}")

    return rows


def convert_harvest_format(project_dir="angata"):
    """
    Convert harvest_production_export.xlsx to standard harvest.xlsx format.

    Parameters:
    -----------
    project_dir : str
        Project name (angata, esa, chemba, etc.)

    Returns: True when harvest.xlsx was written, False when the source file or
    a required column is missing or the conversion failed (the error is
    printed, never raised).
    """

    print(f"\n{'='*80}")
    print("Script 23: Convert Harvest Format")
    print(f"Project: {project_dir}")
    print(f"{'='*80}\n")

    # Get paths (same as script 22)
    base_storage = STORAGE_ROOT / project_dir / "Data"
    source_file = base_storage / "HarvestData" / "harvest_production_export.xlsx"
    output_file = base_storage / "harvest.xlsx"

    # Check source file exists
    if not source_file.exists():
        print(f"[ERROR] Source file not found: {source_file}")
        print("[ERROR] Please run script 22 first to generate harvest_production_export.xlsx")
        return False

    print(f"[INFO] Reading source file: {source_file}")

    try:
        # Read production format
        df_source = pd.read_excel(source_file)
        print(f"[INFO] Loaded {len(df_source)} harvest detections")
        print(f"[INFO] Columns: {list(df_source.columns)}")

        # Validate required columns
        required_cols = ["field", "season_start_date", "season_end_date", "phase2_harvest_date"]
        missing = [c for c in required_cols if c not in df_source.columns]
        if missing:
            print(f"[ERROR] Missing columns: {missing}")
            return False

        # Get CI date range for establishing first season start
        ci_min_date, ci_max_date = get_ci_date_range(project_dir)
        first_season_start = ci_min_date.strftime(DATE_FORMAT)

        # Convert to datetime for processing
        df_source["phase2_harvest_date"] = pd.to_datetime(df_source["phase2_harvest_date"])
        df_source["field"] = df_source["field"].astype(str)

        # A row without a harvest date cannot close a season; skip it rather
        # than letting NaT abort the whole conversion.
        undated = df_source["phase2_harvest_date"].isna()
        if undated.any():
            print(f"[WARNING] Skipping {int(undated.sum())} row(s) without phase2_harvest_date")
            df_source = df_source[~undated]

        # Sort by field and harvest date
        df_source = df_source.sort_values(["field", "phase2_harvest_date"]).reset_index(drop=True)

        # Build output rows, field by field
        output_rows = []
        for field_id, group_df in df_source.groupby("field"):
            # Distinct calendar days, already in chronological order. Duplicate
            # detections of one day would otherwise create a season that
            # starts after it ends.
            harvest_dates = (
                group_df["phase2_harvest_date"].dt.normalize().drop_duplicates().tolist()
            )

            print(f"[INFO] Field {field_id}: {len(harvest_dates)} harvest detection(s)")

            output_rows.extend(
                build_season_rows(field_id, harvest_dates, first_season_start, ci_max_date)
            )

        # Passing columns= fixes the column order and keeps the header even
        # when there are no rows at all.
        df_output = pd.DataFrame(output_rows, columns=OUTPUT_COLUMNS)
        if df_output.empty:
            print("[WARNING] No harvest detections found; writing header-only output")

        # Write to Excel
        print(f"\n[INFO] Writing output file: {output_file}")
        df_output.to_excel(output_file, index=False, sheet_name="Harvest")

        # Print summary
        print("\n[SUCCESS] Conversion complete!")
        print(f"[INFO] Output rows: {len(df_output)}")
        print(f"[INFO] Unique fields: {df_output['field'].nunique()}")
        print("\n[INFO] Sample output:")
        print(df_output.head(10).to_string(index=False))

        return True

    except Exception as e:
        # Top-level boundary of the script: report and signal failure.
        print(f"[ERROR] Conversion failed: {e}")
        traceback.print_exc()
        return False


if __name__ == "__main__":
    # Get project from command line or use default
    project = sys.argv[1] if len(sys.argv) > 1 else "angata"

    success = convert_harvest_format(project)
    sys.exit(0 if success else 1)