341 lines
12 KiB
R
341 lines
12 KiB
R
# Analyze CI values around actual harvest dates to tune detection parameters
|
|
suppressPackageStartupMessages({
|
|
library(readxl)
|
|
library(dplyr)
|
|
library(tidyr)
|
|
library(lubridate)
|
|
library(terra)
|
|
library(sf)
|
|
library(here)
|
|
library(ggplot2)
|
|
})
|
|
|
|
# Set project directory
|
|
project_dir <- "esa"
|
|
assign("project_dir", project_dir, envir = .GlobalEnv)
|
|
|
|
# Source required files
|
|
cat("Loading project configuration...\n")
|
|
source(here("r_app", "parameters_project.R"))
|
|
|
|
# Read pre-extracted DAILY CI data from script 02
|
|
ci_rds_file <- here("laravel_app/storage/app", project_dir, "Data/extracted_ci/cumulative_vals/All_pivots_Cumulative_CI_quadrant_year_v2.rds")
|
|
|
|
cat("Reading pre-extracted daily CI data from:\n")
|
|
cat(" ", ci_rds_file, "\n")
|
|
|
|
if (!file.exists(ci_rds_file)) {
|
|
stop("CI data file not found: ", ci_rds_file)
|
|
}
|
|
|
|
ci_data_raw <- readRDS(ci_rds_file) %>% ungroup()
|
|
|
|
cat("Loaded CI data with", nrow(ci_data_raw), "rows\n\n")
|
|
|
|
# Transform to daily time series format
|
|
cat("Converting to daily time series format...\n")
|
|
time_series <- ci_data_raw %>%
|
|
mutate(
|
|
date = as.Date(Date),
|
|
week = isoweek(date),
|
|
year = isoyear(date)
|
|
) %>%
|
|
select(
|
|
field_id = field,
|
|
date,
|
|
week,
|
|
year,
|
|
mean_ci = FitData
|
|
) %>%
|
|
filter(!is.na(mean_ci), !is.na(date), !is.na(field_id)) %>%
|
|
arrange(field_id, date)
|
|
|
|
cat("Daily time series ready:", nrow(time_series), "observations\n")
|
|
cat("Fields:", n_distinct(time_series$field_id), "\n")
|
|
cat("Date range:", as.character(min(time_series$date)), "to", as.character(max(time_series$date)), "\n\n")
|
|
|
|
# Read actual harvest data
|
|
harvest_actual_all <- read_excel('laravel_app/storage/app/esa/Data/harvest.xlsx') %>%
|
|
mutate(
|
|
season_start = as.Date(season_start),
|
|
season_end = as.Date(season_end)
|
|
) %>% select(-age, -sub_area, -tonnage_ha, -sub_field)
|
|
|
|
fields_with_data <- unique(field_boundaries_sf$field)
|
|
|
|
harvest_actual <- harvest_actual_all %>%
|
|
filter(field %in% fields_with_data) %>%
|
|
filter(!is.na(season_end)) %>%
|
|
mutate(
|
|
actual_harvest_week = isoweek(season_end),
|
|
actual_harvest_year = isoyear(season_end)
|
|
)
|
|
|
|
cat("Analyzing CI values around actual harvest dates...\n\n")
|
|
|
|
# For each actual harvest, find the NEAREST date in time series (within ±3 days)
|
|
harvest_analysis <- harvest_actual %>%
|
|
rowwise() %>%
|
|
do({
|
|
h_field <- .$field
|
|
h_date <- .$season_end
|
|
h_week <- .$actual_harvest_week
|
|
h_year <- .$actual_harvest_year
|
|
|
|
# Find nearest date in time series for this field
|
|
nearest_match <- time_series %>%
|
|
filter(field_id == h_field) %>%
|
|
mutate(
|
|
date_diff = abs(as.numeric(date - h_date))
|
|
) %>%
|
|
filter(date_diff <= 3) %>% # Within 3 days
|
|
arrange(date_diff) %>%
|
|
head(1)
|
|
|
|
if (nrow(nearest_match) > 0) {
|
|
data.frame(
|
|
field = h_field,
|
|
season_end = h_date,
|
|
actual_harvest_week = h_week,
|
|
actual_harvest_year = h_year,
|
|
matched_date = nearest_match$date,
|
|
date_diff = nearest_match$date_diff,
|
|
mean_ci = nearest_match$mean_ci,
|
|
stringsAsFactors = FALSE
|
|
)
|
|
} else {
|
|
data.frame(
|
|
field = h_field,
|
|
season_end = h_date,
|
|
actual_harvest_week = h_week,
|
|
actual_harvest_year = h_year,
|
|
matched_date = as.Date(NA),
|
|
date_diff = NA,
|
|
mean_ci = NA,
|
|
stringsAsFactors = FALSE
|
|
)
|
|
}
|
|
}) %>%
|
|
ungroup() %>%
|
|
mutate(has_ci_data = !is.na(mean_ci))
|
|
|
|
# Summary statistics
|
|
cat("=== CI VALUES AT ACTUAL HARVEST DATES ===\n")
|
|
cat("Harvests with CI data:", sum(harvest_analysis$has_ci_data), "/", nrow(harvest_analysis), "\n\n")
|
|
|
|
ci_at_harvest <- harvest_analysis %>% filter(has_ci_data)
|
|
|
|
if (nrow(ci_at_harvest) > 0) {
|
|
cat("CI Statistics at harvest:\n")
|
|
cat(" Min:", round(min(ci_at_harvest$mean_ci, na.rm = TRUE), 2), "\n")
|
|
cat(" Max:", round(max(ci_at_harvest$mean_ci, na.rm = TRUE), 2), "\n")
|
|
cat(" Mean:", round(mean(ci_at_harvest$mean_ci, na.rm = TRUE), 2), "\n")
|
|
cat(" Median:", round(median(ci_at_harvest$mean_ci, na.rm = TRUE), 2), "\n")
|
|
cat(" Q25:", round(quantile(ci_at_harvest$mean_ci, 0.25, na.rm = TRUE), 2), "\n")
|
|
cat(" Q75:", round(quantile(ci_at_harvest$mean_ci, 0.75, na.rm = TRUE), 2), "\n\n")
|
|
|
|
cat("Distribution of CI at harvest:\n")
|
|
cat(" CI < 1.0:", sum(ci_at_harvest$mean_ci < 1.0, na.rm = TRUE), "\n")
|
|
cat(" CI < 1.5:", sum(ci_at_harvest$mean_ci < 1.5, na.rm = TRUE), "\n")
|
|
cat(" CI < 2.0:", sum(ci_at_harvest$mean_ci < 2.0, na.rm = TRUE), "\n")
|
|
cat(" CI < 2.5:", sum(ci_at_harvest$mean_ci < 2.5, na.rm = TRUE), "\n")
|
|
cat(" CI < 3.0:", sum(ci_at_harvest$mean_ci < 3.0, na.rm = TRUE), "\n")
|
|
cat(" CI >= 3.0:", sum(ci_at_harvest$mean_ci >= 3.0, na.rm = TRUE), "\n\n")
|
|
}
|
|
|
|
# Look at CI values in DAYS BEFORE and AFTER harvest
|
|
cat("\n=== CI TEMPORAL PATTERN AROUND HARVEST (DAILY) ===\n")
|
|
cat("Analyzing ±30 days around actual harvest dates...\n\n")
|
|
|
|
# For each harvest, get CI values in surrounding days
|
|
temporal_analysis <- harvest_actual %>%
|
|
rowwise() %>%
|
|
do({
|
|
field_name <- .$field
|
|
harvest_date <- .$season_end
|
|
|
|
# Get CI values for days around harvest
|
|
field_ts <- time_series %>%
|
|
filter(field_id == field_name,
|
|
date >= (harvest_date - 30),
|
|
date <= (harvest_date + 30)) %>%
|
|
mutate(
|
|
days_from_harvest = as.numeric(date - harvest_date),
|
|
harvest_date_ref = harvest_date
|
|
) %>%
|
|
select(field_id, date, days_from_harvest, mean_ci)
|
|
|
|
field_ts
|
|
}) %>%
|
|
ungroup()
|
|
|
|
if (nrow(temporal_analysis) > 0) {
|
|
summary_by_offset <- temporal_analysis %>%
|
|
group_by(days_from_harvest) %>%
|
|
summarise(
|
|
n = n(),
|
|
mean_ci = mean(mean_ci, na.rm = TRUE),
|
|
median_ci = median(mean_ci, na.rm = TRUE),
|
|
min_ci = min(mean_ci, na.rm = TRUE),
|
|
max_ci = max(mean_ci, na.rm = TRUE),
|
|
sd_ci = sd(mean_ci, na.rm = TRUE),
|
|
.groups = "drop"
|
|
) %>%
|
|
arrange(days_from_harvest)
|
|
|
|
cat("\nDaily CI pattern around harvest (±30 days):\n")
|
|
print(summary_by_offset, n = 100)
|
|
|
|
# Calculate CI drop from pre-harvest to post-harvest
|
|
cat("\n=== CI DROP ANALYSIS ===\n")
|
|
pre_harvest_ci <- summary_by_offset %>%
|
|
filter(days_from_harvest >= -7, days_from_harvest <= -1) %>%
|
|
summarise(mean_ci = mean(mean_ci, na.rm = TRUE)) %>%
|
|
pull(mean_ci)
|
|
|
|
harvest_day_ci <- summary_by_offset %>%
|
|
filter(days_from_harvest == 0) %>%
|
|
pull(mean_ci)
|
|
|
|
post_harvest_ci <- summary_by_offset %>%
|
|
filter(days_from_harvest >= 1, days_from_harvest <= 7) %>%
|
|
summarise(mean_ci = mean(mean_ci, na.rm = TRUE)) %>%
|
|
pull(mean_ci)
|
|
|
|
cat("CI 7 days before harvest:", round(pre_harvest_ci, 2), "\n")
|
|
cat("CI on harvest day:", round(harvest_day_ci, 2), "\n")
|
|
cat("CI 7 days after harvest:", round(post_harvest_ci, 2), "\n")
|
|
cat("Drop (pre to harvest day):", round(pre_harvest_ci - harvest_day_ci, 2), "\n")
|
|
cat("Drop (harvest day to post):", round(harvest_day_ci - post_harvest_ci, 2), "\n")
|
|
cat("Total drop (pre to post):", round(pre_harvest_ci - post_harvest_ci, 2), "\n\n")
|
|
|
|
# Analyze when CI starts dropping
|
|
cat("\n=== WHEN DOES CI DROP START? ===\n")
|
|
baseline_ci <- summary_by_offset %>%
|
|
filter(days_from_harvest >= -30, days_from_harvest <= -15) %>%
|
|
summarise(mean_ci = mean(mean_ci, na.rm = TRUE)) %>%
|
|
pull(mean_ci)
|
|
|
|
cat("Baseline CI (days -30 to -15):", round(baseline_ci, 2), "\n")
|
|
|
|
# Find when CI first drops significantly below baseline
|
|
drop_start <- summary_by_offset %>%
|
|
filter(days_from_harvest < 0) %>%
|
|
mutate(drop_from_baseline = baseline_ci - mean_ci) %>%
|
|
filter(drop_from_baseline > 0.3) %>% # Significant drop
|
|
arrange(days_from_harvest) %>%
|
|
head(1)
|
|
|
|
if (nrow(drop_start) > 0) {
|
|
cat("First significant drop detected at day:", drop_start$days_from_harvest,
|
|
"(CI:", round(drop_start$mean_ci, 2), ", drop:", round(drop_start$drop_from_baseline, 2), ")\n")
|
|
}
|
|
|
|
# Find when CI reaches minimum
|
|
min_ci_day <- summary_by_offset %>%
|
|
filter(days_from_harvest >= -30, days_from_harvest <= 30) %>%
|
|
arrange(mean_ci) %>%
|
|
head(1)
|
|
|
|
cat("Minimum CI reached at day:", min_ci_day$days_from_harvest,
|
|
"(CI:", round(min_ci_day$mean_ci, 2), ")\n")
|
|
|
|
# Find when CI starts recovering
|
|
recovery_start <- summary_by_offset %>%
|
|
filter(days_from_harvest > 0) %>%
|
|
mutate(recovery_from_harvest = mean_ci - harvest_day_ci) %>%
|
|
filter(recovery_from_harvest > 0.3) %>% # Significant recovery
|
|
arrange(days_from_harvest) %>%
|
|
head(1)
|
|
|
|
if (nrow(recovery_start) > 0) {
|
|
cat("Recovery detected at day:", recovery_start$days_from_harvest,
|
|
"(CI:", round(recovery_start$mean_ci, 2), ", gain:", round(recovery_start$recovery_from_harvest, 2), ")\n")
|
|
}
|
|
|
|
# Analyze the ENTIRE harvest period (not just a single day)
|
|
cat("\n=== MULTI-DAY HARVEST PERIOD ANALYSIS ===\n")
|
|
cat("Harvest may span multiple days/weeks. Looking for extended low CI periods...\n\n")
|
|
|
|
# Count consecutive days below different thresholds
|
|
for (threshold in c(1.5, 2.0, 2.5, 3.0)) {
|
|
consecutive_low <- temporal_analysis %>%
|
|
arrange(field_id, date) %>%
|
|
group_by(field_id) %>%
|
|
mutate(
|
|
is_low = mean_ci < threshold,
|
|
day_diff = as.numeric(date - lag(date)),
|
|
new_period = is.na(day_diff) | day_diff > 3 | !is_low, # Gap or not low
|
|
period_id = cumsum(new_period)
|
|
) %>%
|
|
filter(is_low) %>%
|
|
group_by(field_id, period_id) %>%
|
|
summarise(
|
|
start_day = min(days_from_harvest),
|
|
end_day = max(days_from_harvest),
|
|
duration = n(),
|
|
mean_ci_period = mean(mean_ci),
|
|
.groups = "drop"
|
|
) %>%
|
|
filter(duration >= 3) # At least 3 consecutive days
|
|
|
|
if (nrow(consecutive_low) > 0) {
|
|
cat("\nConsecutive periods with CI <", threshold, ":\n")
|
|
cat(" Number of periods:", nrow(consecutive_low), "\n")
|
|
cat(" Average duration:", round(mean(consecutive_low$duration), 1), "days\n")
|
|
cat(" Median start day:", round(median(consecutive_low$start_day), 1), "\n")
|
|
cat(" Median end day:", round(median(consecutive_low$end_day), 1), "\n")
|
|
|
|
# Show distribution of when these periods start
|
|
periods_before <- sum(consecutive_low$start_day < -7)
|
|
periods_during <- sum(consecutive_low$start_day >= -7 & consecutive_low$start_day <= 7)
|
|
periods_after <- sum(consecutive_low$start_day > 7)
|
|
|
|
cat(" Periods starting before harvest (-30 to -7):", periods_before, "\n")
|
|
cat(" Periods starting during harvest (-7 to +7):", periods_during, "\n")
|
|
cat(" Periods starting after harvest (+7 to +30):", periods_after, "\n")
|
|
}
|
|
}
|
|
|
|
cat("\n=== RECOMMENDED THRESHOLDS (DAILY DATA) ===\n")
|
|
ci_75th <- quantile(ci_at_harvest$mean_ci, 0.75, na.rm = TRUE)
|
|
ci_90th <- quantile(ci_at_harvest$mean_ci, 0.90, na.rm = TRUE)
|
|
|
|
cat("Based on actual harvest CI values:\n")
|
|
cat(" Conservative threshold (captures 75% of harvests): CI <", round(ci_75th, 2), "\n")
|
|
cat(" Aggressive threshold (captures 90% of harvests): CI <", round(ci_90th, 2), "\n\n")
|
|
|
|
# Calculate drop thresholds
|
|
if (!is.na(pre_harvest_ci) && !is.na(post_harvest_ci)) {
|
|
typical_drop <- pre_harvest_ci - post_harvest_ci
|
|
cat("Typical CI drop (7 days before to 7 days after):", round(typical_drop, 2), "\n")
|
|
cat("Suggested drop_threshold:", round(typical_drop * 0.5, 2), "(half of typical drop)\n\n")
|
|
}
|
|
|
|
cat("Suggested detection parameters for daily data:\n")
|
|
cat(" low_ci_threshold:", round(ci_75th, 1), "(75th percentile of harvest CI)\n")
|
|
cat(" drop_threshold:", round((pre_harvest_ci - post_harvest_ci) * 0.5, 1), "(half of typical drop)\n")
|
|
cat(" min_low_days: 7-10 (stay below threshold for this many days)\n")
|
|
cat(" recovery_threshold:", round(pre_harvest_ci, 1), "(pre-harvest CI level)\n")
|
|
}
|
|
|
|
# Show sample cases where detection failed
|
|
cat("\n\n=== SAMPLE HARVEST DATES WITH CI VALUES ===\n")
|
|
sample_harvests <- harvest_analysis %>%
|
|
filter(has_ci_data) %>%
|
|
arrange(mean_ci) %>%
|
|
select(field, season_end, actual_harvest_week, actual_harvest_year, mean_ci) %>%
|
|
head(15)
|
|
|
|
cat("15 harvests with LOWEST CI on harvest day:\n")
|
|
print(sample_harvests)
|
|
|
|
sample_high <- harvest_analysis %>%
|
|
filter(has_ci_data) %>%
|
|
arrange(desc(mean_ci)) %>%
|
|
select(field, season_end, actual_harvest_week, actual_harvest_year, mean_ci) %>%
|
|
head(10)
|
|
|
|
cat("\n10 harvests with HIGHEST CI on harvest day:\n")
|
|
print(sample_high)
|