Refactor the cum_ci_plot function for enhanced plotting options: add parameters for plot type and season filtering, improve x-axis labeling, and update ggplot aesthetics. Also continue work on yield prediction in R.

This commit is contained in:
Timon 2024-10-29 10:01:37 +01:00 committed by Martin Folkerts
parent ddb4fdbc02
commit c1a32bf6ad
4 changed files with 214 additions and 110 deletions

View file

@ -53,6 +53,7 @@ library(randomForest)
library(CAST) library(CAST)
source("report_utils.R") source("report_utils.R")
# source(here("r_app", "report_utils.R"))
``` ```
@ -269,87 +270,24 @@ print(" PRINT")
} }
``` ```
```{r eval=FALSE, fig.height=10, fig.width=14, include=FALSE}
# Chunk purpose: plot cumulative CI against DOY for season "Data_2022", one
# line per pivot, faceted by the age class `month` computed below.
# Load the whole-pivot cumulative CI table and store `season` as a factor.
CI_all2 <- readRDS(here(cumulative_CI_vals_dir, "All_pivots_Cumulative_CI_whole_pivot_year.rds")) %>%
mutate(#line = substr(pivot, 1 , 1),
season = as.factor(season))
# Load the harvest table; only its `pivot` and `Age` columns are used in this chunk.
pivots_dates <- readRDS(here(harvest_dir, "harvest_data_new")) #%>%
# Distinct pivot/Age combinations, used to attach an age to every CI row.
pvt_age <- pivots_dates %>% ungroup() %>% select(pivot, Age) %>% unique()
# Attach Age and derive the facet variable `month` = Age / 4 rounded to a multiple of 2.
# NOTE(review): Age / 4 being called "month" suggests Age is in weeks - confirm.
# NOTE(review): plyr::round_any() rounds to the NEAREST multiple with its default
# rounding function, while the plot title below says "rounded down" - confirm which is intended.
CI_all2 <- left_join(CI_all2, pvt_age , by = "pivot") %>% mutate(month = plyr::round_any((Age/4),2)) %>%
# Every class above 16 is collapsed into a single class labelled 18.
mutate(month = case_when(month > 16 ~ 18,
TRUE ~month)) %>%
# If a pivot has several Age values, keep only the rows joined to its largest Age.
group_by(pivot) %>% filter(Age == max(Age)) %>% ungroup()
# Re-code season as an ordered factor; values other than the two listed levels become NA.
CI_all2$season <- ordered(CI_all2$season, levels = c("Data_2021", "Data_2022"))
# CI_all2 <- CI_all2 %>% mutate(season_order = as.integer(season))
# Rows of season "Data_2022" only; the result stays grouped by pivot.
latest_model <- CI_all2 %>% group_by(pivot) %>% filter(season =="Data_2022")
# latest_model <- CI_all2 %>% group_by(pivot) %>% arrange(season, desc(DOY)) %>% slice(1)
# CI_all2 <- CI_all %>% group_by(pivot, DOY ) %>% mutate(pivot_cumulative_CI = mean(cumulative_CI))
# label_data <- CI_all2 %>% group_by(pivot) %>% arrange(season, desc(cumulative_CI)) %>% slice(1) %>% mutate(label = paste(pivot, " - ", round(cumulative_CI)))
# One label row per pivot (grouping is inherited from latest_model): the row with
# the highest cumulative_CI, labelled "<pivot> - <rounded cumulative CI>".
label_data <- latest_model %>% arrange(season, desc(cumulative_CI)) %>% slice(1) %>% mutate(label = paste(pivot, " - ", round(cumulative_CI)))
# DOY of each pivot's label row; its overall maximum sets the x-axis breaks below.
max_day <- label_data %>% group_by(pivot) %>% summarise(max_day = max(DOY))
ggplot(data= CI_all2%>% filter(season =="Data_2022"), aes(DOY, cumulative_CI, col = pivot)) +
facet_wrap(~month) +
geom_line() +
# scale_y_continuous(breaks = seq(0, max(label_data$cumulative_CI) + 100, by = 100)) +
# The x label set here is replaced by the later labs(x = "Weeks") call.
labs(title = "Cumulative CI values over the year per pivot, split per 2-month age (rounded down)", x = "Days after harvest/planting (scale is per 2 weeks)", y = "Cumulative CI value",
color = "Line") +
# NOTE(review): `model` is not created anywhere in this chunk; this filter only
# works if the RDS/harvest data already carries a `model` column - confirm
# (`pivot` may have been intended).
geom_label_repel(data = label_data %>% filter(model %in% latest_model$model), aes(DOY, cumulative_CI, label = label), box.padding = 1,
# ylim = c(1300, NA),
max.overlaps = Inf
# segment.linetype = 4,
# segment.curvature = -1e-20,
# arrow = arrow(length = unit(0.015, "npc"))
) +
# legend.position = "right" is overridden by the final theme(legend.position = "none").
theme(legend.position="right", legend.text = element_text(size=8), legend.title = element_text(size = 8),
plot.title = element_text(size=19)) +
# geom_smooth()+
guides(fill = guide_legend(nrow=2,byrow=TRUE)) +
# y breaks every 100 CI units; x breaks every 30 DOY units, with vertical tick labels.
scale_y_continuous(breaks=seq(0,max(label_data$cumulative_CI),100)) +
scale_x_continuous(breaks=seq(0,max(max_day$max_day),30)) + theme(axis.text.x = element_text(angle = 90)) +
# NOTE(review): the axis is labelled "Weeks" although DOY is plotted with breaks every 30 - confirm the unit.
labs(x = "Weeks")+
theme(legend.position = "none")
```
# Yield prediction # Yield prediction
The below table shows estimates of the biomass if you would harvest them now. The below table shows estimates of the biomass if you would harvest them now.
```{r eval=FALSE, message=FALSE, warning=FALSE, include=FALSE} ```{r eval=FALSE, message=FALSE, warning=FALSE, include=TRUE}
CI_quadrant <- readRDS(here(cumulative_CI_vals_dir,"All_pivots_Cumulative_CI_quadrant_year_v2.rds")) %>% CI_quadrant <- readRDS(here(cumulative_CI_vals_dir,"All_pivots_Cumulative_CI_quadrant_year_v2.rds"))
rename( pivot_quadrant = field)#All_pivots_Cumulative_CI.rds
ggplot(CI_quadrant %>% filter(pivot %in% "1.11")) +
geom_line(aes(DOY, cumulative_CI, col = as.factor(season))) +
facet_wrap(~pivot_quadrant)
pivots_dates0 <- pivots_dates0 %>% ungroup() %>% unique() %>% harvesting_data <- harvesting_data %>% rename(season = year)
dplyr::select(field, sub_field, Tcha_2021, Tcha_2022 ) %>%
pivot_longer(cols = c("Tcha_2021", "Tcha_2022"), names_to = "Tcha_Year", values_to = "Tcha") %>%
filter(Tcha > 50) %>%
mutate(season = as.integer(str_extract(Tcha_Year, "\\d+")))
CI_and_yield <- left_join(CI_quadrant , pivots_dates0, by = c("pivot", "pivot_quadrant", "season")) %>% filter(!is.na(Tcha)) %>% CI_and_yield <- left_join(CI_quadrant , harvesting_data, by = c("field", "sub_field", "season")) %>% #filter(!is.na(tonnage_ha)) %>%
group_by(pivot_quadrant, season) %>% slice(which.max(DOY)) %>% group_by(sub_field, season) %>% slice(which.max(DOY)) %>%
dplyr::select(pivot, pivot_quadrant, Tcha_Year, Tcha, cumulative_CI, DOY, season) %>% dplyr::select(field, sub_field, tonnage_ha, cumulative_CI, DOY, season, sub_area) %>%
mutate(CI_per_day = cumulative_CI/ DOY) mutate(CI_per_day = cumulative_CI/ DOY)
ggplot(CI_and_yield) +
geom_point(aes(Tcha, CI_per_day, col = Tcha_Year ))
set.seed(20)
CI_and_yield_split <- initial_split(CI_and_yield, prop = 0.75, strata = pivot_quadrant)
CI_and_yield_test <- training(CI_and_yield_split)
CI_and_yield_validation <- testing(CI_and_yield_split)
predictors <- c( "cumulative_CI" , "DOY" ,"CI_per_day" ) predictors <- c( "cumulative_CI" , "DOY" ,"CI_per_day" )
response <- "Tcha" response <- "tonnage_ha"
CI_and_yield_test <- as.data.frame(CI_and_yield_test) # CI_and_yield_test <- as.data.frame(CI_and_yield_test)
CI_and_yield_test <- CI_and_yield %>% as.data.frame() %>% filter(!is.na(tonnage_ha))
CI_and_yield_validation <- CI_and_yield_test
prediction_yields <- CI_and_yield %>% as.data.frame() %>% filter(is.na(tonnage_ha))
ctrl <- trainControl(method="cv", ctrl <- trainControl(method="cv",
savePredictions = TRUE, savePredictions = TRUE,
@ -368,46 +306,52 @@ model_ffs_rf <- ffs( CI_and_yield_test[,predictors],
na.rm = TRUE na.rm = TRUE
) )
pred_ffs_rf <-
predict(model_ffs_rf, newdata = CI_and_yield_validation) %>% as.data.frame() %>% rename(predicted_Tcha = ".") %>% mutate(
pivot_quadrant = CI_and_yield_validation$pivot_quadrant,
pivot = CI_and_yield_validation$pivot,
Age_days = CI_and_yield_validation$DOY,
total_CI = round(CI_and_yield_validation$cumulative_CI, 0),
predicted_Tcha = round(predicted_Tcha, 0),
season = CI_and_yield_validation$season
) %>% dplyr::select(pivot , pivot_quadrant, Age_days, total_CI, predicted_Tcha, season) %>%
left_join(., CI_and_yield_validation, by = c("pivot", "pivot_quadrant", "season")) %>%
filter(Age_days > 250)
# Turn raw model output into a tidy prediction table.
#
# Arguments:
#   predictions  Predicted values, row-aligned with `newdata` (e.g. the result
#                of predict(model, newdata = newdata)).
#   newdata      Data frame the predictions were made for; must contain
#                sub_field, field, DOY, cumulative_CI and season.
#
# Returns a data frame with field, sub_field, Age_days (= DOY), total_CI
# (cumulative_CI rounded to whole units), predicted_Tcha (rounded to whole
# units) and season, followed by the remaining `newdata` columns joined back on
# field / sub_field / season.
prepare_predictions <- function(predictions, newdata) {
  join_keys <- c("field", "sub_field", "season")

  # Piping a bare vector into as.data.frame() yields a single column named ".",
  # which is then given its proper name.
  predicted <- predictions %>%
    as.data.frame() %>%
    rename(predicted_Tcha = ".")

  # Copy the identifying columns over by position.
  labelled <- predicted %>%
    mutate(
      sub_field = newdata$sub_field,
      field = newdata$field,
      Age_days = newdata$DOY,
      total_CI = round(newdata$cumulative_CI, 0),
      predicted_Tcha = round(predicted_Tcha, 0),
      season = newdata$season
    )

  tidy <- dplyr::select(labelled, field, sub_field, Age_days, total_CI, predicted_Tcha, season)
  left_join(tidy, newdata, by = join_keys)
}
# Predict yields for the validation dataset
pred_ffs_rf <- prepare_predictions(predict(model_ffs_rf, newdata = CI_and_yield_validation), CI_and_yield_validation)
# Predict yields for the current season
prediction_2023 <- CI_quadrant %>% filter(season == "2023") %>% group_by(pivot_quadrant) %>% slice(which.max(DOY))%>% pred_rf_current_season <- prepare_predictions(predict(model_ffs_rf, newdata = prediction_yields), prediction_yields) %>%
mutate(CI_per_day = cumulative_CI/ DOY)
pred_rf_2023 <- predict(model_ffs_rf, newdata=prediction_2023) %>%
as.data.frame() %>% rename(predicted_Tcha_2023 = ".") %>% mutate(pivot_quadrant = prediction_2023$pivot_quadrant,
pivot = prediction_2023$pivot,
Age_days = prediction_2023$DOY,
total_CI = round(prediction_2023$cumulative_CI,0),
predicted_Tcha_2023 = round(predicted_Tcha_2023, 0)) %>%
filter(Age_days > 300) %>% filter(Age_days > 300) %>%
dplyr::select(pivot ,pivot_quadrant, Age_days, total_CI, predicted_Tcha_2023)%>% mutate(CI_per_day = round(total_CI / Age_days, 1))
mutate(CI_per_day = round(total_CI/ Age_days, 1))
``` ```
```{r yield_plaatjes, eval=FALSE, include=FALSE} ```{r yield_plaatjes, eval=FALSE, include=TRUE}
ggplot(pred_ffs_rf, aes(y = predicted_Tcha , x = Tcha , col = pivot )) + ggplot(pred_ffs_rf, aes(y = predicted_Tcha, x = tonnage_ha, col = sub_area)) +
geom_point() +geom_abline() + geom_point(size = 2, alpha = 0.6) + # Adjust point size and transparency
scale_x_continuous(limits = c(50, 160))+ geom_abline(intercept = 0, slope = 1, linetype = "dashed", color = "red") + # Reference line
scale_y_continuous(limits = c(50, 160)) + scale_x_continuous(limits = c(0, 200)) +
labs(title = "Model trained and tested on historical results - RF") scale_y_continuous(limits = c(0, 200)) +
labs(title = "Model Performance: \nPredicted vs Actual Tonnage/ha",
x = "Actual tonnage/ha (Tcha)",
y = "Predicted tonnage/ha (Tcha)") +
theme_minimal()
ggplot(pred_rf_2023, aes(total_CI , predicted_Tcha_2023 , col = pivot )) + ggplot(pred_rf_current_season, aes(x = Age_days, y = predicted_Tcha, col = field)) +
geom_point() + labs(title = "2023 data (still to be harvested) - fields over 300 days old") geom_point(size = 2, alpha = 0.6) + # Adjust point size and transparency
labs(title = "Predicted Yields for Fields Over 300 Days \nOld Yet to Be Harvested",
x = "Age (days)",
y = "Predicted tonnage/ha (Tcha)") +
facet_wrap(~sub_area) +
scale_y_continuous(limits = c(0, 200)) + # Optional: Set limits for y-axis
theme_minimal()
knitr::kable(pred_rf_2023) knitr::kable(pred_rf_current_season,
digits = 0,
caption = "Predicted Tonnage/ha for Fields Over 300 Days Old")
``` ```

91
r_app/counting_clouds.R Normal file
View file

@ -0,0 +1,91 @@
# Required packages
# library(ggplot2)
# library(dplyr)
# List the merged GeoTIFFs. `merged_final` (the directory) must already be
# defined by the calling session. The rest of the script treats every date that
# has a file as a cloud-free day.
# `pattern` is a regular expression matched against the file name, so the dot is
# escaped and the extension anchored; a bare ".tif" would also pick up side-car
# files such as "2024-01-01.tif.aux.xml".
raster_files_NEW <- list.files(merged_final, full.names = TRUE, pattern = "\\.tif$")

# Extract the date from each file name (expected format "YYYY-MM-DD.tif").
# sub() is vectorised, so no sapply() is needed. An explicit `format` turns
# non-conforming names into NA instead of an obscure parsing error.
no_cloud_dates <- as.Date(
  sub("^([0-9]{4}-[0-9]{2}-[0-9]{2})\\.tif$", "\\1", basename(raster_files_NEW)),
  format = "%Y-%m-%d"
)

# Drop (but report) files whose name carries no parseable date; an NA here
# would otherwise break min()/max()/seq() further down.
unparsed_names <- is.na(no_cloud_dates)
if (any(unparsed_names)) {
  warning(
    "Ignoring ", sum(unparsed_names), " .tif file(s) without a YYYY-MM-DD name: ",
    paste(basename(raster_files_NEW)[unparsed_names], collapse = ", "),
    call. = FALSE
  )
  no_cloud_dates <- no_cloud_dates[!unparsed_names]
}
if (length(no_cloud_dates) == 0) {
  stop("No dated .tif files found in 'merged_final'.", call. = FALSE)
}
# Daily calendar running from the first to the last date that has an image
observed_range <- range(no_cloud_dates)
start_date <- observed_range[1]
end_date <- observed_range[2]
all_dates <- seq(from = start_date, to = end_date, by = "day")

# One row per calendar day: cloud_status is 1 when an image exists for that day
# (no cloud) and 0 when it does not (cloud)
cloud_data <- data.frame(
  date = all_dates,
  cloud_status = as.numeric(all_dates %in% no_cloud_dates)
)
# Daily cloud status over the whole period, default date axis
ggplot(data = cloud_data, mapping = aes(date, cloud_status)) +
  geom_point() +
  scale_y_continuous(breaks = c(0, 1)) +
  theme_minimal() +
  labs(y = "Cloud Status (1 = No Cloud, 0 = Cloud)", x = "Date")

# Same plot with one abbreviated month label per month on the x-axis
ggplot(data = cloud_data, mapping = aes(date, cloud_status)) +
  geom_point() +
  scale_y_continuous(breaks = c(0, 1)) +
  scale_x_date(date_breaks = "1 month", date_labels = "%b") +
  theme_minimal() +
  labs(y = "Cloud Status (1 = No Cloud, 0 = Cloud)", x = "Month")
# Aggregate the daily flags to ISO weeks. From here on `cloud_data` holds one
# row per (year, week) instead of one row per day.
# isoweek() has to be paired with isoyear(), not year(): the days around New
# Year can belong to an ISO week of the neighbouring year (2024-12-30 is ISO
# week 1 of 2025). With year() those days were merged into week 1 of the wrong
# year, which could produce "weeks" of more than 7 days.
# isoweek()/isoyear() come from lubridate, which must be attached.
cloud_data <- cloud_data %>%
  mutate(week = isoweek(date), year = isoyear(date)) %>%
  group_by(year, week) %>%
  summarise(
    no_cloud_days = sum(cloud_status == 1),
    cloud_days = sum(cloud_status == 0),
    .groups = "drop" # return an ungrouped table; later steps regroup as needed
  )
# 1. Weeks without a single image (all 7 days flagged as cloud)
weeks_no_images <- filter(cloud_data, cloud_days == 7)

# Tile map of the weeks without images
ggplot(data = weeks_no_images, mapping = aes(week, year)) +
  geom_tile(fill = "red") +
  theme_minimal() +
  labs(title = "Weeks with No Images (Full Cloud Cover)", y = "Year", x = "Week")

# 2. Weeks in which cloud days outnumber cloud-free days
weeks_most_clouds <- filter(cloud_data, cloud_days > no_cloud_days)

# Tile map of the predominantly cloudy weeks
ggplot(data = weeks_most_clouds, mapping = aes(week, year)) +
  geom_tile(fill = "blue") +
  theme_minimal() +
  labs(title = "Weeks with Most Clouds", y = "Year", x = "Week")
# Frequency table: how many weeks had 0, 1, ..., 7 cloud days
weeks_by_cloud_days <- cloud_data %>%
  ungroup() %>%
  count(cloud_days, name = "week_count")

# Show the table
print(weeks_by_cloud_days)

# Optional: bar chart of the same distribution
ggplot(data = weeks_by_cloud_days, mapping = aes(cloud_days, week_count)) +
  geom_col(fill = "skyblue") +
  theme_minimal() +
  labs(title = "Distribution of Cloud Days per Week",
       y = "Number of Weeks",
       x = "Number of Cloud Days (per week)")
# Frequency table: how many weeks had 0, 1, ..., 7 cloud-free days.
# `no_cloud_days` is already counted per week in `cloud_data`. It must not be
# recomputed as 7 - cloud_days: the first and last week of the observation
# period (and any other incomplete week) contain fewer than 7 days, so that
# subtraction overstated their number of cloud-free days.
weeks_by_no_cloud_days <- cloud_data %>%
  group_by(no_cloud_days) %>%
  summarise(week_count = n())

# Plot the results to visualise how many weeks had 0-7 cloud-free days
ggplot(weeks_by_no_cloud_days, aes(x = no_cloud_days, y = week_count)) +
  geom_col(fill = "#00A799") +
  geom_text(aes(label = week_count), vjust = -0.5, size = 4) + # week count on top of each bar
  labs(x = "Number of Cloud-Free Days (per week)", y = "Number of Weeks",
       title = "Distribution of Cloud-Free Days per Week") +
  theme_minimal()

View file

@ -90,7 +90,7 @@ ci_plot <- function(pivotName){
cum_ci_plot <- function(pivotName){ cum_ci_plot <- function(pivotName){
# pivotName = "1.1" # pivotName = "3a13"
data_ci <- CI_quadrant %>% filter(field == pivotName) data_ci <- CI_quadrant %>% filter(field == pivotName)
if (nrow(data_ci) == 0) { if (nrow(data_ci) == 0) {
@ -125,6 +125,75 @@ cum_ci_plot <- function(pivotName){
} }
# Plot the CI development of one field (pivot) for its three most recent seasons.
#
# Arguments:
#   pivotName  Value of the `field` column to select from the global
#              `CI_quadrant` table.
#   plot_type  Series to draw, one of
#                "value"         - rolling mean of `value`,
#                "CI_rate"       - rolling mean of cumulative_CI / DOY,
#                "cumulative_CI" - the cumulative_CI column as is.
#              Any other value is rejected with an error.
#   facet_on   TRUE:  one panel per season, Date on the x-axis, one line per
#                     sub_field.
#              FALSE: a single panel, DOY (crop age) on the x-axis, one line
#                     per season.
#
# The plot is handed to subchunkify(g, 3.2, 10). When `CI_quadrant` has no rows
# for the field, the result of cum_ci_plot2(pivotName) is returned instead.
cum_ci_plot <- function(pivotName, plot_type = "value", facet_on = TRUE) {
  # pivotName = "3a13"
  # Fail early on an unknown plot type; switch() used to return NULL silently,
  # which only surfaced later as an obscure plotting problem.
  plot_type <- match.arg(plot_type, c("value", "CI_rate", "cumulative_CI"))

  data_ci <- CI_quadrant %>% filter(field == pivotName)
  if (nrow(data_ci) == 0) {
    # No CI data for this field: fall back to cum_ci_plot2()
    return(cum_ci_plot2(pivotName))
  }

  # Rolling means use a right-aligned window of 10 rows (shorter at the start
  # because partial = TRUE); the axis labels call this "10-Day".
  # NOTE(review): the window runs over all rows of the field in stored order and
  # is not restarted per sub_field or season - confirm this is intended.
  data_ci2 <- data_ci %>%
    mutate(CI_rate = cumulative_CI / DOY) %>%
    group_by(field) %>%
    mutate(
      mean_CIrate_rolling_10_days = rollapplyr(CI_rate, width = 10, FUN = mean, partial = TRUE),
      mean_rolling_10_days = rollapplyr(value, width = 10, FUN = mean, partial = TRUE)
    ) %>%
    ungroup() %>%
    mutate(season = as.factor(season))

  # Up to three most recent seasons. head() returns fewer elements when fewer
  # seasons exist, whereas `[1:3]` padded the vector with NA.
  unique_seasons <- head(sort(unique(data_ci2$season), decreasing = TRUE), 3)
  plot_data <- data_ci2 %>% filter(season %in% unique_seasons)

  # Column to plot and matching axis label for each plot type
  y_columns <- c(
    value = "mean_rolling_10_days",
    CI_rate = "mean_CIrate_rolling_10_days",
    cumulative_CI = "cumulative_CI"
  )
  y_labels <- c(
    value = "10-Day Rolling Mean CI",
    CI_rate = "10-Day Rolling Mean CI Rate (cumulative CI / age)",
    cumulative_CI = "Cumulative CI"
  )
  y_aesthetic <- y_columns[[plot_type]]
  y_label <- y_labels[[plot_type]]

  # Only the layers that differ between the two layouts are built here.
  # aes() with the .data pronoun replaces the deprecated aes_string().
  if (facet_on) {
    g <- ggplot(data = plot_data) +
      facet_wrap(~season, scales = "free_x") +
      geom_line(aes(x = Date, y = .data[[y_aesthetic]], col = sub_field, group = sub_field)) +
      labs(color = "Field Name") +
      scale_x_date(date_breaks = "1 month", date_labels = "%m-%Y")
  } else {
    g <- ggplot(data = plot_data) +
      geom_line(aes(x = DOY, y = .data[[y_aesthetic]], col = season, group = season)) +
      labs(color = "Season", x = "Age of Crop (Days)")
  }

  # Title, y label, theme and legend layout shared by both layouts
  g <- g +
    labs(title = paste("Plot of", y_label, "for Pivot", pivotName),
         y = y_label) +
    theme_minimal() +
    theme(axis.text.x = element_text(angle = 60, hjust = 1),
          legend.justification = c(1, 0), legend.position = c(1, 0),
          legend.title = element_text(size = 8),
          legend.text = element_text(size = 8)) +
    guides(color = guide_legend(nrow = 2, byrow = TRUE))

  subchunkify(g, 3.2, 10)
}
cum_ci_plot2 <- function(pivotName){ cum_ci_plot2 <- function(pivotName){
end_date <- Sys.Date() end_date <- Sys.Date()
start_date <- end_date %m-% months(11) # 11 months ago from end_date start_date <- end_date %m-% months(11) # 11 months ago from end_date