Refactor cum_ci_plot function for enhanced plotting options; added parameters for plot type and season filtering; improved x-axis labeling and updated ggplot aesthetics. Also, continued work on yield prediction in R.

2024-10-29 10:01:37 +01:00 · 2024-10-29 10:01:37 +01:00 · c1a32bf6ad
parent ddb4fdbc02
commit c1a32bf6ad
4 changed files with 214 additions and 110 deletions
--- a/laravel_app/tests/fixtures/Muhoroni/harvest.xlsx
+++ b/laravel_app/tests/fixtures/Muhoroni/harvest.xlsx
--- a/r_app/CI_report_dashboard_planet.Rmd
+++ b/r_app/CI_report_dashboard_planet.Rmd
@ -53,6 +53,7 @@ library(randomForest)
 library(CAST)

 source("report_utils.R")
+# source(here("r_app", "report_utils.R"))

 ```

@ -269,87 +270,24 @@ print(" PRINT")
 }
 ```

-```{r eval=FALSE, fig.height=10, fig.width=14, include=FALSE}
-CI_all2 <- readRDS(here(cumulative_CI_vals_dir, "All_pivots_Cumulative_CI_whole_pivot_year.rds")) %>%
-  mutate(#line = substr(pivot, 1 , 1),
-         season = as.factor(season))
-
-pivots_dates <- readRDS(here(harvest_dir, "harvest_data_new")) #%>%
-
-pvt_age <- pivots_dates %>% ungroup() %>% select(pivot, Age) %>% unique()
-CI_all2 <- left_join(CI_all2, pvt_age , by = "pivot") %>% mutate(month =  plyr::round_any((Age/4),2)) %>%
-  mutate(month = case_when(month > 16 ~ 18,
-                           TRUE ~month)) %>%
-  group_by(pivot) %>% filter(Age == max(Age)) %>% ungroup()
-
-CI_all2$season <- ordered(CI_all2$season, levels = c("Data_2021", "Data_2022"))
-# CI_all2 <- CI_all2 %>% mutate(season_order = as.integer(season))
-latest_model <- CI_all2 %>% group_by(pivot) %>% filter(season =="Data_2022")
-# latest_model <- CI_all2 %>% group_by(pivot) %>% arrange(season, desc(DOY)) %>% slice(1)
-
-# CI_all2 <- CI_all %>% group_by(pivot, DOY ) %>% mutate(pivot_cumulative_CI = mean(cumulative_CI))
-# label_data <- CI_all2 %>% group_by(pivot) %>% arrange(season, desc(cumulative_CI)) %>% slice(1)  %>% mutate(label = paste(pivot, " - ", round(cumulative_CI)))
-label_data <- latest_model %>% arrange(season, desc(cumulative_CI)) %>% slice(1)  %>% mutate(label = paste(pivot, " - ", round(cumulative_CI)))
-
-max_day <- label_data %>% group_by(pivot) %>% summarise(max_day = max(DOY))
-
-ggplot(data= CI_all2%>% filter(season =="Data_2022"), aes(DOY, cumulative_CI, col = pivot)) +
-           facet_wrap(~month) +
-           geom_line() +
-           # scale_y_continuous(breaks = seq(0, max(label_data$cumulative_CI) + 100, by = 100)) +
-           labs(title = "Cumulative CI values over the year per pivot, split per 2-month age (rounded down)", x = "Days after harvest/planting (scale is per 2 weeks)", y = "Cumulative CI value",
-                color = "Line") +
-           geom_label_repel(data = label_data %>% filter(model %in% latest_model$model), aes(DOY, cumulative_CI, label = label), box.padding = 1,
-                            # ylim = c(1300, NA),
-                            max.overlaps = Inf
-                                # segment.linetype = 4,
-
-    # segment.curvature = -1e-20,
-    # arrow = arrow(length = unit(0.015, "npc"))
-    ) +
-  theme(legend.position="right", legend.text = element_text(size=8), legend.title = element_text(size = 8),
-        plot.title = element_text(size=19)) +
-  # geom_smooth()+
-  guides(fill = guide_legend(nrow=2,byrow=TRUE)) +
-  scale_y_continuous(breaks=seq(0,max(label_data$cumulative_CI),100)) +
-  scale_x_continuous(breaks=seq(0,max(max_day$max_day),30)) + theme(axis.text.x = element_text(angle = 90)) +
-  labs(x = "Weeks")+
-  theme(legend.position = "none")
-
-```
 # Yield prediction
 The below table shows estimates of the biomass if you would harvest them now.
-```{r eval=FALSE, message=FALSE, warning=FALSE, include=FALSE}
-CI_quadrant <-  readRDS(here(cumulative_CI_vals_dir,"All_pivots_Cumulative_CI_quadrant_year_v2.rds")) %>%
-  rename( pivot_quadrant = field)#All_pivots_Cumulative_CI.rds
-ggplot(CI_quadrant %>% filter(pivot %in% "1.11")) +
-  geom_line(aes(DOY, cumulative_CI, col = as.factor(season))) +
-  facet_wrap(~pivot_quadrant)
+```{r eval=FALSE, message=FALSE, warning=FALSE, include=TRUE}
+CI_quadrant <-  readRDS(here(cumulative_CI_vals_dir,"All_pivots_Cumulative_CI_quadrant_year_v2.rds")) 

-pivots_dates0 <- pivots_dates0 %>% ungroup() %>% unique() %>%
-  dplyr::select(field, sub_field, Tcha_2021, Tcha_2022 )  %>%
-  pivot_longer(cols = c("Tcha_2021", "Tcha_2022"), names_to = "Tcha_Year", values_to = "Tcha") %>%
-  filter(Tcha > 50) %>%
-  mutate(season = as.integer(str_extract(Tcha_Year, "\\d+")))
+harvesting_data <- harvesting_data %>% rename(season = year)

-CI_and_yield <- left_join(CI_quadrant , pivots_dates0, by = c("pivot", "pivot_quadrant", "season")) %>% filter(!is.na(Tcha)) %>%
-  group_by(pivot_quadrant, season) %>% slice(which.max(DOY)) %>%
-  dplyr::select(pivot, pivot_quadrant, Tcha_Year, Tcha, cumulative_CI, DOY, season) %>%
+CI_and_yield <- left_join(CI_quadrant , harvesting_data, by = c("field", "sub_field", "season")) %>% #filter(!is.na(tonnage_ha)) %>%
+  group_by(sub_field, season) %>% slice(which.max(DOY)) %>%
+  dplyr::select(field, sub_field, tonnage_ha, cumulative_CI, DOY, season, sub_area) %>%
  mutate(CI_per_day = cumulative_CI/ DOY)

-ggplot(CI_and_yield) +
-  geom_point(aes(Tcha, CI_per_day, col = Tcha_Year ))
-
-
-set.seed(20)
-CI_and_yield_split <- initial_split(CI_and_yield, prop = 0.75, strata = pivot_quadrant)
-CI_and_yield_test <- training(CI_and_yield_split)
-CI_and_yield_validation <- testing(CI_and_yield_split)
-
-
 predictors <- c(  "cumulative_CI" , "DOY" ,"CI_per_day"  )
-response <- "Tcha"
-CI_and_yield_test <- as.data.frame(CI_and_yield_test)
+response <- "tonnage_ha"
+# CI_and_yield_test <- as.data.frame(CI_and_yield_test)
+CI_and_yield_test <- CI_and_yield %>% as.data.frame() %>% filter(!is.na(tonnage_ha))
+CI_and_yield_validation <- CI_and_yield_test
+prediction_yields <- CI_and_yield %>% as.data.frame() %>% filter(is.na(tonnage_ha))

 ctrl <- trainControl(method="cv",
                     savePredictions = TRUE,
@ -368,46 +306,52 @@ model_ffs_rf <- ffs( CI_and_yield_test[,predictors],
                     na.rm = TRUE
 )

-pred_ffs_rf <-
-  predict(model_ffs_rf, newdata = CI_and_yield_validation) %>%  as.data.frame() %>% rename(predicted_Tcha = ".") %>% mutate(
-    pivot_quadrant = CI_and_yield_validation$pivot_quadrant,
-    pivot = CI_and_yield_validation$pivot,
-    Age_days = CI_and_yield_validation$DOY,
-    total_CI = round(CI_and_yield_validation$cumulative_CI, 0),
+
+# Function to prepare predictions
+prepare_predictions <- function(predictions, newdata) {
+  return(predictions %>%
+           as.data.frame() %>%
+           rename(predicted_Tcha = ".") %>%
+           mutate(sub_field = newdata$sub_field,
+                  field = newdata$field,
+                  Age_days = newdata$DOY,
+                  total_CI = round(newdata$cumulative_CI, 0),
                  predicted_Tcha = round(predicted_Tcha, 0),
-    season = CI_and_yield_validation$season
-  )  %>%  dplyr::select(pivot , pivot_quadrant,  Age_days, total_CI, predicted_Tcha, season) %>%
-  left_join(., CI_and_yield_validation, by = c("pivot",  "pivot_quadrant", "season")) %>%
-  filter(Age_days > 250)
+                  season = newdata$season) %>%
+           dplyr::select(field, sub_field, Age_days, total_CI, predicted_Tcha, season) %>%
+           left_join(., newdata, by = c("field", "sub_field", "season")))
+}
+# Predict yields for the validation dataset
+pred_ffs_rf <- prepare_predictions(predict(model_ffs_rf, newdata = CI_and_yield_validation), CI_and_yield_validation)

-
-
-prediction_2023 <- CI_quadrant  %>% filter(season == "2023")  %>% group_by(pivot_quadrant) %>% slice(which.max(DOY))%>%
-  mutate(CI_per_day = cumulative_CI/ DOY)
-
-pred_rf_2023 <- predict(model_ffs_rf, newdata=prediction_2023) %>%
-  as.data.frame() %>% rename(predicted_Tcha_2023 = ".") %>% mutate(pivot_quadrant = prediction_2023$pivot_quadrant,
-                                                                   pivot = prediction_2023$pivot,
-                                                                   Age_days = prediction_2023$DOY,
-                                                                   total_CI = round(prediction_2023$cumulative_CI,0),
-                                                                   predicted_Tcha_2023 = round(predicted_Tcha_2023, 0)) %>%
+# Predict yields for the current season
+pred_rf_current_season <- prepare_predictions(predict(model_ffs_rf, newdata = prediction_yields), prediction_yields) %>%
  filter(Age_days > 300) %>%
-  dplyr::select(pivot ,pivot_quadrant,  Age_days, total_CI, predicted_Tcha_2023)%>%
-  mutate(CI_per_day = round(total_CI/ Age_days, 1))
-
-
-
+  mutate(CI_per_day = round(total_CI / Age_days, 1))
 ```

-```{r yield_plaatjes, eval=FALSE, include=FALSE}
-ggplot(pred_ffs_rf, aes(y = predicted_Tcha  , x = Tcha , col = pivot )) +
-  geom_point()  +geom_abline() +
-  scale_x_continuous(limits = c(50, 160))+
-  scale_y_continuous(limits = c(50, 160)) +
-  labs(title = "Model trained and tested on historical results - RF")
+```{r yield_plaatjes, eval=FALSE, include=TRUE}
+ggplot(pred_ffs_rf, aes(y = predicted_Tcha, x = tonnage_ha, col = sub_area)) +
+  geom_point(size = 2, alpha = 0.6) +  # Adjust point size and transparency
+  geom_abline(intercept = 0, slope = 1, linetype = "dashed", color = "red") +  # Reference line
+  scale_x_continuous(limits = c(0, 200)) +
+  scale_y_continuous(limits = c(0, 200)) +
+  labs(title = "Model Performance: \nPredicted vs Actual Tonnage/ha",
+       x = "Actual tonnage/ha (Tcha)", 
+       y = "Predicted tonnage/ha (Tcha)") +
+  theme_minimal() 

-ggplot(pred_rf_2023, aes(total_CI , predicted_Tcha_2023 , col = pivot )) +
-  geom_point() + labs(title = "2023 data (still to be harvested) - fields over 300 days old")
+ggplot(pred_rf_current_season, aes(x = Age_days, y = predicted_Tcha, col = field)) +
+  geom_point(size = 2, alpha = 0.6) +  # Adjust point size and transparency
+  labs(title = "Predicted Yields for Fields Over 300 Days \nOld Yet to Be Harvested",
+       x = "Age (days)", 
+       y = "Predicted tonnage/ha (Tcha)") +
+  facet_wrap(~sub_area) +
+  scale_y_continuous(limits = c(0, 200)) +  # Optional: Set limits for y-axis
+  theme_minimal() 

-knitr::kable(pred_rf_2023)
+knitr::kable(pred_rf_current_season, 
+             digits = 0,
+             caption = "Predicted Tonnage/ha for Fields Over 300 Days Old")
 ```
+
--- a/r_app/counting_clouds.R
+++ b/r_app/counting_clouds.R
@ -0,0 +1,91 @@
+# Required packages
+# library(ggplot2)
+# library(dplyr)
+raster_files_NEW <- list.files(merged_final,full.names = T, pattern = ".tif")
+
+# Extracting the dates from vrt_list (assuming the format "YYYY-MM-DD.vrt" at the end)
+no_cloud_dates <- as.Date(sapply(raster_files_NEW, function(x) {
+  sub(".*/([0-9]{4}-[0-9]{2}-[0-9]{2})\\.tif", "\\1", x)
+}))
+
+# Generate a full sequence of dates in the range
+start_date <- min(no_cloud_dates)
+end_date <- max(no_cloud_dates)
+all_dates <- seq(start_date, end_date, by = "day")
+
+# Create a data frame marking no clouds (1) and clouds (0)
+cloud_data <- data.frame(
+  date = all_dates,
+  cloud_status = ifelse(all_dates %in% no_cloud_dates, 1, 0)
+)
+
+# Plot the data
+ggplot(cloud_data, aes(x = date, y = cloud_status)) +
+  geom_point() +
+  labs(x = "Date", y = "Cloud Status (1 = No Cloud, 0 = Cloud)") +
+  scale_y_continuous(breaks = c(0, 1)) +
+  theme_minimal()
+
+# Updated ggplot code to display months on the x-axis
+ggplot(cloud_data, aes(x = date, y = cloud_status)) +
+  geom_point() +
+  scale_x_date(date_labels = "%b", date_breaks = "1 month") +
+  labs(x = "Month", y = "Cloud Status (1 = No Cloud, 0 = Cloud)") +
+  scale_y_continuous(breaks = c(0, 1)) +
+  theme_minimal()
+
+# Group data by year and week
+cloud_data <- cloud_data %>%
+  mutate(week = isoweek(date), year = year(date)) %>%
+  group_by(year, week) %>%
+  summarise(no_cloud_days = sum(cloud_status == 1),
+            cloud_days = sum(cloud_status == 0))
+
+# 1. Show how many weeks per year have no images (clouds for all 7 days)
+weeks_no_images <- cloud_data %>%
+  filter(cloud_days == 7)
+
+# Plot weeks with no images
+ggplot(weeks_no_images, aes(x = week, y = year)) +
+  geom_tile(fill = "red") +
+  labs(x = "Week", y = "Year", title = "Weeks with No Images (Full Cloud Cover)") +
+  theme_minimal()
+
+
+# 2. Determine when most clouds are present (cloud_days > no_cloud_days)
+weeks_most_clouds <- cloud_data %>%
+  filter(cloud_days > no_cloud_days)
+
+# Plot when most clouds are present
+ggplot(weeks_most_clouds, aes(x = week, y = year)) +
+  geom_tile(fill = "blue") +
+  labs(x = "Week", y = "Year", title = "Weeks with Most Clouds") +
+  theme_minimal()
+
+# Group weeks by number of cloud days and count how many weeks had 0-7 cloud days
+weeks_by_cloud_days <- cloud_data %>%
+  group_by(cloud_days) %>%
+  summarise(week_count = n())
+
+# Display the summary
+print(weeks_by_cloud_days)
+
+# Optional: Plot the results to visualise how many weeks had 0-7 cloud days
+ggplot(weeks_by_cloud_days, aes(x = cloud_days, y = week_count)) +
+  geom_bar(stat = "identity", fill = "skyblue") +
+  labs(x = "Number of Cloud Days (per week)", y = "Number of Weeks",
+       title = "Distribution of Cloud Days per Week") +
+  theme_minimal()
+
+weeks_by_no_cloud_days <- cloud_data %>%
+  mutate(no_cloud_days = 7 - cloud_days) %>%
+  group_by(no_cloud_days) %>%
+  summarise(week_count = n())
+
+# Plot the results to visualise how many weeks had 0-7 cloud-free days
+ggplot(weeks_by_no_cloud_days, aes(x = no_cloud_days, y = week_count)) +
+  geom_bar(stat = "identity", fill = "#00A799") +
+  geom_text(aes(label = week_count), vjust = -0.5, size = 4) +  # Add the count of weeks on top of bars
+  labs(x = "Number of Cloud-Free Days (per week)", y = "Number of Weeks",
+       title = "Distribution of Cloud-Free Days per Week") +
+  theme_minimal()
--- a/r_app/report_utils.R
+++ b/r_app/report_utils.R
@ -90,7 +90,7 @@ ci_plot <- function(pivotName){

 cum_ci_plot <- function(pivotName){
  
-  # pivotName = "1.1"
+  # pivotName = "3a13"
  data_ci <- CI_quadrant %>% filter(field == pivotName)
  
  if (nrow(data_ci) == 0) {
@ -125,6 +125,75 @@ cum_ci_plot <- function(pivotName){
  
 }

+cum_ci_plot <- function(pivotName, plot_type = "value", facet_on = TRUE) {
+  
+  # pivotName = "3a13"
+  data_ci <- CI_quadrant %>% filter(field == pivotName)
+  
+  if (nrow(data_ci) == 0) {
+    return(cum_ci_plot2(pivotName))  # Return an empty data frame if no data is found
+  }
+  
+  data_ci2 <- data_ci %>% 
+    mutate(CI_rate = cumulative_CI / DOY,
+           week = week(Date)) %>% 
+    group_by(field) %>%
+    mutate(mean_CIrate_rolling_10_days = rollapplyr(CI_rate, width = 10, FUN = mean, partial = TRUE),
+           mean_rolling_10_days = rollapplyr(value, width = 10, FUN = mean, partial = TRUE))
+  
+  data_ci2 <- data_ci2 %>% mutate(season = as.factor(season))
+  
+  date_preperation_perfect_pivot <- data_ci2 %>% 
+    group_by(season) %>% 
+    summarise(min_date = min(Date),
+              max_date = max(Date),
+              days = max_date - min_date)
+  
+  unique_seasons <- sort(unique(date_preperation_perfect_pivot$season), decreasing = TRUE)[1:3]
+  
+  # Determine the y aesthetic based on the plot type
+  y_aesthetic <- switch(plot_type,
+                        "CI_rate" = "mean_CIrate_rolling_10_days",
+                        "cumulative_CI" = "cumulative_CI",
+                        "value" = "mean_rolling_10_days")
+  
+  y_label <- switch(plot_type,
+                    "CI_rate" = "10-Day Rolling Mean CI Rate (cumulative CI / age)",
+                    "cumulative_CI" = "Cumulative CI",
+                    "value" = "10-Day Rolling Mean CI")
+  
+  if (facet_on) {
+    g <- ggplot(data = data_ci2 %>% filter(season %in% unique_seasons)) +
+      facet_wrap(~season, scales = "free_x") +
+      geom_line(aes_string(x = "Date", y = y_aesthetic, col = "sub_field", group = "sub_field")) +
+      labs(title = paste("Plot of", y_label, "for Pivot", pivotName),
+           color = "Field Name",
+           y = y_label) +
+      scale_x_date(date_breaks = "1 month", date_labels = "%m-%Y") +
+      theme_minimal() +
+      theme(axis.text.x = element_text(angle = 60, hjust = 1),
+            legend.justification = c(1, 0), legend.position = c(1, 0),
+            legend.title = element_text(size = 8),
+            legend.text = element_text(size = 8)) +
+      guides(color = guide_legend(nrow = 2, byrow = TRUE))
+  } else {
+    g <- ggplot(data = data_ci2 %>% filter(season %in% unique_seasons)) +
+      geom_line(aes_string(x = "DOY", y = y_aesthetic, col = "season", group = "season")) +
+      labs(title = paste("Plot of", y_label, "for Pivot", pivotName),
+           color = "Season",
+           y = y_label,
+           x = "Age of Crop (Days)") +
+      theme_minimal() +
+      theme(axis.text.x = element_text(angle = 60, hjust = 1),
+            legend.justification = c(1, 0), legend.position = c(1, 0),
+            legend.title = element_text(size = 8),
+            legend.text = element_text(size = 8)) +
+      guides(color = guide_legend(nrow = 2, byrow = TRUE))
+  }
+  
+  subchunkify(g, 3.2, 10)
+}
+
 cum_ci_plot2 <- function(pivotName){
  end_date <- Sys.Date()
  start_date <- end_date %m-% months(11) # 11 months ago from end_date