Refactor the cum_ci_plot function for enhanced plotting options: add parameters for plot type and season filtering, improve x-axis labeling, and update ggplot aesthetics. Also continue work on yield prediction in R.

2024-10-29 10:01:37 +01:00 · 2024-10-29 10:01:37 +01:00 · c1a32bf6ad
parent ddb4fdbc02
commit c1a32bf6ad
4 changed files with 214 additions and 110 deletions
--- a/laravel_app/tests/fixtures/Muhoroni/harvest.xlsx
+++ b/laravel_app/tests/fixtures/Muhoroni/harvest.xlsx
--- a/r_app/CI_report_dashboard_planet.Rmd
+++ b/r_app/CI_report_dashboard_planet.Rmd
@ -53,6 +53,7 @@ library(randomForest)
 library(CAST)
 source("report_utils.R")
 # source(here("r_app", "report_utils.R"))
 ```
@ -269,87 +270,24 @@ print(" PRINT")
 }
 ```
 ```{r eval=FALSE, fig.height=10, fig.width=14, include=FALSE}
 CI_all2 <- readRDS(here(cumulative_CI_vals_dir, "All_pivots_Cumulative_CI_whole_pivot_year.rds")) %>%
  mutate(#line = substr(pivot, 1 , 1),
         season = as.factor(season))
 pivots_dates <- readRDS(here(harvest_dir, "harvest_data_new")) #%>%
 pvt_age <- pivots_dates %>% ungroup() %>% select(pivot, Age) %>% unique()
 CI_all2 <- left_join(CI_all2, pvt_age , by = "pivot") %>% mutate(month =  plyr::round_any((Age/4),2)) %>%
  mutate(month = case_when(month > 16 ~ 18,
                           TRUE ~month)) %>%
  group_by(pivot) %>% filter(Age == max(Age)) %>% ungroup()
 CI_all2$season <- ordered(CI_all2$season, levels = c("Data_2021", "Data_2022"))
 # CI_all2 <- CI_all2 %>% mutate(season_order = as.integer(season))
 latest_model <- CI_all2 %>% group_by(pivot) %>% filter(season =="Data_2022")
 # latest_model <- CI_all2 %>% group_by(pivot) %>% arrange(season, desc(DOY)) %>% slice(1)
 # CI_all2 <- CI_all %>% group_by(pivot, DOY ) %>% mutate(pivot_cumulative_CI = mean(cumulative_CI))
 # label_data <- CI_all2 %>% group_by(pivot) %>% arrange(season, desc(cumulative_CI)) %>% slice(1)  %>% mutate(label = paste(pivot, " - ", round(cumulative_CI)))
 label_data <- latest_model %>% arrange(season, desc(cumulative_CI)) %>% slice(1)  %>% mutate(label = paste(pivot, " - ", round(cumulative_CI)))
 max_day <- label_data %>% group_by(pivot) %>% summarise(max_day = max(DOY))
 ggplot(data= CI_all2%>% filter(season =="Data_2022"), aes(DOY, cumulative_CI, col = pivot)) +
           facet_wrap(~month) +
           geom_line() +
           # scale_y_continuous(breaks = seq(0, max(label_data$cumulative_CI) + 100, by = 100)) +
           labs(title = "Cumulative CI values over the year per pivot, split per 2-month age (rounded down)", x = "Days after harvest/planting (scale is per 2 weeks)", y = "Cumulative CI value",
                color = "Line") +
           geom_label_repel(data = label_data %>% filter(model %in% latest_model$model), aes(DOY, cumulative_CI, label = label), box.padding = 1,
                            # ylim = c(1300, NA),
                            max.overlaps = Inf
                                # segment.linetype = 4,
    # segment.curvature = -1e-20,
    # arrow = arrow(length = unit(0.015, "npc"))
    ) +
  theme(legend.position="right", legend.text = element_text(size=8), legend.title = element_text(size = 8),
        plot.title = element_text(size=19)) +
  # geom_smooth()+
  guides(fill = guide_legend(nrow=2,byrow=TRUE)) +
  scale_y_continuous(breaks=seq(0,max(label_data$cumulative_CI),100)) +
  scale_x_continuous(breaks=seq(0,max(max_day$max_day),30)) + theme(axis.text.x = element_text(angle = 90)) +
  labs(x = "Weeks")+
  theme(legend.position = "none")
 ```
 # Yield prediction
 The table below shows the estimated biomass of each field if it were harvested now.
-```{r eval=FALSE, message=FALSE, warning=FALSE, include=FALSE}
+```{r eval=FALSE, message=FALSE, warning=FALSE, include=TRUE}
-CI_quadrant <-  readRDS(here(cumulative_CI_vals_dir,"All_pivots_Cumulative_CI_quadrant_year_v2.rds")) %>%
+CI_quadrant <-  readRDS(here(cumulative_CI_vals_dir,"All_pivots_Cumulative_CI_quadrant_year_v2.rds")) 
  rename( pivot_quadrant = field)#All_pivots_Cumulative_CI.rds
 ggplot(CI_quadrant %>% filter(pivot %in% "1.11")) +
  geom_line(aes(DOY, cumulative_CI, col = as.factor(season))) +
  facet_wrap(~pivot_quadrant)
-pivots_dates0 <- pivots_dates0 %>% ungroup() %>% unique() %>%
+harvesting_data <- harvesting_data %>% rename(season = year)
  dplyr::select(field, sub_field, Tcha_2021, Tcha_2022 )  %>%
  pivot_longer(cols = c("Tcha_2021", "Tcha_2022"), names_to = "Tcha_Year", values_to = "Tcha") %>%
  filter(Tcha > 50) %>%
  mutate(season = as.integer(str_extract(Tcha_Year, "\\d+")))
-CI_and_yield <- left_join(CI_quadrant , pivots_dates0, by = c("pivot", "pivot_quadrant", "season")) %>% filter(!is.na(Tcha)) %>%
+CI_and_yield <- left_join(CI_quadrant , harvesting_data, by = c("field", "sub_field", "season")) %>% #filter(!is.na(tonnage_ha)) %>%
-  group_by(pivot_quadrant, season) %>% slice(which.max(DOY)) %>%
+  group_by(sub_field, season) %>% slice(which.max(DOY)) %>%
-  dplyr::select(pivot, pivot_quadrant, Tcha_Year, Tcha, cumulative_CI, DOY, season) %>%
+  dplyr::select(field, sub_field, tonnage_ha, cumulative_CI, DOY, season, sub_area) %>%
  mutate(CI_per_day = cumulative_CI/ DOY)
 ggplot(CI_and_yield) +
  geom_point(aes(Tcha, CI_per_day, col = Tcha_Year ))
 set.seed(20)
 CI_and_yield_split <- initial_split(CI_and_yield, prop = 0.75, strata = pivot_quadrant)
 CI_and_yield_test <- training(CI_and_yield_split)
 CI_and_yield_validation <- testing(CI_and_yield_split)
 predictors <- c(  "cumulative_CI" , "DOY" ,"CI_per_day"  )
-response <- "Tcha"
+response <- "tonnage_ha"
-CI_and_yield_test <- as.data.frame(CI_and_yield_test)
+# CI_and_yield_test <- as.data.frame(CI_and_yield_test)
 CI_and_yield_test <- CI_and_yield %>% as.data.frame() %>% filter(!is.na(tonnage_ha))
 CI_and_yield_validation <- CI_and_yield_test
 prediction_yields <- CI_and_yield %>% as.data.frame() %>% filter(is.na(tonnage_ha))
 ctrl <- trainControl(method="cv",
                     savePredictions = TRUE,
@ -368,46 +306,52 @@ model_ffs_rf <- ffs( CI_and_yield_test[,predictors],
                     na.rm = TRUE
 )
 pred_ffs_rf <-
  predict(model_ffs_rf, newdata = CI_and_yield_validation) %>%  as.data.frame() %>% rename(predicted_Tcha = ".") %>% mutate(
    pivot_quadrant = CI_and_yield_validation$pivot_quadrant,
    pivot = CI_and_yield_validation$pivot,
    Age_days = CI_and_yield_validation$DOY,
    total_CI = round(CI_and_yield_validation$cumulative_CI, 0),
    predicted_Tcha = round(predicted_Tcha, 0),
    season = CI_and_yield_validation$season
  )  %>%  dplyr::select(pivot , pivot_quadrant,  Age_days, total_CI, predicted_Tcha, season) %>%
  left_join(., CI_and_yield_validation, by = c("pivot",  "pivot_quadrant", "season")) %>%
  filter(Age_days > 250)
 # Shape raw model predictions into a tidy table and attach the source rows.
 #
 # predictions: output of predict(); piped through as.data.frame(), which
 #              yields a single column named "." that is renamed to
 #              predicted_Tcha.
 # newdata:     the data frame the predictions were made for; its sub_field,
 #              field, DOY, cumulative_CI and season columns are copied over
 #              row by row, so it must be in the same row order as
 #              `predictions`.
 #
 # Returns the prediction table (field, sub_field, Age_days, total_CI,
 # predicted_Tcha, season) left-joined with `newdata` on field, sub_field and
 # season.
 prepare_predictions <- function(predictions, newdata) {
  join_keys <- c("field", "sub_field", "season")
  # The magrittr pipe passes the vector as `.`, hence the "." column name.
  pred_tbl <- predictions %>%
    as.data.frame() %>%
    rename(predicted_Tcha = ".")
  pred_tbl <- mutate(
    pred_tbl,
    sub_field = newdata$sub_field,
    field = newdata$field,
    Age_days = newdata$DOY,
    total_CI = round(newdata$cumulative_CI, 0),
    predicted_Tcha = round(predicted_Tcha, 0),
    season = newdata$season
  )
  pred_tbl <- dplyr::select(
    pred_tbl,
    field, sub_field, Age_days, total_CI, predicted_Tcha, season
  )
  left_join(pred_tbl, newdata, by = join_keys)
 }
 # Predict yields for the validation dataset
 pred_ffs_rf <- prepare_predictions(predict(model_ffs_rf, newdata = CI_and_yield_validation), CI_and_yield_validation)
-
+# Predict yields for the current season
-prediction_2023 <- CI_quadrant  %>% filter(season == "2023")  %>% group_by(pivot_quadrant) %>% slice(which.max(DOY))%>%
+pred_rf_current_season <- prepare_predictions(predict(model_ffs_rf, newdata = prediction_yields), prediction_yields) %>%
  mutate(CI_per_day = cumulative_CI/ DOY)
 pred_rf_2023 <- predict(model_ffs_rf, newdata=prediction_2023) %>%
  as.data.frame() %>% rename(predicted_Tcha_2023 = ".") %>% mutate(pivot_quadrant = prediction_2023$pivot_quadrant,
                                                                   pivot = prediction_2023$pivot,
                                                                   Age_days = prediction_2023$DOY,
                                                                   total_CI = round(prediction_2023$cumulative_CI,0),
                                                                   predicted_Tcha_2023 = round(predicted_Tcha_2023, 0)) %>%
  filter(Age_days > 300) %>%
-  dplyr::select(pivot ,pivot_quadrant,  Age_days, total_CI, predicted_Tcha_2023)%>%
+  mutate(CI_per_day = round(total_CI / Age_days, 1))
  mutate(CI_per_day = round(total_CI/ Age_days, 1))
 ```
-```{r yield_plaatjes, eval=FALSE, include=FALSE}
+```{r yield_plaatjes, eval=FALSE, include=TRUE}
-ggplot(pred_ffs_rf, aes(y = predicted_Tcha  , x = Tcha , col = pivot )) +
+ggplot(pred_ffs_rf, aes(y = predicted_Tcha, x = tonnage_ha, col = sub_area)) +
-  geom_point()  +geom_abline() +
+  geom_point(size = 2, alpha = 0.6) +  # Adjust point size and transparency
-  scale_x_continuous(limits = c(50, 160))+
+  geom_abline(intercept = 0, slope = 1, linetype = "dashed", color = "red") +  # Reference line
-  scale_y_continuous(limits = c(50, 160)) +
+  scale_x_continuous(limits = c(0, 200)) +
-  labs(title = "Model trained and tested on historical results - RF")
+  scale_y_continuous(limits = c(0, 200)) +
  labs(title = "Model Performance: \nPredicted vs Actual Tonnage/ha",
       x = "Actual tonnage/ha (Tcha)", 
       y = "Predicted tonnage/ha (Tcha)") +
  theme_minimal() 
-ggplot(pred_rf_2023, aes(total_CI , predicted_Tcha_2023 , col = pivot )) +
+ggplot(pred_rf_current_season, aes(x = Age_days, y = predicted_Tcha, col = field)) +
-  geom_point() + labs(title = "2023 data (still to be harvested) - fields over 300 days old")
+  geom_point(size = 2, alpha = 0.6) +  # Adjust point size and transparency
  labs(title = "Predicted Yields for Fields Over 300 Days \nOld Yet to Be Harvested",
       x = "Age (days)", 
       y = "Predicted tonnage/ha (Tcha)") +
  facet_wrap(~sub_area) +
  scale_y_continuous(limits = c(0, 200)) +  # Optional: Set limits for y-axis
  theme_minimal() 
-knitr::kable(pred_rf_2023)
+knitr::kable(pred_rf_current_season, 
             digits = 0,
             caption = "Predicted Tonnage/ha for Fields Over 300 Days Old")
 ```
--- a/r_app/counting_clouds.R
+++ b/r_app/counting_clouds.R
@ -0,0 +1,91 @@
 # Counts, per ISO week, how many days have a usable (cloud-free) image and
 # how many do not, based on the dated .tif files found in `merged_final`
 # (a directory path defined elsewhere), and plots several summaries.
 #
 # Required packages (expected to be attached by the calling session;
 # isoweek() and isoyear() come from lubridate):
 # library(ggplot2)
 # library(dplyr)
 # library(lubridate)
 #
 # The pattern is a regular expression: escape the dot and anchor the end so
 # sidecar files such as "2024-01-01.tif.aux.xml" are not picked up.
 raster_files_NEW <- list.files(merged_final, full.names = TRUE, pattern = "\\.tif$")
 # Extract the dates from the file names (expected format "YYYY-MM-DD.tif");
 # files that do not follow this naming are ignored.
 date_file_pattern <- "^([0-9]{4}-[0-9]{2}-[0-9]{2})\\.tif$"
 raster_file_names <- basename(raster_files_NEW)
 has_date_name <- grepl(date_file_pattern, raster_file_names)
 no_cloud_dates <- as.Date(sub(date_file_pattern, "\\1", raster_file_names[has_date_name]))
 # Without any dated file min()/max()/seq() below would fail obscurely.
 if (length(no_cloud_dates) == 0) {
   stop("No files named YYYY-MM-DD.tif found in ", merged_final, call. = FALSE)
 }
 # Generate a full sequence of dates in the range
 start_date <- min(no_cloud_dates)
 end_date <- max(no_cloud_dates)
 all_dates <- seq(start_date, end_date, by = "day")
 # Create a data frame marking no clouds (1) and clouds (0)
 cloud_data <- data.frame(
   date = all_dates,
   cloud_status = ifelse(all_dates %in% no_cloud_dates, 1, 0)
 )
 # Plot the daily status
 ggplot(cloud_data, aes(x = date, y = cloud_status)) +
   geom_point() +
   labs(x = "Date", y = "Cloud Status (1 = No Cloud, 0 = Cloud)") +
   scale_y_continuous(breaks = c(0, 1)) +
   theme_minimal()
 # Same plot with one x-axis break per month
 ggplot(cloud_data, aes(x = date, y = cloud_status)) +
   geom_point() +
   scale_x_date(date_labels = "%b", date_breaks = "1 month") +
   labs(x = "Month", y = "Cloud Status (1 = No Cloud, 0 = Cloud)") +
   scale_y_continuous(breaks = c(0, 1)) +
   theme_minimal()
 # Group data by year and week. ISO weeks must be paired with the ISO year:
 # with the calendar year, days around New Year (e.g. 30 December falling in
 # ISO week 1) would be merged into the wrong week and a "week" could hold
 # more than 7 days.
 cloud_data <- cloud_data %>%
   mutate(week = isoweek(date), year = isoyear(date)) %>%
   group_by(year, week) %>%
   summarise(no_cloud_days = sum(cloud_status == 1),
             cloud_days = sum(cloud_status == 0)) %>%
   ungroup()
 # 1. Show how many weeks per year have no images (clouds for all 7 days)
 weeks_no_images <- cloud_data %>%
   filter(cloud_days == 7)
 # Plot weeks with no images
 ggplot(weeks_no_images, aes(x = week, y = year)) +
   geom_tile(fill = "red") +
   labs(x = "Week", y = "Year", title = "Weeks with No Images (Full Cloud Cover)") +
   theme_minimal()
 # 2. Determine when most clouds are present (cloud_days > no_cloud_days)
 weeks_most_clouds <- cloud_data %>%
   filter(cloud_days > no_cloud_days)
 # Plot when most clouds are present
 ggplot(weeks_most_clouds, aes(x = week, y = year)) +
   geom_tile(fill = "blue") +
   labs(x = "Week", y = "Year", title = "Weeks with Most Clouds") +
   theme_minimal()
 # Group weeks by number of cloud days and count how many weeks had 0-7 cloud days
 weeks_by_cloud_days <- cloud_data %>%
   group_by(cloud_days) %>%
   summarise(week_count = n())
 # Display the summary
 print(weeks_by_cloud_days)
 # Optional: Plot the results to visualise how many weeks had 0-7 cloud days
 ggplot(weeks_by_cloud_days, aes(x = cloud_days, y = week_count)) +
   geom_col(fill = "skyblue") +
   labs(x = "Number of Cloud Days (per week)", y = "Number of Weeks",
        title = "Distribution of Cloud Days per Week") +
   theme_minimal()
 # Count weeks by their number of cloud-free days. Use the counted
 # no_cloud_days column rather than 7 - cloud_days: the first and last week of
 # the date range can be partial, so 7 - cloud_days would overstate them.
 weeks_by_no_cloud_days <- cloud_data %>%
   group_by(no_cloud_days) %>%
   summarise(week_count = n())
 # Plot the results to visualise how many weeks had 0-7 cloud-free days
 ggplot(weeks_by_no_cloud_days, aes(x = no_cloud_days, y = week_count)) +
   geom_col(fill = "#00A799") +
   geom_text(aes(label = week_count), vjust = -0.5, size = 4) +  # Add the count of weeks on top of bars
   labs(x = "Number of Cloud-Free Days (per week)", y = "Number of Weeks",
        title = "Distribution of Cloud-Free Days per Week") +
   theme_minimal()
--- a/r_app/report_utils.R
+++ b/r_app/report_utils.R
@ -90,7 +90,7 @@ ci_plot <- function(pivotName){
 cum_ci_plot <- function(pivotName){
-  # pivotName = "1.1"
+  # pivotName = "3a13"
  data_ci <- CI_quadrant %>% filter(field == pivotName)
  if (nrow(data_ci) == 0) {
@ -125,6 +125,75 @@ cum_ci_plot <- function(pivotName){
 }
 #' Plot the CI development of one field for its most recent seasons.
 #'
 #' Reads the global `CI_quadrant` data frame, keeps the rows whose `field`
 #' equals `pivotName`, and draws a line plot of the selected metric for the
 #' (up to) three latest seasons. The plot is handed to `subchunkify()`.
 #'
 #' @param pivotName Value matched against `CI_quadrant$field`.
 #' @param plot_type Metric on the y-axis: `"value"` (rolling mean of `value`),
 #'   `"CI_rate"` (rolling mean of `cumulative_CI / DOY`) or
 #'   `"cumulative_CI"` (the raw `cumulative_CI` column).
 #' @param facet_on If `TRUE`, one facet per season with `Date` on the x-axis
 #'   and one line per `sub_field`; otherwise a single panel with `DOY` on the
 #'   x-axis and one line per season.
 #' @return The result of `subchunkify(g, 3.2, 10)`, or the result of
 #'   `cum_ci_plot2(pivotName)` when `CI_quadrant` has no rows for the field.
 cum_ci_plot <- function(pivotName, plot_type = "value", facet_on = TRUE) {
  # pivotName = "3a13"
  data_ci <- CI_quadrant %>% filter(field == pivotName)
  if (nrow(data_ci) == 0) {
    # No CI data for this field: delegate to the fallback plot.
    return(cum_ci_plot2(pivotName))
  }
  # Fail with a clear message on an unknown plot type instead of an obscure
  # "missing aesthetic" error from ggplot later on.
  plot_type <- match.arg(plot_type, c("value", "CI_rate", "cumulative_CI"))
  metric <- switch(plot_type,
    value = list(column = "mean_rolling_10_days",
                 label = "10-Day Rolling Mean CI"),
    CI_rate = list(column = "mean_CIrate_rolling_10_days",
                   label = "10-Day Rolling Mean CI Rate (cumulative CI / age)"),
    cumulative_CI = list(column = "cumulative_CI",
                         label = "Cumulative CI")
  )
  # NOTE(review): the rolling window spans 10 consecutive rows of the field
  # in their current order; this is a 10-day mean per line only if the rows
  # hold one observation per day and are not interleaved across sub_fields or
  # seasons - confirm against the layout of CI_quadrant.
  data_ci2 <- data_ci %>%
    mutate(CI_rate = cumulative_CI / DOY) %>%
    group_by(field) %>%
    mutate(mean_CIrate_rolling_10_days = rollapplyr(CI_rate, width = 10, FUN = mean, partial = TRUE),
           mean_rolling_10_days = rollapplyr(value, width = 10, FUN = mean, partial = TRUE)) %>%
    ungroup() %>%
    mutate(season = as.factor(season))
  # Latest three seasons; head() avoids the NA padding that `[1:3]` produces
  # when fewer than three seasons are available.
  unique_seasons <- head(sort(unique(data_ci2$season), decreasing = TRUE), 3)
  plot_data <- data_ci2 %>% filter(season %in% unique_seasons)
  plot_title <- paste("Plot of", metric$label, "for Pivot", pivotName)
  if (facet_on) {
    g <- ggplot(data = plot_data) +
      facet_wrap(~season, scales = "free_x") +
      geom_line(aes(x = Date, y = .data[[metric$column]],
                    col = sub_field, group = sub_field)) +
      labs(title = plot_title,
           color = "Field Name",
           y = metric$label) +
      scale_x_date(date_breaks = "1 month", date_labels = "%m-%Y")
  } else {
    g <- ggplot(data = plot_data) +
      geom_line(aes(x = DOY, y = .data[[metric$column]],
                    col = season, group = season)) +
      labs(title = plot_title,
           color = "Season",
           y = metric$label,
           x = "Age of Crop (Days)")
  }
  # Styling shared by both layouts; theme_minimal() must precede theme() so
  # the overrides are not reset.
  g <- g +
    theme_minimal() +
    theme(axis.text.x = element_text(angle = 60, hjust = 1),
          legend.justification = c(1, 0), legend.position = c(1, 0),
          legend.title = element_text(size = 8),
          legend.text = element_text(size = 8)) +
    guides(color = guide_legend(nrow = 2, byrow = TRUE))
  subchunkify(g, 3.2, 10)
 }
 cum_ci_plot2 <- function(pivotName){
  end_date <- Sys.Date()
  start_date <- end_date %m-% months(11) # 11 months ago from end_date