Merge pull request #383 from saturncloud/r-parallel
R in Parallel workshop
jnolis authored Jul 13, 2022
2 parents a5cfd72 + d15b0d6 commit d02b2e9
Showing 17 changed files with 386 additions and 1 deletion.
21 changes: 21 additions & 0 deletions examples/workshop-r-parallel/.saturn/saturn.json
@@ -0,0 +1,21 @@
{
"name": "workshop-r-parallel",
"image_uri": "public.ecr.aws/saturncloud/saturn-rstudio:2022.04.01",
"description": "Supporting materials for the R in Parallel workshop",
"working_directory": "/home/jovyan/examples/examples/workshop-r-parallel",
"extra_packages": {
"cran": "future purrr targets furrr shiny"
},
"git_repositories": [
{
"url": "[email protected]:saturncloud/examples.git",
"path": "/home/jovyan/examples"
}
],
"rstudio_server": {
"instance_type": "xlarge",
"disk_space": "10Gi",
"auto_shutoff": "1 hour"
},
"version": "2022.01.06"
}
11 changes: 11 additions & 0 deletions examples/workshop-r-parallel/README.md
@@ -0,0 +1,11 @@
# Workshop: R in Parallel

This workshop covers the basics of using R in parallel. It contains three examples:

* `callr-example` - Use callr to run a task in parallel within a Shiny app.
* `future-example` - Use future and furrr to parallelize a complex task within R.
* `rstudio-job-example` - Run multiple scripts at once using RStudio Jobs.

Each example has its own README.md explaining the problem setup.

You can watch the recording of the presentation on the [Saturn Cloud website](https://saturncloud.io/events/webinar-2022-07-nolis-r-parallel/).
11 changes: 11 additions & 0 deletions examples/workshop-r-parallel/callr-example/README.md
@@ -0,0 +1,11 @@
# callr Example

This example shows how to use callr to run a task in the background of a Shiny app.

Suppose we have a Shiny app to which we want to add a button that starts a long-running
R task. When a user presses the button, the app shouldn't freeze; instead, the task
should run in the background while everything else keeps working. The objective here is
to add code to `app.R` so that the `process_the_data` command runs silently in the
background.

The final app can be found in `app-solutions.R`.
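For reference, here is a minimal standalone sketch of the pattern (the `slow_task` function below is a made-up stand-in, not part of the workshop code): `callr::r_bg()` starts a function in a separate R process and returns a handle immediately, so the calling session keeps going.

```r
# Minimal sketch of callr::r_bg(): run a slow function in a background R process
slow_task <- function(n) {
  Sys.sleep(n) # stand-in for a long-running computation
  n * 2
}

proc <- callr::r_bg(slow_task, args = list(n = 5)) # returns immediately

proc$is_alive()   # TRUE while the background process is still running
proc$wait()       # optionally block until it finishes
proc$get_result() # retrieve the return value once the process is done
```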
50 changes: 50 additions & 0 deletions examples/workshop-r-parallel/callr-example/app-solutions.R
@@ -0,0 +1,50 @@
# This is the RStudio default Shiny app, except we add a button that runs
# a big data-processing task in the background

library(shiny)

process_the_data <- function(url) {
readr::read_csv(url) |>
dplyr::group_by(region) |>
dplyr::summarize(revenue = sum(revenue)) |>
readr::write_csv("aggregate_data.csv")
}

ui <- fluidPage(
titlePanel("Old Faithful Geyser Data"),
sidebarLayout(
sidebarPanel(
sliderInput("bins",
"Number of bins:",
min = 1,
max = 50,
value = 30),
actionButton("process_button", "Process the data")
),

mainPanel(
plotOutput("distPlot"),
textOutput("processStart")
)
)
)

server <- function(input, output) {
output$distPlot <- renderPlot({
x <- faithful[, 2]
bins <- seq(min(x), max(x), length.out = input$bins + 1)
hist(x, breaks = bins, col = "darkgray", border = "white")
})

output$processStart <- renderText({
if (input$process_button > 0) {
url <- "https://saturn-public-data.s3.us-east-2.amazonaws.com/r-parallel/example-data.csv"
callr::r_bg(process_the_data, args = list(url = url))
paste0("Data process started at ", Sys.time())
} else {
""
}
})
}

shinyApp(ui = ui, server = server)
49 changes: 49 additions & 0 deletions examples/workshop-r-parallel/callr-example/app.R
@@ -0,0 +1,49 @@
# This is the RStudio default Shiny app, except we want to add a button that runs
# a big data-processing task in the background

library(shiny)

process_the_data <- function(url) {
readr::read_csv(url) |>
dplyr::group_by(region) |>
dplyr::summarize(revenue = sum(revenue)) |>
readr::write_csv("aggregate_data.csv")
}

ui <- fluidPage(
titlePanel("Old Faithful Geyser Data"),
sidebarLayout(
sidebarPanel(
sliderInput("bins",
"Number of bins:",
min = 1,
max = 50,
value = 30),
actionButton("process_button", "Process the data")
),

mainPanel(
plotOutput("distPlot"),
textOutput("processStart")
)
)
)

server <- function(input, output) {
output$distPlot <- renderPlot({
x <- faithful[, 2]
bins <- seq(min(x), max(x), length.out = input$bins + 1)
hist(x, breaks = bins, col = "darkgray", border = "white")
})

output$processStart <- renderText({
if (input$process_button > 0) {
# WHAT DO WE ADD HERE TO PROCESS DATA IN THE BACKGROUND?
paste0("Data process started at ", Sys.time())
} else {
""
}
})
}

shinyApp(ui = ui, server = server)
11 changes: 11 additions & 0 deletions examples/workshop-r-parallel/future-example/README.md
@@ -0,0 +1,11 @@
# future Example

This example shows how to use future to split up a task within an R session by passing
parts of it to other R sessions. Here we have a large dataset `data` containing retail
transaction data (date, revenue, customer, region, etc.). We want to figure out, for each
customer, how long it was between their first and second purchases. If we just use
standard dplyr, that computation takes about a minute. The goal is to make it run faster
by spreading the computation across multiple processors, first with future and then with
furrr.

The completed example can be found in `future-example-solutions.R`.
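As a rough standalone sketch of the two approaches (the `slow_square` function and the inputs below are made up for illustration; the real code is in the scripts here):

```r
library(future)
library(furrr)

plan(multisession) # run work in separate background R sessions

slow_square <- function(x) {
  Sys.sleep(1) # stand-in for an expensive computation
  x^2
}

# With future: create one future per chunk of work, then collect with value()
futures <- lapply(1:4, function(i) future(slow_square(i)))
results <- lapply(futures, value)

# With furrr: a purrr-style map that spreads the calls across the workers
results_furrr <- future_map(1:4, slow_square)
```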
81 changes: 81 additions & 0 deletions examples/workshop-r-parallel/future-example/future-example-solutions.R
@@ -0,0 +1,81 @@
library(future)
library(furrr)
library(tidyverse)
library(glue)

plan(future::multisession)

url <- "https://saturn-public-data.s3.us-east-2.amazonaws.com/r-parallel/example-data.csv"
data <- read.csv(url)

get_nth_transaction <- function(x, n) {
if (length(x) < n) {
NA
} else {
sort(x)[n]
}
}

aggregate_function <- function(data) {
data %>%
group_by(customer_id, region) %>%
summarize(median_revenue = median(revenue),
num_transactions = n(),
first_transaction = get_nth_transaction(transaction_date, 1),
second_transaction = get_nth_transaction(transaction_date, 2),
.groups = "drop") %>%
mutate(time_between = as.numeric(difftime(second_transaction,
first_transaction,
units = "days")))
}


# Method 1: no parallelization -------------------------------------------------

# expected to take 60 seconds
system.time({
aggregate_data <-
data %>%
aggregate_function()
})

# Method 2: no parallelization, but split it by region first -------------------

region_data <- data %>%
group_by(region) %>%
group_split()

system.time({
aggregate_region_data <- list()
for (i in 1:length(region_data)) {
message(glue("Processing {i}"))
aggregate_region_data[[i]] <- aggregate_function(region_data[[i]])
}
aggregate_data <- bind_rows(aggregate_region_data)
})

# Method 3: split by region and parallelize with future ------------------------

region_data <- data %>%
group_by(region) %>%
group_split()

system.time({
aggregate_region_data <- list()
for (i in 1:length(region_data)) {
message(glue("Creating future {i}"))
aggregate_region_data[[i]] <- future({
aggregate_function(region_data[[i]])
})
}
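# value() on a list of futures blocks until each one resolves, then returns their results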
aggregate_data <- bind_rows(value(aggregate_region_data))
})

# Method 4: split by region and use furrr -------------------------------------

system.time({
region_data <- data %>%
group_by(region) %>%
group_split() %>%
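# future_map_dfr() maps aggregate_function over the regions in parallel and row-binds the results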
future_map_dfr(aggregate_function)
})
71 changes: 71 additions & 0 deletions examples/workshop-r-parallel/future-example/future-example.R
@@ -0,0 +1,71 @@
library(future)
library(furrr)
library(tidyverse)
library(glue)

plan(future::multisession)

url <- "https://saturn-public-data.s3.us-east-2.amazonaws.com/r-parallel/example-data.csv"
data <- read.csv(url)

get_nth_transaction <- function(x, n) {
if (length(x) < n) {
NA
} else {
sort(x)[n]
}
}

aggregate_function <- function(data) {
data %>%
group_by(customer_id, region) %>%
summarize(median_revenue = median(revenue),
num_transactions = n(),
first_transaction = get_nth_transaction(transaction_date, 1),
second_transaction = get_nth_transaction(transaction_date, 2),
.groups = "drop") %>%
mutate(time_between = as.numeric(difftime(second_transaction,
first_transaction,
units = "days")))
}


# Method 1: no parallelization -------------------------------------------------

# expected to take 60 seconds
system.time({
aggregate_data <-
data %>%
aggregate_function()
})

# Method 2: no parallelization, but split it by region first -------------------

region_data <- data %>%
group_by(region) %>%
group_split()

system.time({
aggregate_region_data <- list()
for (i in 1:length(region_data)) {
message(glue("Processing {i}"))
aggregate_region_data[[i]] <- aggregate_function(region_data[[i]])
}
aggregate_data <- bind_rows(aggregate_region_data)
})

# Method 3: split by region and parallelize with future ------------------------

region_data <- data %>%
group_by(region) %>%
group_split()

system.time({
# HOW CAN WE PROCESS THE DIFFERENT REGIONS IN PARALLEL WITH FUTURE?
})

# Method 4: split by region and use furrr -------------------------------------

system.time({
# HOW CAN WE PROCESS THE DIFFERENT REGIONS IN PARALLEL WITH FURRR?
})
12 changes: 12 additions & 0 deletions examples/workshop-r-parallel/rstudio-job-example/README.md
@@ -0,0 +1,12 @@
# RStudio Job Example

This example shows how to use RStudio Jobs to run ad hoc tasks in the background on request.


Suppose we have a long-running report, `complex-report.Rmd`, and we want to render it with several different
parameter sets, in this case with `frac` taking a value of 0.1, 0.5, or 1.0. We can write a script to generate
each version of the report (`generate-report-xx.R`, where `xx` is the value), but rather than running each
script one at a time, let's run them in parallel with RStudio Jobs. Use `rstudio-job-example.R` as a script
that starts a job for each report.

_The solution can be found in `rstudio-job-example-solutions.R`._
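The key function is `rstudioapi::jobRunScript()`, which only works from inside RStudio. A rough sketch (the `name` label here is optional and made up):

```r
# Sketch: launch one script as an RStudio background job
rstudioapi::jobRunScript(
  "rstudio-job-example/generate-report-0.1.R", # script to run as a job
  name = "report 0.1",                         # label shown in the Jobs pane
  workingDir = here::here()                    # directory the job runs in
)
```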
40 changes: 40 additions & 0 deletions examples/workshop-r-parallel/rstudio-job-example/complex-report.Rmd
@@ -0,0 +1,40 @@
---
title: "Complex Report"
output: html_document
params:
frac: 1
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
```

```{r load_libraries}
library(tidyverse)
```

```{r set_params}
frac_of_data <- params$frac
```

```{r load_data}
url <- "https://saturn-public-data.s3.us-east-2.amazonaws.com/r-parallel/example-data.csv"
data <- read_csv(url,
show_col_types = FALSE) %>%
sample_frac(frac_of_data)
```

```{r show_table, results = "asis"}
data |>
group_by(region) |>
summarize(revenue = sum(revenue)) |>
knitr::kable()
```

```{r show_plot}
data |>
mutate(transaction_date = as.Date(transaction_date)) |>
group_by(transaction_date, region) |>
summarize(revenue = sum(revenue), .groups = "drop") |>
ggplot(aes(x = transaction_date, y = revenue)) + geom_line()
```
3 changes: 3 additions & 0 deletions examples/workshop-r-parallel/rstudio-job-example/generate-report-0.1.R
@@ -0,0 +1,3 @@
rmarkdown::render("rstudio-job-example/complex-report.Rmd",
output_file = "results-0.1.html",
params = list(frac = 0.1))
3 changes: 3 additions & 0 deletions examples/workshop-r-parallel/rstudio-job-example/generate-report-0.5.R
@@ -0,0 +1,3 @@
rmarkdown::render("rstudio-job-example/complex-report.Rmd",
output_file = "results-0.5.html",
params = list(frac = 0.5))
3 changes: 3 additions & 0 deletions examples/workshop-r-parallel/rstudio-job-example/generate-report-1.0.R
@@ -0,0 +1,3 @@
rmarkdown::render("rstudio-job-example/complex-report.Rmd",
output_file = "results-1.0.html",
params = list(frac = 1.0))
3 changes: 3 additions & 0 deletions examples/workshop-r-parallel/rstudio-job-example/rstudio-job-example-solutions.R
@@ -0,0 +1,3 @@
rstudioapi::jobRunScript("rstudio-job-example/generate-report-0.1.R", workingDir = here::here())
rstudioapi::jobRunScript("rstudio-job-example/generate-report-0.5.R", workingDir = here::here())
rstudioapi::jobRunScript("rstudio-job-example/generate-report-1.0.R", workingDir = here::here())
1 change: 1 addition & 0 deletions examples/workshop-r-parallel/rstudio-job-example/rstudio-job-example.R
@@ -0,0 +1 @@
# HOW CAN WE RUN 3 JOBS TO GENERATE THE THREE DIFFERENT REPORTS?