tidy_bernoulli() with {data.table}

code
rtip
datatable
Author

Steven P. Sanderson II, MPH

Published

March 7, 2023

Introduction

So I was challanged by Adrian Antico to learn data.table, so yesterday I started with a single function from my package {TidyDensity} called tidy_bernoulli().

So let’s see how I did (hint, works but needs a lot of improvement, so I’ll learn it.)

Function

Let’s see the function in data.table

library(data.table)
library(tidyr)
library(stats)
library(purrr)

new_func <- function(num_sims, n, pr) {

  # Create a data.table with one row per simulation
  sim_data <- data.table(sim_number = factor(seq(1, num_sims, 1)))
  
  # Group the data by sim_number and add columns for x and y
  sim_data[, `:=` (
    x = list(1:n),
    y = list(stats::rbinom(n = n, size = 1, prob = pr))
  ), by = sim_number]
  
  # Compute the density of the y values and add columns for dx and dy
  sim_data[, `:=` (
    d = list(density(unlist(y), n = n)[c("x", "y")] |>
      set_names("dx", "dy") |>
      as_tibble())
  ), by = sim_number]
  
  # Compute the p-values for the y values and add a column for p
  sim_data[, `:=` (
    p = list(stats::pbinom(unlist(y), size = 1, prob = pr))
  ), by = sim_number]
  
  # Compute the q-values for the p-values and add a column for q
  sim_data[, `:=` (
    q = list(stats::qbinom(unlist(p), size = 1, prob = pr))
  ), by = sim_number]
  
  # Unnest the columns for x, y, d, p, and q
  sim_data <- sim_data[, 
                       unnest(
                         .SD, 
                         cols = c("x", "y", "d", "p", "q")
                         ), 
                       by = sim_number]
  
  # Remove the grouping
  sim_data[, sim_number := as.factor(sim_number)]
  
  return(sim_data)
}

Example

Now, let’s see the output of the original function tidy_bernoulli() and new_func().

library(TidyDensity)
n <- 50
pr <- 0.1
sims <- 5

set.seed(123)
tb <- tidy_bernoulli(.n = n, .prob = pr, .num_sims = sims)

set.seed(123)
nf <- new_func(n = n, num_sims = sims, pr = pr)

print(tb)
# A tibble: 250 × 7
   sim_number     x     y      dx     dy     p     q
   <fct>      <int> <int>   <dbl>  <dbl> <dbl> <dbl>
 1 1              1     0 -0.405  0.0292   0.9     0
 2 1              2     0 -0.368  0.0637   0.9     0
 3 1              3     0 -0.331  0.129    0.9     0
 4 1              4     0 -0.294  0.243    0.9     0
 5 1              5     1 -0.258  0.424    1       1
 6 1              6     0 -0.221  0.688    0.9     0
 7 1              7     0 -0.184  1.03     0.9     0
 8 1              8     0 -0.147  1.44     0.9     0
 9 1              9     0 -0.110  1.87     0.9     0
10 1             10     0 -0.0727 2.25     0.9     0
# ℹ 240 more rows
print(nf)
     sim_number  x y         dx          dy   p q
  1:          1  1 0 -0.4053113 0.029196114 0.9 0
  2:          1  2 0 -0.3683598 0.063683226 0.9 0
  3:          1  3 0 -0.3314083 0.129227066 0.9 0
  4:          1  4 0 -0.2944568 0.242967496 0.9 0
  5:          1  5 1 -0.2575054 0.424395426 1.0 1
 ---                                             
246:          5 46 0  1.2575054 0.057872104 0.9 0
247:          5 47 0  1.2944568 0.033131931 0.9 0
248:          5 48 1  1.3314083 0.017621873 1.0 1
249:          5 49 1  1.3683598 0.008684076 1.0 1
250:          5 50 0  1.4053113 0.003981288 0.9 0

Ok so at least the output is identical which is a good sign. Now let’s benchmark the two solutions.

library(rbenchmark)
library(dplyr)

benchmark(
  "original" = {
    tidy_bernoulli(.n = n, .prob = pr, .num_sims = sims)
  },
  "data.table" = {
    new_func(n = n, pr = pr, num_sims = sims)
  },
  replications = 100,
  columns = c("test","replications","elapsed","relative","user.self","sys.self"  )
) |>
  arrange(relative)
        test replications elapsed relative user.self sys.self
1   original          100    3.18    1.000      2.89     0.12
2 data.table          100    4.89    1.538      4.49     0.11

Yeah, needs some work but it’s a start.