Parallel Computing Examples Using Rcurvep

Set up the packages

library(future)
library(furrr)
library(dplyr)
library(purrr)
#library(microbenchmark) # it is needed if recalculating the comparison
library(Rcurvep)

datasets from Rcurvep package

data("zfishdev_all") # two endpoints, each endpoint includes 32 chemicals/curves
data("zfishdev_act") # simulated curves based on zfishdev_all, each chemical has 100 simulated curves
data("zfishdev") # four endpoints, each endpoint includes 3 chemicals/curves

When no preferred BMR and would like to do a exhaustive search

This is a computationally intensive procedure. n_sample = 100 is preferred but for demonstration, n_sample = 10 is used. Expressions are used to delay the execution of commands.

In combi_run_rcurvep function, future_pmap function from furrr package is embedded. Use future::plan() to control the types of calculation.


# sequential run
seq_run_multi <- expression(
  
  # set up the plan
  future::plan(sequential),
  
  # calculation
  combi_run_rcurvep(
    zfishdev_all, 
    n_sample = 10,
    TRSH = seq(5, 95, by = 5),
    RNGE = 1000000,
    keep_sets = "act_set",
    seed = 300
  )
)


# parallel run
par_run_multi <- expression(
  
  # set up the plan
  future::plan(multisession, workers = 10),
  
  # calculation
  combi_run_rcurvep(
    zfishdev_all, 
    n_sample = 10,
    TRSH = seq(5, 95, by = 5),
    RNGE = 1000000,
    keep_sets = "act_set",
    seed = 300
  ),
  
  # re-set the plan back
  future::plan(sequential)
)

Calculate 10 times and compare the results

Due to the long time, the results are pasted below. Parallel calculation is faster.


run_speed_multi_rcurvep <- microbenchmark(
  eval(seq_run_multi),
  eval(par_run_multi),
  times = 10
)

#> run_speed_multi_rcurvep 
#Unit: seconds
#                expr      min       lq     mean   median       uq      max neval cld
# eval(seq_run_multi) 70.97373 71.06332 73.85309 74.02031 75.58980 77.31413    10  a 
# eval(par_run_multi) 22.73920 23.86592 24.80765 25.09510 25.36901 27.39045    10   b

When preferred BMRs are available for endpoints

Get the BMRs for each endpoint

bmr_out <- estimate_dataset_bmr(zfishdev_act, plot = FALSE)
bmrd <- bmr_out$outcome

Join the BMRs to the concentration-response data

inp_tb <- bmrd |>
  nest_join(
    zfishdev_all, by = c("endpoint"), keep = TRUE, name = "data"
  ) |>
  select(RNGE, endpoint, bmr_exp, data)
# input_data for combi_run_rcurvep
rmarkdown::paged_table(inp_tb)

Set up the expressions

n_sample = 1000 is preferred but for demonstration, n_sample = 100 is used.

In combi_run_rcurvep function, future_pmap function from furrr package is embedded. Using future_pmap function unexpectedly decreased the speed.

# sequential run
seq_run_bmr <- expression(
  future::plan(sequential),
  pmap(inp_tb, ~ combi_run_rcurvep(..4, TRSH = ..3, RNGE = ..1, n_samples = 100, seed = 300, keep_sets = c("act_set")))
)

# parallel run
par_run_bmr1 <- expression(
  
  future::plan(multisession, workers = 10),
  
  # calculation with no additional future_pmap
  pmap(inp_tb, ~ combi_run_rcurvep(..4, TRSH = ..3, RNGE = ..1, n_samples = 100, seed = 300, keep_sets = c("act_set"))),

  future::plan(sequential)
)


# parallel run
par_run_bmr2 <- expression(
  
  future::plan(multisession, workers = 10),
  
  # calculation with additional future_pmap
  future_pmap(inp_tb, ~ combi_run_rcurvep(..4, TRSH = ..3, RNGE = ..1, n_samples = 100, seed = 300, keep_sets = c("act_set")), .options = furrr_options(seed = 300)),

  future::plan(sequential)
)

Calculate 10 times and compare the results

Due to the long time, the results are pasted below. Parallel calculation is faster when no additional future_pmap is used.

run_speed_bmr_rcurvep <- microbenchmark(
  eval(seq_run_bmr),
  eval(par_run_bmr1),
  eval(par_run_bmr2),
  times = 10
)


#> run_speed_bmr_rcurvep
#Unit: seconds
#               expr      min       lq     mean   median       uq      max neval cld
#  eval(seq_run_bmr) 39.29451 40.34472 42.08456 40.92504 44.08229 47.01601    10 a  
# eval(par_run_bmr1) 17.66869 18.38648 18.89080 18.80669 19.20098 20.97384    10  b 
# eval(par_run_bmr2) 24.38823 25.35811 25.46403 25.48099 25.83589 26.54293    10   c

Fitting based on simulated curves using run_fit

In run_fit function, future_map function from furrr package are embedded. Use future::plan() to control the types of calculation.

Set up the expressions

n_sample = 1000 is preferred but for demonstration, n_sample = 100 is used. Also, create_dataset function is used to convert the incidence data into response data.


# sequential run
seq_fit_hill_boot <- expression(
  
  future::plan(sequential),
  run_fit(create_dataset(zfishdev), hill_pdir = 1, n_samples = 100, modls = "hill")

)

# parallel run
seq_fit_hill_boot <- expression(
  
  future::plan(multisession, workers = 10),
  
   # calculation
  run_fit(create_dataset(zfishdev), hill_pdir = 1, n_samples = 100, modls = "hill"),
  future::plan(sequential)
)

Calculate 10 times and compare the results

Due to the long time, the results are pasted below. Parallel calculation is faster.

run_speed_fit_hill_boot <- microbenchmark(
  eval(seq_fit_hill_boot),
  eval(par_fit_hill_boot),
  times = 10
)

#> run_speed_fit_hill_boot
#Unit: seconds
#               expr      min       lq     mean   median       uq      max neval
# eval(seq_fit_hill_boot) 60.75111 63.42611 64.02762 63.97692 65.46656 66.83198    10
# eval(par_fit_hill_boot) 34.81743 36.88777 37.43076 37.41199 38.88405 39.40711    10

Fitting based on simulated curves using run_fit and pmap

We can also use similar syntax as combi_run_rcurvep to run multiple datasets by testing to include or not include future_pmap.


# make the incidence data as response data
inp_tb_resp <- inp_tb |> mutate(data = map(data, create_dataset))

# sequential run
seq_fit_hill_multi <- expression(
  future::plan(sequential),
  set.seed(2003),
  pmap(inp_tb_resp, ~ run_fit(..4, modls = "hill", hill_pdir = ifelse(..3 < 0, -1, 1), n_samples = 100, keep_sets = c("fit_set")))
)

# parallel run
para_fit_hill_multi1 <- expression(
  future::plan(multisession, workers = 10), 
  set.seed(2003),
  # no future_pmap
  pmap(inp_tb_resp, ~ run_fit(..4, modls = "hill", hill_pdir = ifelse(..3 < 0, -1, 1), n_samples = 100, keep_sets = c("fit_set"))),
  
  future::plan(sequential)
)
  
# parallel run
para_fit_hill_multi2 <- expression(
  future::plan(multisession, workers = 10), 
  
  # with future_pmap
  future_pmap(inp_tb_resp, ~ run_fit(..4, modls = "hill", hill_pdir = ifelse(..3 < 0, -1, 1), n_samples = 100, keep_sets = c("fit_set")), .options = furrr_options(seed = 2023)),
  
  future::plan(sequential)
)

Calculate 10 times and compare the results

Due to the long time, the results are pasted below. Parallel calculation is faster with future_pmap function. It is possible because that only future_map function is used in the run_fit function.

run_speed_fit_hill_multi <- microbenchmark(
  eval(seq_fit_hill_multi),
  eval(para_fit_hill_multi1),
  eval(para_fit_hill_multi2),
  times = 10
)


#> run_speed_fit_hill_multi
#Unit: seconds
#                       expr       min        lq      mean    median        uq       max neval cld
# eval(seq_fit_hill_multi) 210.70736 211.45879 214.57743 212.88469 217.75672 222.96850    10 a  
# eval(para_fit_hill_multi1)  95.43516  95.98258  96.56978  96.65905  97.00355  97.55148    10  b 
# eval(para_fit_hill_multi2) 121.67944 122.25914 122.66318 122.80333 123.16349 123.24216    10   c

Fitting based on original data

In run_fit function, future_map from furrr package are embedded. Use future::plan() to control the types of calculation.

Two parameters are tested: hill and cc2. hill is 3-parameter Hill equation implemented using R. cc2 is 4-parameter Hill equation implemented using Java.

The dataset - respd_1 - includes 3000 curves, is not available in the package. If the dataset is too small, it might not worthwhile to start the parallel computing.

Use `modls = hill` parameter


# sequential run
seq_fit_hill_ori <- expression(
  
  future::plan(sequential),
  run_fit(respd_1,  modls = "hill")

)

# parallel run
par_fit_hill_ori <- expression(
  
  future::plan(multisession, workers = 5),
  run_fit(respd_1,  modls = "hill"),
  future::plan(sequential)
)

Calculate 5 times and compare the results

Due to the long time, the results are pasted below. Parallel calculation is faster.

run_speed_hit_hill_ori <- microbenchmark(
  eval(seq_fit_hill_ori),
  eval(par_fit_hill_ori),
  times = 5
)

#> run_speed_hit_hill_ori
#Unit: seconds
#                   expr       min        lq      mean    median       uq       max neval
# eval(seq_fit_hill_ori) 112.12160 112.30511 112.35622 112.40213 112.4308 112.52144     5
# eval(par_fit_hill_ori)  62.92156  63.04108  63.51158  63.60182  63.9130  64.08043     5

Use `modls = cc2` parameter


# sequential run
seq_fit_cc2 <- expression(
  
  future::plan(sequential),
  run_fit(respd_1,  modls = "cc2")

)

# parallel run
par_fit_cc2 <- expression(
  
  future::plan(multisession, workers = 5),
  run_fit(respd_1,  modls = "cc2"),
  future::plan(sequential)
)

Calculate 5 times and compare the results

Due to the long time, the results are pasted below. Parallel calculation does not improve much. It could be because the cc2 is implemented using Java.

run_speed_fit_cc2 <- microbenchmark(
  eval(seq_fit_cc2),
  eval(par_fit_cc2),
  times = 5
)

#> run_speed_fit_cc2
#Unit: seconds
#              expr      min       lq     mean   median       uq      max neval
# eval(seq_fit_cc2) 68.37777 68.39599 68.46936 68.45011 68.45746 68.66547     5
# eval(par_fit_cc2) 58.07689 58.31388 59.17766 59.01783 59.07421 61.40546     5

Set up the packages

datasets from Rcurvep package

When no preferred BMR and would like to do a exhaustive search

Calculate 10 times and compare the results

When preferred BMRs are available for endpoints

Get the BMRs for each endpoint

Join the BMRs to the concentration-response data

Set up the expressions

Calculate 10 times and compare the results

Fitting based on simulated curves using run_fit

Set up the expressions

Calculate 10 times and compare the results

Fitting based on simulated curves using run_fit and pmap

Calculate 10 times and compare the results

Fitting based on original data

Use modls = hill parameter

Calculate 5 times and compare the results

Use modls = cc2 parameter

Calculate 5 times and compare the results

Use `modls = hill` parameter

Use `modls = cc2` parameter