
Introduction

Here, we document how to precompute null gene set scores and run simulations on high-performance computing (HPC) systems using Slurm. We provide step-by-step instructions for splitting large-scale permutation tests and simulations into parallel jobs distributed across multiple nodes of an HPC cluster.

Note: This is intended for users who have access to an HPC system and want to perform large-scale analyses efficiently.

1. Installation of the BrainEnrich package

# Install remotes if you haven't already
if (!requireNamespace("remotes", quietly = TRUE)) {
  install.packages("remotes")
}
if (!requireNamespace("BiocManager", quietly = TRUE)) {
  install.packages("BiocManager")
}
# Install the Bioconductor dependency DOSE
BiocManager::install("DOSE")
# Install BrainEnrich from GitHub
remotes::install_github("zh1peng/BrainEnrich")
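
After installation, you can optionally confirm that the package loads:

library(BrainEnrich)
packageVersion("BrainEnrich")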

2. Download gene_data and annoData to the package

We only need to do this once; the downloaded data will then be available to all nodes on the HPC. If the data are instead downloaded inside the Slurm job, the job may fail because of how the download is sequenced: the first node starts downloading while the other nodes try to access data that has not finished downloading yet.

library(BrainEnrich)
gene_data <- get_geneExp(atlas = "desikan", rdonor = "r0.4", hem = "L")
# Fetch both annotation sets used below; each call downloads and stores the data with the package
annoData <- get_annoData(type = "SynGO")
annoData <- get_annoData(type = "GO_MF")
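
A minimal sanity check that the downloads completed (assuming gene_data is a matrix-like object with regions in rows and genes in columns):

dim(gene_data) # regions x genes
annoData # print the annotation object to confirm it loaded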

3. Use job_splitter (a generic function to split jobs) to run brainscore

3.1. Prepare the script for the job_splitter function

Here, we will prepare a script for distributed computing environments, allowing large-scale permutation tests to be split across multiple compute nodes.

# brainscore_vacc.R
# Rscript brainscore_vacc.R <job_id> <n_iter_per_job> <iter_total> <cor_method> <aggre_method> <null_model> <gs_type> <minGSSize> <maxGSSize>
# Get command-line arguments
args <- commandArgs(trailingOnly = TRUE)
job_id <- as.integer(args[1])
n_iter_per_job <- as.integer(args[2])
iter_total <- as.integer(args[3])
cor_method <- args[4]
aggre_method <- args[5]
null_model <- args[6]
gs_type <- args[7]
minGSSize <- as.integer(args[8])
maxGSSize <- as.integer(args[9])

# Load your R package and any required libraries
library(BrainEnrich)

# Set other parameters
output_dir_base <- "/gpfs1/home/z/c/zcao4/BrainEnrich/precomputed_brainscore"
data_path <- "/gpfs1/home/z/c/zcao4/BrainEnrich/data"

# Load hcp brain_data
brain_data <- readRDS(file.path(data_path, "hcp_brain_data_dk_lh.RDS"))
# Load additional necessary data
gene_data <- get_geneExp(atlas = "desikan", rdonor = "r0.4", hem = "L")
annoData <- get_annoData(type = gs_type)

if (null_model == "spin_brain") {
  subset_vars <- list(perm_id = perm_id_dk_lh_5000)
} else {
  subset_vars <- list()
}

# Update the output directory based on the input parameters
output_dir <- file.path(
  output_dir_base,
  sprintf(
    "%s_%s_%s_%s_%d_%d_%d",
    gs_type,
    null_model,
    cor_method,
    aggre_method,
    minGSSize,
    maxGSSize,
    iter_total
  )
)

# Call the job_splitter function to split the jobs and run the brainscore function
job_splitter(
  job_id = job_id,
  n_iter_per_job = n_iter_per_job,
  iter_total = iter_total,
  prefix = "res_job_",
  output_dir = output_dir,
  FUN = brainscore,
  subset_vars = subset_vars, # perm_id will be subsetted when splitting the jobs for spin_brain mode
  subset_total_var = "n_perm",
  brain_data = brain_data,
  gene_data = gene_data,
  annoData = annoData,
  cor_method = cor_method,
  aggre_method = aggre_method,
  null_model = null_model,
  n_cores = 1,
  minGSSize = minGSSize,
  maxGSSize = maxGSSize
)
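
To make the splitting concrete, here is the index arithmetic each job covers (an illustration assuming 1-based blocking; the actual bookkeeping, including the subsetting of subset_vars such as perm_id, is handled inside job_splitter):

# e.g., job 3 with 5 iterations per job covers permutations 11 to 15
job_id <- 3
n_iter_per_job <- 5
iter_idx <- ((job_id - 1) * n_iter_per_job + 1):(job_id * n_iter_per_job)
iter_idx # 11 12 13 14 15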

3.2. Prepare the Slurm job file

#!/bin/bash
#SBATCH --job-name=gsScore              # Job name
#SBATCH --output=gsScore.out            # Standard output log
#SBATCH --error=gsScore.err             # Standard error log
#SBATCH --time=10:00:00                 # Time limit hrs:min:sec
#SBATCH --nodes=1                       # Number of nodes
#SBATCH --ntasks=1                      # Number of tasks
#SBATCH --mem=16G                       # Memory per node
#SBATCH --array=1-1000                  # Array range (adjust based on iter_total / n_iter_per_job)

# Define variables for the script
n_iter_per_job=5          # Number of permutations per job (adjust as needed)
iter_total=5000           # Total number of permutations
cor_method="pearson"      # Correlation method
aggre_method="mean"       # Aggregation method
minGSSize=20              # Minimum gene set size
maxGSSize=200             # Maximum gene set size

# Define an array of null models to loop through
null_models=("resample_gene" "spin_brain")
gs_types=("SynGO" "GO_MF")

# Change directory to where your R script is located
cd /gpfs1/home/z/c/zcao4/BrainEnrich

for gs_type in "${gs_types[@]}"; do
  for null_model in "${null_models[@]}"; do
    Rscript --vanilla brainscore_vacc.R $SLURM_ARRAY_TASK_ID $n_iter_per_job $iter_total $cor_method $aggre_method $null_model $gs_type $minGSSize $maxGSSize
  done
done

Note: when running job_cat, it will detect whether any RDS files are missing from output_dir (which happens when some jobs failed). A quick way to list the missing job IDs is sketched below (an illustrative helper, not a BrainEnrich function):
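
# Sketch: list missing job outputs for one parameter combination;
# adjust output_dir and the expected job count to your run
output_dir <- "/gpfs1/home/z/c/zcao4/BrainEnrich/precomputed_brainscore/GO_MF_resample_gene_pearson_mean_20_200_5000"
expected <- sprintf("res_job_%d.rds", 1:1000)
missing <- setdiff(expected, list.files(output_dir))
sort(as.integer(gsub("\\D", "", missing))) # e.g. 70 74 75 76 79

With the missing job IDs in hand, rerun the failed jobs: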

#!/bin/bash
#SBATCH --job-name=gsScore_retry       # Job name
#SBATCH --output=gsScore_retry_%A_%a.out     # Standard output log
#SBATCH --error=gsScore_retry_%A_%a.err      # Standard error log
#SBATCH --time=10:00:00                # Time limit hrs:min:sec
#SBATCH --nodes=1                      # Number of nodes
#SBATCH --ntasks=1                     # Number of tasks
#SBATCH --mem=16G                      # Memory per node
#SBATCH --array=1-5                    # Array range covering the missing job IDs listed below

# Define variables for the script
n_iter_per_job=5          # Number of permutations per job (adjust as needed)
iter_total=5000           # Total number of permutations
cor_method="pearson"      # Correlation method
aggre_method="mean"       # Aggregation method
minGSSize=20              # Minimum gene set size
maxGSSize=200             # Maximum gene set size
null_model="resample_gene" # Null model
gs_type="GO_MF"           # Gene set type

# Define the list of missing job IDs
job_id_list=(70 74 75 76 79)

# Get the specific job ID based on the SLURM_ARRAY_TASK_ID
job_id=${job_id_list[$SLURM_ARRAY_TASK_ID-1]}

# Change directory to where your R script is located
cd /gpfs1/home/z/c/zcao4/BrainEnrich

# Run the R script for the specific jobs
Rscript --vanilla brainscore_vacc.R $job_id $n_iter_per_job $iter_total $cor_method $aggre_method $null_model $gs_type $minGSSize $maxGSSize

3.3. Use job_cat to collect the brainscore results from jobs

library(BrainEnrich)

# Set the base output directory
input_dir_base <- "/gpfs1/home/z/c/zcao4/BrainEnrich/precomputed_brainscore"

# Define the parameters of the analysis
n_iter_per_job <- 5 # Number of iterations per job (adjust as needed)
iter_total <- 5000 # Total number of iterations
cor_method <- "pearson" # Correlation method
aggre_method <- "mean" # Aggregation method
minGSSize <- 20 # Minimum gene set size
maxGSSize <- 200 # Maximum gene set size

# Define an array of null models and gene set types to loop through
null_models <- c("resample_gene", "spin_brain")
gs_types <- c("SynGO", "GO_MF")

# Loop through each combination of null model and gene set type
for (null_model in null_models) {
  for (gs_type in gs_types) {
    # Construct the folder name based on the parameters
    folder_name <- sprintf(
      "%s_%s_%s_%s_%d_%d_%d",
      gs_type, null_model, cor_method, aggre_method, minGSSize, maxGSSize, iter_total
    )

    # Construct the full path to the folder
    input_dir <- file.path(input_dir_base, folder_name)
    output_dir <- input_dir_base

    # Construct the save name based on the folder name
    save_name <- folder_name

    # Call the job_cat function to merge the RDS files in this folder
    job_cat(
      input_dir = input_dir,
      output_dir = output_dir,
      n_rds = iter_total / n_iter_per_job,
      save_name = save_name,
      file_pattern = "res_job_%d.rds",
      delete_originals = TRUE,
      preserve_attributes = TRUE,
      result_prefix = "null_"
    )
  }
}
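
After merging, each parameter combination produces a single file named <save_name>.rds in output_dir; these are the files loaded later as precomputed null scores in section 4. A quick structural check (sketch):

null_scores <- readRDS(file.path(input_dir_base, "SynGO_spin_brain_pearson_mean_20_200_5000.rds"))
str(null_scores, max.level = 1)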

3.4. Loop through other input arguments

Note!

For arrays in bash, do not separate the elements with commas. For example,

aggre_methods=("mean" "median" "meanabs" "maxmean", "ks_weighted", "ks_pos_neg_sum")

Here, maxmean would be passed as the literal string "maxmean," (trailing comma included) and ks_weighted as "ks_weighted,", neither of which matches a valid aggregation method. Separate the elements with whitespace only, as in the corrected array below.

We can also loop through more arguments in the Slurm job file:

#!/bin/bash
#SBATCH --job-name=gsScore       # Job name
#SBATCH --output=gsScore.out     # Standard output log
#SBATCH --error=gsScore.err      # Standard error log
#SBATCH --time=10:00:00          # Time limit hrs:min:sec
#SBATCH --nodes=1                # Number of nodes
#SBATCH --ntasks=1               # Number of tasks
#SBATCH --mem=16G                # Memory per node
#SBATCH --array=1-1000           # Array range (adjust based on iter_total / n_iter_per_job)

# Define variables for the script
n_iter_per_job=5          # Number of permutations per job (adjust as needed)
iter_total=5000           # Total number of permutations
cor_method="pearson"      # Correlation method
minGSSize=20              # Minimum gene set size
maxGSSize=200             # Maximum gene set size

# Define arrays of null models, gene set types, and aggregation methods
null_models=("resample_gene" "spin_brain")
gs_types=("SynGO" "GO_MF")
aggre_methods=("mean" "median" "meanabs" "maxmean" "ks_weighted" "ks_pos_neg_sum")

# Change directory to where your R script is located
cd /gpfs1/home/z/c/zcao4/BrainEnrich

# Loop over gene set types, null models, and aggregation methods
for gs_type in "${gs_types[@]}"; do
  for null_model in "${null_models[@]}"; do
    for aggre_method in "${aggre_methods[@]}"; do
      Rscript --vanilla brainscore_vacc.R $SLURM_ARRAY_TASK_ID $n_iter_per_job $iter_total $cor_method $aggre_method $null_model $gs_type $minGSSize $maxGSSize
    done
  done
done

Each array task above runs 2 gene set types × 2 null models × 6 aggregation methods = 24 Rscript calls in sequence, so adjust the --time limit accordingly. The results can be collected in a similar way using job_cat:

# Set the base output directory
input_dir_base <- "/gpfs1/home/z/c/zcao4/BrainEnrich/precomputed_brainscore"

# Define the parameters of the analysis
n_iter_per_job <- 5 # Number of iterations per job
iter_total <- 5000 # Total number of iterations
cor_method <- "pearson" # Correlation method
minGSSize <- 20 # Minimum gene set size
maxGSSize <- 200 # Maximum gene set size

# Define an array of null models, gene set types, and aggregation methods to loop through
null_models <- c("resample_gene", "spin_brain")
gs_types <- c("SynGO", "GO_MF")
aggre_methods <- c("mean", "median", "meanabs", "maxmean", "ks_weighted", "ks_pos_neg_sum") # Aggregation methods (match the Slurm loop above)

# Loop through each combination of null model, gene set type, and aggregation method
for (null_model in null_models) {
  for (gs_type in gs_types) {
    for (aggre_method in aggre_methods) {
      # Construct the folder name based on the parameters
      folder_name <- sprintf(
        "%s_%s_%s_%s_%d_%d_%d",
        gs_type, null_model, cor_method, aggre_method, minGSSize, maxGSSize, iter_total
      )

      # Construct the full path to the folder
      input_dir <- file.path(input_dir_base, folder_name)
      output_dir <- input_dir_base

      # Construct the save name based on the folder name
      save_name <- folder_name

      # Call the job_cat function to merge the RDS files in this folder
      job_cat(
        input_dir = input_dir,
        output_dir = output_dir,
        n_rds = iter_total / n_iter_per_job,
        save_name = save_name,
        file_pattern = "res_job_%d.rds",
        delete_originals = TRUE,
        preserve_attributes = TRUE,
        result_prefix = "null_"
      )
    }
  }
}

4. Set up brainscore.lm_test with precomputed null scores

One advantage of precomputing null gene set scores is that you can reuse them for multiple analyses without recalculating them each time.

library(BrainEnrich)
data_path <- "/gpfs1/home/z/c/zcao4/BrainEnrich/data"
res_path <- "/gpfs1/home/z/c/zcao4/BrainEnrich/res"
if (!dir.exists(res_path)) {
  dir.create(res_path)
}

null_score_dir <- "/gpfs1/home/z/c/zcao4/BrainEnrich/precomputed_brainscore"

# Load hcp brain_data
gene_data <- get_geneExp(atlas = "desikan", rdonor = "r0.4", hem = "L")
brain_data <- readRDS(file.path(data_path, "hcp_brain_data_dk_lh.RDS"))
cov_df <- readRDS(file.path(data_path, "hcp_cov_df.RDS")) # Covariate data, assumed stored in hcp_cov_df.RDS
pred_df <- readRDS(file.path(data_path, "hcp_pred_df.RDS")) # Predictor data, assumed stored in hcp_pred_df.RDS

# Define the parameters of the analysis
n_iter_per_job <- 5 # Number of iterations per job
iter_total <- 5000 # Total number of iterations
cor_method <- "pearson" # Correlation method
minGSSize <- 20 # Minimum gene set size
maxGSSize <- 200 # Maximum gene set size

# Define an array of null models, gene set types, and aggregation methods to loop through
null_models <- c("resample_gene", "spin_brain")
gs_types <- c("SynGO")
aggre_methods <- c("mean", "median", "meanabs", "maxmean", "ks_weighted", "ks_pos_neg_sum")


# Loop through each combination of null model, gene set type, and aggregation method
for (null_model in null_models) {
  for (gs_type in gs_types) {
    annoData <- get_annoData(type = gs_type)
    for (aggre_method in aggre_methods) {
      print(paste(gs_type, null_model, cor_method, aggre_method, minGSSize, maxGSSize, iter_total, sep = "_"))

      # Construct the path to the precomputed null-score file based on the parameters
      nullScoreRds <- sprintf(
        "%s/%s_%s_%s_%s_%d_%d_%d.rds",
        null_score_dir, gs_type, null_model, cor_method, aggre_method, minGSSize, maxGSSize, iter_total
      )
      nullscore <- readRDS(nullScoreRds)

      res <- brainscore.lm_test(
        pred_df = pred_df,
        cov_df = cov_df,
        brain_data = brain_data,
        gene_data = gene_data,
        annoData = annoData,
        gsScoreList.null = nullscore,
        cor_method = "pearson",
        aggre_method = aggre_method,
        n_cores = 63,
        minGSSize = minGSSize,
        maxGSSize = maxGSSize,
        null_model = null_model,
        n_perm = iter_total,
        pvalueCutoff = 1,
        gsea_obj = FALSE, # return a df instead of a gsea object
        threshold_type = "none"
      )

      saveRDS(res, file.path(res_path, sprintf("res_%s_%s_%s_%s_%d_%d_%d.rds", gs_type, null_model, cor_method, aggre_method, minGSSize, maxGSSize, iter_total)))
    }
  }
}
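
Each saved result can be inspected afterwards; with gsea_obj = FALSE, brainscore.lm_test returns a data.frame (a quick look, using the naming scheme above):

res <- readRDS(file.path(res_path, "res_SynGO_spin_brain_pearson_mean_20_200_5000.rds"))
head(res)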

Set up the Slurm job file to run brainscore.lm_test. Note that setting n_cores = 0 to auto-detect the number of cores will cause an error. In the R script, we therefore set n_cores = 63 to use 63 cores, the maximum usable on the node.

#!/bin/bash
#SBATCH --job-name=lm              # Job name
#SBATCH --error=lm.err                  # Standard error log
#SBATCH --output=lm.out                 # Standard output log
#SBATCH --time=24:00:00                 # Time limit hrs:min:sec
#SBATCH --nodes=1                       # Number of nodes
#SBATCH --ntasks=64                    # Number of tasks
#SBATCH --mem=64G                       # Memory per node (increase memory if it is unexpectedly slow)

cd /gpfs1/home/z/c/zcao4/BrainEnrich
Rscript --vanilla lm_test_vacc.R
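
If you prefer not to hard-code the core count, a minimal sketch that leaves one core free (note that on shared nodes parallel::detectCores() may report the whole machine rather than your allocation):

# at the top of the R script, instead of n_cores <- 63
n_cores <- max(1L, parallel::detectCores() - 1L)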

5. Use job_splitter (a generic function to split jobs) to run brainscore.simulate

5.1. Prepare the script for the job_splitter function

Here, we will prepare a script for distributed computing environments, allowing large-scale simulations to be split across multiple compute nodes.

# Rscript this_script.R <job_id> <n_iter_per_job> <iter_total> <cor_method> <aggre_method> <sim_type> <gs_type> <minGSSize> <maxGSSize> <gsScoreList.null>
# Get command-line arguments
# This HPC script runs the type-1 error simulation
args <- commandArgs(trailingOnly = TRUE)
job_id <- as.integer(args[1])
n_iter_per_job <- as.integer(args[2])
iter_total <- as.integer(args[3])
cor_method <- args[4]
aggre_method <- args[5]
sim_type <- args[6]
gs_type <- args[7]
minGSSize <- as.integer(args[8])
maxGSSize <- as.integer(args[9])
gsScoreList.null <- args[10]

# Load your R package and any required libraries
library(BrainEnrich)

# Set other parameters
output_dir_base <- "/gpfs1/home/z/c/zcao4/BrainEnrich/sim_res"
data_path <- "/gpfs1/home/z/c/zcao4/BrainEnrich/data"

# Load hcp brain_data
brain_data <- readRDS(file.path(data_path, "hcp_brain_data_dk_lh.RDS"))
cov_df <- readRDS(file.path(data_path, "hcp_cov_df.RDS")) # Covariate data, assumed stored in hcp_cov_df.RDS
pred_df <- readRDS(file.path(data_path, "hcp_pred_df.RDS")) # Predictor data, assumed stored in hcp_pred_df.RDS

# Load additional necessary data
gene_data <- get_geneExp(atlas = "desikan", rdonor = "r0.4", hem = "L")
annoData <- get_annoData(type = gs_type)

if (sim_type == "spin_brain") {
  subset_vars <- list(perm_id = perm_id_dk_lh_5000)
} else {
  subset_vars <- list()
}

if (sim_type %in% c("resample_gene", "spin_brain")) {
  gsScoreList.null <- readRDS(gsScoreList.null)
}

# Update the output directory based on the input parameters
output_dir <- file.path(
  output_dir_base,
  sprintf(
    "%s_%s_%s_%s_%d_%d_%d",
    gs_type,
    sim_type,
    cor_method,
    aggre_method,
    minGSSize,
    maxGSSize,
    iter_total
  )
)

# Call the job_splitter function
job_splitter(
  job_id = job_id,
  n_iter_per_job = n_iter_per_job,
  iter_total = iter_total,
  output_dir = output_dir,
  FUN = brainscore.simulate,
  subset_vars = subset_vars, # perm_id will be subsetted when splitting the jobs for spin_brain mode
  subset_total_var = "sim_n",
  brain_data = brain_data,
  gene_data = gene_data,
  annoData = annoData,
  pred_df = pred_df,
  cov_df = cov_df,
  cor_method = cor_method,
  aggre_method = aggre_method,
  subsample_size = c(100, 200, 300),
  sim_setting = "type1",
  sim_type = sim_type,
  n_cores = 1,
  minGSSize = minGSSize,
  maxGSSize = maxGSSize,
  n_perm = 5000, # should match the number of permutations in gsScoreList.null
  gsScoreList.null = gsScoreList.null # loaded null scores for resample_gene/spin_brain; empty for randomize_pred
)
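
Since n_perm must match the precomputed null scores, a cheap guard before the heavy call can save a failed array (a sketch, assuming the merged object holds one entry per permutation):

if (sim_type %in% c("resample_gene", "spin_brain")) {
  stopifnot(length(gsScoreList.null) == 5000)
}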

5.2. Prepare the Slurm job file

#!/bin/bash
#SBATCH --job-name=sim_type1    # Job name
#SBATCH --output=type1.out             # Standard output log
#SBATCH --error=type1.err              # Standard error log
#SBATCH --time=24:00:00              # Time limit hrs:min:sec
#SBATCH --nodes=1                    # Number of nodes
#SBATCH --ntasks=1                   # Number of tasks
#SBATCH --mem=24G                    # Memory per node
#SBATCH --array=1-1000               # Array range (adjust based on iter_total / n_iter_per_job)

# Define the variables
n_iter_per_job=1          # Number of iterations per job
iter_total=1000           # Total number of iterations
cor_method="pearson"      # Correlation method
minGSSize=20              # Minimum gene set size
maxGSSize=200             # Maximum gene set size
sim_types=("randomize_pred" "spin_brain" "resample_gene")  # Simulation types
gs_types=("SynGO")        # Gene set types
aggre_methods=("mean" "median" "meanabs" "maxmean" "ks_weighted" "ks_pos_neg_sum") # Aggregation methods

# Change to the directory where the R script is located
cd /gpfs1/home/z/c/zcao4/BrainEnrich

# Loop through each combination of gene set type, simulation type, and aggregation method
for gs_type in "${gs_types[@]}"; do
  for sim_type in "${sim_types[@]}"; do
    for aggre_method in "${aggre_methods[@]}"; do
      if [[ "$sim_type" == "randomize_pred" ]]; then
        # Run the script without gsScoreList.null for randomize_pred
        Rscript --vanilla brainscore_simulate_type1_vacc.R $SLURM_ARRAY_TASK_ID $n_iter_per_job $iter_total $cor_method $aggre_method $sim_type $gs_type $minGSSize $maxGSSize ""
      else
        # Set the gsScoreList.null file path based on the current analysis parameters
        gsScoreList_null="/gpfs1/home/z/c/zcao4/BrainEnrich/precomputed_brainscore/${gs_type}_${sim_type}_${cor_method}_${aggre_method}_${minGSSize}_${maxGSSize}_5000.rds"
        
        # Run the script with gsScoreList.null for other simulation types
        Rscript --vanilla brainscore_simulate_type1_vacc.R $SLURM_ARRAY_TASK_ID $n_iter_per_job $iter_total $cor_method $aggre_method $sim_type $gs_type $minGSSize $maxGSSize $gsScoreList_null
      fi
    done
  done
done

5.3. Use job_cat to collect the simulation results from jobs

library(BrainEnrich)

# Set the base input and output directories
input_dir_base <- "/gpfs1/home/z/c/zcao4/BrainEnrich/sim_res"
output_dir_base <- "/gpfs1/home/z/c/zcao4/BrainEnrich/sim_res"

# Define the parameters of the analysis
iter_total <- 1000 # Total number of iterations
cor_method <- "pearson" # Correlation method
aggre_methods <- c("ks_pos_neg_sum", "ks_weighted", "maxmean", "meanabs", "median", "mean") # Aggregation methods
null_models <- c("randomize_pred", "resample_gene", "spin_brain") # Null models
gs_types <- c("SynGO") # Gene set types
minGSSize <- 20 # Minimum gene set size
maxGSSize <- 200 # Maximum gene set size

# Loop through each combination of null model, aggregation method, and gene set type
for (null_model in null_models) {
  for (aggre_method in aggre_methods) {
    for (gs_type in gs_types) {
      # Construct the folder name based on the parameters
      folder_name <- sprintf(
        "%s_%s_%s_%s_%d_%d_%d",
        gs_type, null_model, cor_method, aggre_method, minGSSize, maxGSSize, iter_total
      )

      # Construct the full path to the folder
      input_dir <- file.path(input_dir_base, folder_name)
      output_dir <- output_dir_base

      # Construct the save name based on the folder name
      save_name <- folder_name

      # Call the job_cat function to merge the RDS files in this folder
      job_cat(
        input_dir = input_dir,
        output_dir = output_dir,
        n_rds = iter_total, # n_iter_per_job = 1, so one RDS per iteration
        save_name = save_name,
        file_pattern = "res_job_%d.rds",
        delete_originals = TRUE
      )
    }
  }
}