Skip to contents

Introduction

The annoData object, from DOSE parckage, is a list containing two data frames: TERM2GENE and TERM2NAME. The TERM2GENE data frame contains the mapping of gene IDs to terms, while the TERM2NAME data frame contains the mapping of terms to their descriptions.

The documentation details the preparation of gene sets (annoData) and guides users on generating gene sets from various sources (e.g., gmt, csv). Users can create their own to match their research interests.

# Load required libraries
library(readxl)
library(dplyr)
library(tidyr)
if (!requireNamespace("BiocManager", quietly = TRUE)) {
  install.packages("BiocManager")
}

bioc_packages <- c("org.Hs.eg.db", "clusterProfiler", "DOSE", "ReactomePA", "reactome.db")
for (pkg in bioc_packages) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    BiocManager::install(pkg)
  }
}
library(org.Hs.eg.db)
library(clusterProfiler)
library(DOSE)
library(ReactomePA)
library(reactome.db)
library(BrainEnrich)
build_Anno <- getFromNamespace("build_Anno", "DOSE")

Since we may need to convert geneID to symbols, we first define helper functions for it.

symbol_annoData <- function(annoData) {
  require(org.Hs.eg.db)
  tmp <- split_Anno(annoData)
  path2gene <- tmp$path2gene
  path2name <- tmp$path2name
  geneIDs <- path2gene$geneID
  entrez_ids <- AnnotationDbi::keys(org.Hs.eg.db, keytype = "ENTREZID")
  matching_ids <- geneIDs %in% entrez_ids
  percentage_match <- sum(matching_ids) / length(geneIDs) * 100
  threshold <- 90 # For example, 80% match
  is_mostly_entrez <- percentage_match >= threshold
  if (is_mostly_entrez) {
    gene_symbols <- mapIds(org.Hs.eg.db, keys = geneIDs, column = "SYMBOL", keytype = "ENTREZID", multiVals = "first")
  } else {
    stop("The gene IDs are not mostly ENTREZID. Attempting to convert to ENTREZID...")
  }
  if (length(gene_symbols) != length(geneIDs)) {
    stop("Failed to convert gene IDs in the data frame to SYMBOL.")
  }
  path2gene$geneID <- gene_symbols
  path2gene <- path2gene[!(is.na(path2gene$geneID) | path2gene$geneID == ""), ]
  build_Anno <- getFromNamespace("build_Anno", "DOSE")
  annoData <- build_Anno(path2gene, path2name)
  return(annoData)
}

# Alternative approach to convert gene IDs to symbols
# #' Convert Entrez IDs to Gene Symbols
# #'
# #' This function converts a vector of Entrez IDs to gene symbols using the org.Hs.eg.db annotation package.
# #'
# #' @param entrezid A vector of Entrez IDs to be converted to gene symbols.
# #' @return A vector of gene symbols corresponding to the input Entrez IDs.
# #' @importFrom AnnotationDbi mapIds
# #' @export
# entrezid2symbol <- function(entrezid) {
#   # Ensure input is character vector
#   entrezid <- as.character(entrezid)

#   # Map Entrez IDs to gene symbols
#   mappedSymbol <- suppressMessages(
#     mapIds(
#       x = org.Hs.eg.db,
#       keys = entrezid,
#       keytype = "ENTREZID",
#       column = "SYMBOL",
#       multiVals = "first"
#     )
#   )
#   # Remove NA values and attributes
#   mappedSymbol <- na.omit(mappedSymbol)
#   attributes(mappedSymbol) <- NULL
#   return(unname(mappedSymbol))
# }

Gene ontology (GO) gene sets from org.Hs.eg.db

get_GO_data <- getFromNamespace("get_GO_data", "clusterProfiler")
annoData <- get_GO_data("org.Hs.eg.db", ont = "BP", keytype = "SYMBOL")
# saveRDS(annoData, file = "extdata/geneSets/GO_BP.rds", compress = "xz")
annoData <- get_GO_data("org.Hs.eg.db", ont = "MF", keytype = "SYMBOL")
# saveRDS(annoData, file = "extdata/geneSets/GO_MF.rds", compress = "xz")
annoData <- get_GO_data("org.Hs.eg.db", ont = "CC", keytype = "SYMBOL")
saveRDS(annoData, file = "inst/extdata/geneSets/GO_CC.rds", compress = "xz")

# check version of org.Hs.eg.db
packageVersion("clusterProfiler") # '4.12.0'
packageVersion("org.Hs.eg.db") # '3.19.1'

DisGeNET gene sets using get_DGN_data from DOSE package

get_DGN_data <- getFromNamespace("get_DGN_data", "DOSE")
annoData <- get_DGN_data()
test1 <- get_geneSetList(annoData)
str(head(test1))
get_termDescription("C0001546", annoData)
annoData <- symbol_annoData(annoData)
test2 <- get_geneSetList(annoData)
str(head(test2))

saveRDS(annoData, file = "inst/extdata/geneSets/DGN.rds", compress = "xz")
packageVersion("DOSE") # '3.30.1'
packageVersion("AnnotationDbi") # '1.66.0'

KEGG modules using get_KEGG_data from clusterProfiler package

prepare_KEGG <- getFromNamespace("prepare_KEGG", "clusterProfiler")
annoData <- prepare_KEGG("hsa", "MKEGG", "ncbi-geneid")
test1 <- get_geneSetList(annoData)
str(head(test1))

annoData <- symbol_annoData(annoData)
test2 <- get_geneSetList(annoData)
str(head(test2))
get_termDescription("M00124", annoData)
saveRDS(annoData, file = "inst/extdata/geneSets/MKEGG.rds", compress = "xz")

packageVersion("clusterProfiler") # '4.12.0'

KEGG pathways using get_KEGG_data from clusterProfiler package

prepare_KEGG <- getFromNamespace("prepare_KEGG", "clusterProfiler")
annoData <- prepare_KEGG("hsa", "KEGG", "ncbi-geneid")
test1 <- get_geneSetList(annoData)
str(head(test1))

annoData <- symbol_annoData(annoData)
test2 <- get_geneSetList(annoData)
str(head(test2))
get_termDescription("hsa03008", annoData)
saveRDS(annoData, file = "inst/extdata/geneSets/KEGG.rds", compress = "xz")
packageVersion("clusterProfiler") # '4.12.0'

WikiPathways using get_WP_data from clusterProfiler package

prepare_WP_data <- getFromNamespace("prepare_WP_data", "clusterProfiler")
wpdata <- prepare_WP_data("Homo sapiens")
TERM2GENE <- wpdata$WPID2GENE
TERM2NAME <- wpdata$WPID2NAME
build_Anno <- getFromNamespace("build_Anno", "DOSE")
annoData <- build_Anno(TERM2GENE, TERM2NAME)

test1 <- get_geneSetList(annoData)
str(head(test1))

annoData <- symbol_annoData(annoData)
test2 <- get_geneSetList(annoData)
str(head(test2))
get_termDescription("WP2338", annoData)
saveRDS(annoData, file = "inst/extdata/geneSets/WikiPathways.rds", compress = "xz")
packageVersion("clusterProfiler") # '4.12.0'

Reactome using get_Reactome_DATA from ReactomePA package

get_Reactome_DATA <- getFromNamespace("get_Reactome_DATA", "ReactomePA")
annoData <- get_Reactome_DATA("human")
test1 <- get_geneSetList(annoData)
str(head(test1))

annoData <- symbol_annoData(annoData)
test2 <- get_geneSetList(annoData)
str(head(test2))
get_termDescription("R-HSA-1059683", annoData)
saveRDS(annoData, file = "inst/extdata/geneSets/Reactome.rds", compress = "xz")
packageVersion("ReactomePA") # '1.48.0'

SynGO from xlsx file downloaded from SynGO website

# Define URL for downloading SynGO data
url <- "https://www.syngoportal.org/data/SynGO_bulk_download_release_20231201.zip"
# Create temporary file and directory
zip_path <- tempfile(fileext = ".zip")
temp_dir <- tempdir()
# Clean-up function to ensure temp files are removed
on.exit(
  {
    if (file.exists(zip_path)) unlink(zip_path)
    if (file.exists(extracted_file)) unlink(extracted_file)
  },
  add = TRUE
)
extracted_file <- file.path(temp_dir, "syngo_ontologies.xlsx")
download.file(url, zip_path, mode = "wb")
unzip(zip_path, files = "syngo_ontologies.xlsx", exdir = temp_dir)


# Read the data from the extracted file
data <- read_xlsx(extracted_file)

# Process TERM2GENE
TERM2GENE <- data %>%
  dplyr::select(id, hgnc_symbol) %>%
  dplyr::rename(term = id, gene = hgnc_symbol) %>%
  dplyr::mutate(gene = strsplit(gene, ", ")) %>%
  unnest(cols = c(gene))

# Process TERM2NAME
TERM2NAME <- data %>%
  dplyr::select(id, name) %>%
  dplyr::rename(term = id, description = name)

# Use build_Anno function from DOSE package
build_Anno <- getFromNamespace("build_Anno", "DOSE")
annoData <- build_Anno(TERM2GENE, TERM2NAME)
# saveRDS(annoData, file = "extdata/geneSets/SynGO.rds",compress='xz')

CellTypes_Seidlitz2020 from csv file downloaded from github

Ref: Seidlitz, J., Nadig, A., Liu, S., Bethlehem, R. A., Vértes, P. E., Morgan, S. E., … & Raznahan, A. (2020). Transcriptomic and cellular decoding of regional brain vulnerability to neurogenetic disorders. Nature communications, 11(1), 3358.

library(dplyr)
# Define URL for Seidlitz2020
url <- "https://github.com/jms290/PolySyn_MSNs/blob/master/Data/AHBA/celltypes_PSP.csv?raw=true"

# Create temporary file
temp_file <- tempfile()
# Clean-up function to ensure temp files are removed
on.exit(
  {
    if (file.exists(temp_file)) unlink(temp_file)
  },
  add = TRUE
)

download.file(url, temp_file, mode = "wb")

# Read CSV file
TERM2GENE <- read.csv(temp_file) %>%
  dplyr::mutate(term = class) %>%
  filter(gene != "") %>%
  dplyr::select(term, gene)

key_dict <- c(
  "Astro" = "Astrocytes",
  "Endo" = "Endothelial",
  "Micro" = "Microglial",
  "OPC" = "Oligodendrocyte Progenitors",
  "Neuro-Ex" = "Excitatory Neurons",
  "Neuro-In" = "Inhibitory Neurons",
  "Oligo" = "Oligodendrocytes"
)

TERM2NAME <- TERM2GENE %>%
  dplyr::mutate(description = dplyr::recode(term, !!!key_dict)) %>%
  dplyr::select(term, description)
annoData <- build_Anno(TERM2GENE, TERM2NAME)
test1 <- get_geneSetList(annoData)
str(head(test1))
get_termDescription(term = "Astro", annoData)

saveRDS(annoData, file = "inst/extdata/geneSets/CellTypes_Seidlitz2020.rds", compress = "xz")

CellTypes_Lake2018 from a gmt file downloaded from github

Ref: Lake, B. B., Chen, S., Sos, B. C., Fan, J., Kaeser, G. E., Yung, Y. C., … & Zhang, K. (2018). Integrative single-cell analysis of transcriptional and epigenetic states in the human adult brain. Nature biotechnology, 36(1), 70-80.

# Define URL for Lake2018
url <- "https://github.com/molecular-neuroimaging/Imaging_Transcriptomics/raw/main/imaging_transcriptomics/data/geneset_LAKE.gmt"
# Create temporary file
temp_file <- tempfile()
# Clean-up function to ensure temp files are removed
on.exit(
  {
    if (file.exists(temp_file)) unlink(temp_file)
  },
  add = TRUE
)

download.file(url, temp_file, mode = "wb")
# Read GMT file
TERM2GENE <- suppressWarnings({
  clusterProfiler::read.gmt(temp_file) %>%
    filter(gene != "") %>%
    dplyr::select(term, gene)
})

key_dict <- c(
  # Excitatory Neurons
  "Ex1" = "Excitatory Neurons 1",
  "Ex2" = "Excitatory Neurons 2",
  "Ex3a" = "Excitatory Neurons 3a",
  "Ex3b" = "Excitatory Neurons 3b",
  "Ex3c" = "Excitatory Neurons 3c",
  "Ex3d" = "Excitatory Neurons 3d",
  "Ex3e" = "Excitatory Neurons 3e",
  "Ex4" = "Excitatory Neurons 4",
  "Ex5a" = "Excitatory Neurons 5a",
  "Ex5b" = "Excitatory Neurons 5b",
  "Ex6a" = "Excitatory Neurons 6a",
  "Ex6b" = "Excitatory Neurons 6b",
  "Ex8" = "Excitatory Neurons 8",

  # Granule Cells (Cerebellar)
  "Gran" = "Cerebellar Granule Cells",

  # Inhibitory Neurons
  "In1a" = "Inhibitory Neurons 1a",
  "In1b" = "Inhibitory Neurons 1b",
  "In1c" = "Inhibitory Neurons 1c",
  "In2" = "Inhibitory Neurons 2",
  "In3" = "Inhibitory Neurons 3",
  "In4a" = "Inhibitory Neurons 4a",
  "In4b" = "Inhibitory Neurons 4b",
  "In6a" = "Inhibitory Neurons 6a",
  "In6b" = "Inhibitory Neurons 6b",
  "In7" = "Inhibitory Neurons 7",
  "In8" = "Inhibitory Neurons 8",

  # Purkinje Cells (Cerebellar Neurons)
  "Purk1" = "Purkinje Cells 1",
  "Purk2" = "Purkinje Cells 2",

  # Non-Neuronal Cells
  "End" = "Endothelial Cells",
  "Per" = "Pericytes/Smooth Muscle Cells",
  "Ast" = "Astrocytes",
  "Ast_Cer" = "Cerebellar Astrocytes",
  "Oli" = "Oligodendrocytes",
  "OPC" = "Oligodendrocyte Progenitor Cells",
  "OPC_Cer" = "Cerebellar OPCs",
  "Mic" = "Microglia"
)


TERM2NAME <- TERM2GENE %>%
  dplyr::mutate(description = dplyr::recode(term, !!!key_dict)) %>%
  dplyr::select(term, description)
unique(TERM2NAME)

annoData <- build_Anno(TERM2GENE, TERM2NAME)
test1 <- get_geneSetList(annoData)
str(head(test1))
get_termDescription(term = "Ast", annoData)

saveRDS(annoData, file = "inst/extdata/geneSets/CellTypes_Lake2018.rds", compress = "xz")

CellTypes_Martins2021 from a gmt file downloaded from github

Ref: Imaging transcriptomics: Convergent cellular, transcriptomic, and molecular neuroimaging signatures in the healthy adult human brain. Daniel Martins, Alessio Giacomel, Steven CR Williams, Federico Turkheimer, Ottavia Dipasquale, Mattia Veronese, PET templates working group. Cell Reports.

# Define URL for Martins2021
url <- "https://github.com/molecular-neuroimaging/Imaging_Transcriptomics/raw/main/imaging_transcriptomics/data/geneset_Pooled.gmt"
# Create temporary file
temp_file <- tempfile()
# Clean-up function to ensure temp files are removed
on.exit(
  {
    if (file.exists(temp_file)) unlink(temp_file)
  },
  add = TRUE
)
download.file(url, temp_file, mode = "wb")
# Read GMT file
TERM2GENE <- suppressWarnings({
  clusterProfiler::read.gmt(temp_file) %>%
    filter(gene != "") %>%
    dplyr::select(term, gene)
})

key_dict <- c(
  "Astro" = "Astrocytes",
  "Endo" = "Endothelial Cells",
  "Microglia" = "Microglial Cells",
  "Neuro_Exc" = "Excitatory Neurons",
  "Neuro_Inb" = "Inhibitory Neurons",
  "Oligo" = "Oligodendrocytes",
  "OPC" = "Oligodendrocyte Progenitor Cells"
)

TERM2NAME <- TERM2GENE %>%
  dplyr::mutate(description = dplyr::recode(term, !!!key_dict)) %>%
  dplyr::select(term, description)
unique(TERM2NAME)
annoData <- build_Anno(TERM2GENE, TERM2NAME)
test1 <- get_geneSetList(annoData)
str(head(test1))
get_termDescription(term = "Astro", annoData)
saveRDS(annoData, file = "inst/extdata/geneSets/CellTypes_Martins2021.rds", compress = "xz")

MeSH

First we define a function to prepare the MeSH data

get_MeSH_data <- function(MeSH.hsa.db, MeSH.db, database, category) {
  category <- toupper(category)
  categories <- c(
    "A", "B", "C", "D",
    "E", "F", "G", "H",
    "I", "J", "K", "L",
    "M", "N", "V", "Z"
  )

  if (!all(category %in% categories)) {
    stop("please check your 'category' parameter...")
  }

  mesh <- AnnotationDbi::select(MeSH.hsa.db, keys = database, columns = c("GENEID", "MESHID", "MESHCATEGORY"), keytype = "SOURCEDB")
  mesh <- mesh[mesh[, 3] %in% category, ]
  mesh2gene <- mesh[, c(2, 1)]
  mesh2name <- AnnotationDbi::select(MeSH.db, keys = unique(mesh2gene[, 1]), columns = c("MESHID", "MESHTERM"), keytype = "MESHID")
  build_Anno(mesh2gene, mesh2name)
}
BiocManager::install("AnnotationHub")
BiocManager::install("MeSHDbi")

library(AnnotationHub)
library(MeSHDbi)

# prepare MeSH.hsa.db (it is a gene-MeSH db)
ah <- AnnotationHub(cache = "C:/Users/Zhipeng/AppData/Local/Temp/Rtmpe24TtF/BiocFileCache", localHub = TRUE) # localHub=TRUE if you have downloaded the data
hsa <- query(ah, c("MeSHDb", "Homo sapiens"))
file_hsa <- hsa[[1]]
MeSH.hsa.db <- MeSHDbi::MeSHDb(file_hsa)

# prepare MeSH.db (it is a name db)
ah <- AnnotationHub(localHub = TRUE)
# cache="C:/Users/Zhipeng/AppData/Local/R/cache/R/AnnotationHub/205c177937a_98389"
dbfile1 <- query(ah, c("MeSHDb", "MeSH.db"))[[1]]
MeSH.db <- MeSHDbi::MeSHDb(dbfile1)

annoData <- get_MeSH_data(MeSH.hsa.db = MeSH.hsa.db, MeSH.db = MeSH.db, database = "gene2pubmed", category = "F")
test1 <- get_geneSetList(annoData)
str(head(test1))
get_termDescription(term = "D003071", annoData)

annoData <- symbol_annoData(annoData)
test2 <- get_geneSetList(annoData)
str(head(test2))
length(test2)
saveRDS(annoData, file = "inst/extdata/geneSets/test.rds", compress = "xz")

After figure out for one category, we can make it a loop for gene2pubmed

for (cat_i in c("A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M", "N", "V", "Z")) {
  message("Preparing category: ", cat_i)
  # Fetching the MeSH data for the specified category
  annoData <- get_MeSH_data(MeSH.hsa.db = MeSH.hsa.db, MeSH.db = MeSH.db, database = "gene2pubmed", category = cat_i)
  geneSetList <- get_geneSetList(annoData)
  if (length(geneSetList) == 0) {
    message("No gene set found for category: ", cat_i)
  } else {
    message("Converting gene IDs to gene symbols for category: ", cat_i)
    annoData <- symbol_annoData(annoData)
    message("Saving annoData for category: ", cat_i)
    saveRDS(annoData, file = paste0("inst/extdata/geneSets/MeSH_", cat_i, ".rds"), compress = "xz")
  }
}