scGOclust_vignette

Load packages

scGOclust is a package that leverages Gene Ontology to analyse the functional profile of cells with scRNA-seq data.

 # load required libraries
 
 library(Seurat)
 #> Loading required package: SeuratObject
 #> Loading required package: sp
 #> 'SeuratObject' was built under R 4.3.1 but the current version is
 #> 4.3.2; it is recomended that you reinstall 'SeuratObject' as the ABI
 #> for R may have changed
 #> 
 #> Attaching package: 'SeuratObject'
 #> The following object is masked from 'package:base':
 #> 
 #> intersect
 library(pheatmap)
 library(httr)
 
 ## if (!require("devtools")) install.packages("devtools")
 
 ## install latest from source
 ## for reproducibility we do not update dependencies
 # devtools::install_github("Papatheodorou-Group/scGOclust", upgrade = FALSE)
 
 library(scGOclust)

2. Load input data

 # get a gene to GO BP terms mapping table
 # remove electronically inferred records
 
 # sometimes ensembl complains about ssh certificate has expired, this is a known issue, run this code
httr::set_config(httr::config(ssl_verifypeer = FALSE)) 
 
 #mmu_tbl = ensemblToGo(species = 'mmusculus', GO_linkage_type = c('experimental', 'phylogenetic', 'computational', 'author', 'curator' ))
 #dme_tbl = ensemblToGo(species = 'dmelanogaster', GO_linkage_type = c('experimental', 'phylogenetic', 'computational', 'author', 'curator' ))
 
 # here we load the example data for convenience
 data(mmu_tbl)
 data(dme_tbl)

 # load the gene expression raw count objects
 data(mmu_subset)
 data(dme_subset)
 ls()
 #> [1] "dme_subset" "dme_tbl" "mmu_subset" "mmu_tbl"

3. Build GO BP profile

 ## construct a Seurat object with GO BP as features
 
mmu_go_obj <- makeGOSeurat(ensembl_to_GO = mmu_tbl, feature_type = 'external_gene_name', seurat_obj = mmu_subset)
 #> collect data
 #> compute GO to cell matrix, might take a few secs
 #> Warning: Data is of class matrix. Coercing to dgCMatrix.
 #> time used: 0.66 secs
 #> removing 0 all zero terms
 #> returning GO Seurat object
 
dme_go_obj <- makeGOSeurat(ensembl_to_GO = dme_tbl, feature_type = 'external_gene_name', seurat_obj = dme_subset)
 #> collect data
 #> compute GO to cell matrix, might take a few secs
 #> Warning: Data is of class matrix. Coercing to dgCMatrix.
 #> time used: 0.2 secs
 #> removing 0 all zero terms
 #> returning GO Seurat object

4. Calculate cell type average GO BP profile

 # specify the column with cell type annotation in seurat_obj@meta.data
 
mmu_ct_go <- getCellTypeGO(go_seurat_obj = mmu_go_obj, cell_type_col = 'cell_type_annotation')
 #> perform normalization and log1p for mmu_go_obj
 #> Normalizing layer: counts
 #> Centering and scaling data matrix
 #> As of Seurat v5, we recommend using AggregateExpression to perform pseudo-bulk analysis.
 #> Names of identity class contain underscores ('_'), replacing with dashes ('-')
 #> This message is displayed once per session.
dme_ct_go <- getCellTypeGO(go_seurat_obj = dme_go_obj, cell_type_col = 'annotation')
 #> perform normalization and log1p for dme_go_obj
 #> Normalizing layer: counts
 #> Centering and scaling data matrix

5. Calculate within-species cell type functional similariy

 # heatmap of Pearson's correlation coefficient of cell type average BP profiles within species
 
mmu_corr = cellTypeGOCorr(cell_type_go = mmu_ct_go, corr_method = 'pearson')
 pheatmap(mmu_corr)

dme_corr = cellTypeGOCorr(cell_type_go = dme_ct_go, corr_method = 'pearson')
 pheatmap(dme_corr)

5. Calculate cross-species cell type functional similariy

 
 # calculate Pearson's correlation coefficient of cell type average BP profiles across species
 
corr = crossSpeciesCellTypeGOCorr(species_1 = 'mmusculus', species_2 = 'dmelanogaster', cell_type_go_sp1 = mmu_ct_go, cell_type_go_sp2 = dme_ct_go, corr_method = 'pearson')

 
 # tries to put cells with higher values on the diagonal
 # helpful when cross-species cell type similarity signal is less clear
 
 plotCellTypeCorrHeatmap(corr, width = 9, height = 10)

 
 # scale per column or row to see the relative similarity
 plotCellTypeCorrHeatmap(corr, scale = 'column', width = 9, height = 10)

6. Dimensional reduction and UMAP visualization of cells with GO profile

 
 # analyze the cell-by-GO BP profile as a count matrix
 # Note that the example data has very few cells (for reducing data size), so I am using a small UMAP min_dist and small spread here. Default min_dist is 0.3 and spread is 1.
 
mmu_go_analyzed = analyzeGOSeurat(go_seurat_obj = mmu_go_obj, cell_type_col = 'cell_type_annotation', min.dist = 0.1, spread = 0.5)
 #> perform normalization and log1p for mmu_go_obj
 #> Normalizing layer: counts
 #> Warning: The following arguments are not used: spread
 #> Finding variable features for layer counts
 #> Warning: The following arguments are not used: spread
 
 #> Warning: The following arguments are not used: spread
 
 #> Warning: The following arguments are not used: spread
 #> Computing nearest neighbor graph
 #> Computing SNN
 #> Warning: The following arguments are not used: spread
 
 #> Warning: The following arguments are not used: spread
 #> Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
 #> 
 #> Number of nodes: 219
 #> Number of edges: 9790
 #> 
 #> Running Louvain algorithm...
 #> Maximum modularity in 10 random starts: 0.4765
 #> Number of communities: 4
 #> Elapsed time: 0 seconds
 #> Warning: The default method for RunUMAP has changed from calling Python UMAP via reticulate to the R-native UWOT using the cosine metric
 #> To use Python UMAP via reticulate, set umap.method to 'umap-learn' and metric to 'correlation'
 #> This message will be shown once per session
 #> 17:05:29 UMAP embedding parameters a = 5.069 b = 1.003
 #> Found more than one class "dist" in cache; using the first, from namespace 'spam'
 #> Also defined by 'BiocGenerics'
 #> 17:05:29 Read 219 rows and found 50 numeric columns
 #> 17:05:29 Using Annoy for neighbor search, n_neighbors = 30
 #> Found more than one class "dist" in cache; using the first, from namespace 'spam'
 #> Also defined by 'BiocGenerics'
 #> 17:05:29 Building Annoy index with metric = cosine, n_trees = 50
 #> 0% 10 20 30 40 50 60 70 80 90 100%
 #> [----|----|----|----|----|----|----|----|----|----|
 #> **************************************************|
 #> 17:05:29 Writing NN index file to temp file /var/folders/37/wf962dk574750g0xnnlxjwvm0000gp/T//RtmpDbhzmO/file123c6576dc9b7
 #> 17:05:29 Searching Annoy index using 1 thread, search_k = 3000
 #> 17:05:29 Annoy recall = 100%
 #> 17:05:29 Commencing smooth kNN distance calibration using 1 thread with target n_neighbors = 30
 #> 17:05:30 Initializing from normalized Laplacian + noise (using RSpectra)
 #> 17:05:30 Commencing optimization for 500 epochs, with 7794 positive edges
 #> 17:05:30 Optimization finished

 # UMAP plot of the analyzed cell-by-GO BP profile
 # labeled by previously specified cell annotation column in meta.data
 
 DimPlot(mmu_go_analyzed, label = TRUE) + NoLegend()

dme_go_analyzed = analyzeGOSeurat(go_seurat_obj = dme_go_obj, cell_type_col = 'annotation', min_dist=0.1, spread=0.5)
 #> perform normalization and log1p for dme_go_obj
 #> Normalizing layer: counts
 #> Warning: The following arguments are not used: min_dist, spread
 #> Finding variable features for layer counts
 #> Warning: The following arguments are not used: min_dist, spread
 
 #> Warning: The following arguments are not used: min_dist, spread
 
 #> Warning: The following arguments are not used: min_dist, spread
 #> Computing nearest neighbor graph
 #> Computing SNN
 #> Warning: The following arguments are not used: min_dist, spread
 
 #> Warning: The following arguments are not used: min_dist, spread
 #> Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
 #> 
 #> Number of nodes: 180
 #> Number of edges: 6265
 #> 
 #> Running Louvain algorithm...
 #> Maximum modularity in 10 random starts: 0.5638
 #> Number of communities: 4
 #> Elapsed time: 0 seconds
 #> Warning: The following arguments are not used: min_dist
 #> 17:05:31 UMAP embedding parameters a = 3.244 b = 1.447
 #> Found more than one class "dist" in cache; using the first, from namespace 'spam'
 #> Also defined by 'BiocGenerics'
 #> 17:05:31 Read 180 rows and found 50 numeric columns
 #> 17:05:31 Using Annoy for neighbor search, n_neighbors = 30
 #> Found more than one class "dist" in cache; using the first, from namespace 'spam'
 #> Also defined by 'BiocGenerics'
 #> 17:05:31 Building Annoy index with metric = cosine, n_trees = 50
 #> 0% 10 20 30 40 50 60 70 80 90 100%
 #> [----|----|----|----|----|----|----|----|----|----|
 #> **************************************************|
 #> 17:05:31 Writing NN index file to temp file /var/folders/37/wf962dk574750g0xnnlxjwvm0000gp/T//RtmpDbhzmO/file123c67377bc6e
 #> 17:05:31 Searching Annoy index using 1 thread, search_k = 3000
 #> 17:05:31 Annoy recall = 100%
 #> 17:05:31 Commencing smooth kNN distance calibration using 1 thread with target n_neighbors = 30
 #> 17:05:32 Initializing from normalized Laplacian + noise (using RSpectra)
 #> 17:05:32 Commencing optimization for 500 epochs, with 6618 positive edges
 #> 17:05:32 Optimization finished

 DimPlot(dme_go_analyzed, label = TRUE) + NoLegend()

7. Get co-up and co-down regulated terms between pairs of cell types

 
 ## calculation takes a few minutes due to the Wilcox signed rank test
 
ct_shared_go = scGOclust::getCellTypeSharedGO(species_1 = 'mmusculus', species_2 = 'dmelanogaster', analyzed_go_seurat_sp1 = mmu_go_analyzed, analyzed_go_seurat_sp2 = dme_go_analyzed, cell_type_col_sp1 = 'cell_type_annotation', cell_type_col_sp2 = 'annotation', layer_use = 'data')

 head(ct_shared_go$shared_sig_markers)

 
 # query shared GO terms for specific cell type pairs
 
 getCellTypeSharedTerms(shared_go = ct_shared_go,
 cell_type_sp1 = 'intestine_Enteroendocrine cell', 
 cell_type_sp2 = 'enteroendocrine cell',
 return_full = FALSE)

 
 # viola
 sessionInfo()
 #> R version 4.3.2 (2023年10月31日)
 #> Platform: aarch64-apple-darwin20 (64-bit)
 #> Running under: macOS Sonoma 14.2.1
 #> 
 #> Matrix products: default
 #> BLAS: /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/lib/libRblas.0.dylib 
 #> LAPACK: /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/lib/libRlapack.dylib; LAPACK version 3.11.0
 #> 
 #> locale:
 #> [1] C/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
 #> 
 #> time zone: Europe/London
 #> tzcode source: internal
 #> 
 #> attached base packages:
 #> [1] stats graphics grDevices utils datasets methods base 
 #> 
 #> other attached packages:
 #> [1] scGOclust_0.2.1 httr_1.4.7 pheatmap_1.0.12 Seurat_5.0.1 
 #> [5] SeuratObject_5.0.1 sp_2.1-1 
 #> 
 #> loaded via a namespace (and not attached):
 #> [1] RcppAnnoy_0.0.21 splines_4.3.2 later_1.3.1 
 #> [4] bitops_1.0-7 filelock_1.0.2 tibble_3.2.1 
 #> [7] polyclip_1.10-6 XML_3.99-0.14 fastDummies_1.7.3 
 #> [10] lifecycle_1.0.4 globals_0.16.2 lattice_0.21-9 
 #> [13] MASS_7.3-60 magrittr_2.0.3 limma_3.56.2 
 #> [16] plotly_4.10.3 sass_0.4.7 rmarkdown_2.25 
 #> [19] jquerylib_0.1.4 yaml_2.3.7 httpuv_1.6.12 
 #> [22] sctransform_0.4.1 spam_2.10-0 spatstat.sparse_3.0-3 
 #> [25] reticulate_1.34.0 cowplot_1.1.1 pbapply_1.7-2 
 #> [28] DBI_1.1.3 RColorBrewer_1.1-3 abind_1.4-5 
 #> [31] zlibbioc_1.46.0 Rtsne_0.16 purrr_1.0.2 
 #> [34] BiocGenerics_0.46.0 RCurl_1.98-1.12 pracma_2.4.2 
 #> [37] rappdirs_0.3.3 GenomeInfoDbData_1.2.10 IRanges_2.34.1 
 #> [40] S4Vectors_0.38.2 ggrepel_0.9.4 irlba_2.3.5.1 
 #> [43] listenv_0.9.0 spatstat.utils_3.0-4 goftest_1.2-3 
 #> [46] RSpectra_0.16-1 spatstat.random_3.2-1 fitdistrplus_1.1-11 
 #> [49] parallelly_1.36.0 leiden_0.4.3.1 codetools_0.2-19 
 #> [52] xml2_1.3.5 tidyselect_1.2.0 farver_2.1.1 
 #> [55] matrixStats_1.1.0 stats4_4.3.2 BiocFileCache_2.8.0 
 #> [58] spatstat.explore_3.2-5 jsonlite_1.8.7 ellipsis_0.3.2 
 #> [61] progressr_0.14.0 ggridges_0.5.4 survival_3.5-7 
 #> [64] tools_4.3.2 progress_1.2.2 ica_1.0-3 
 #> [67] Rcpp_1.0.11 glue_1.6.2 gridExtra_2.3 
 #> [70] xfun_0.41 GenomeInfoDb_1.36.4 dplyr_1.1.4 
 #> [73] withr_2.5.2 fastmap_1.1.1 fansi_1.0.5 
 #> [76] digest_0.6.33 R6_2.5.1 mime_0.12 
 #> [79] colorspace_2.1-0 networkD3_0.4 scattermore_1.2 
 #> [82] tensor_1.5 spatstat.data_3.0-3 biomaRt_2.56.1 
 #> [85] RSQLite_2.3.1 utf8_1.2.4 tidyr_1.3.0 
 #> [88] generics_0.1.3 data.table_1.14.8 prettyunits_1.2.0 
 #> [91] htmlwidgets_1.6.2 slanter_0.2-0 uwot_0.1.16 
 #> [94] pkgconfig_2.0.3 gtable_0.3.4 blob_1.2.4 
 #> [97] lmtest_0.9-40 XVector_0.40.0 htmltools_0.5.7 
 #> [100] dotCall64_1.1-0 scales_1.2.1 Biobase_2.60.0 
 #> [103] png_0.1-8 knitr_1.45 rstudioapi_0.15.0 
 #> [106] reshape2_1.4.4 nlme_3.1-163 curl_5.1.0 
 #> [109] cachem_1.0.8 zoo_1.8-12 stringr_1.5.1 
 #> [112] KernSmooth_2.23-22 parallel_4.3.2 miniUI_0.1.1.1 
 #> [115] AnnotationDbi_1.62.2 pillar_1.9.0 grid_4.3.2 
 #> [118] vctrs_0.6.4 RANN_2.6.1 promises_1.2.1 
 #> [121] dbplyr_2.3.4 xtable_1.8-4 cluster_2.1.4 
 #> [124] evaluate_0.23 cli_3.6.1 compiler_4.3.2 
 #> [127] rlang_1.1.2 crayon_1.5.2 future.apply_1.11.0 
 #> [130] labeling_0.4.3 plyr_1.8.9 stringi_1.8.1 
 #> [133] viridisLite_0.4.2 deldir_1.0-9 munsell_0.5.0 
 #> [136] Biostrings_2.68.1 lazyeval_0.2.2 spatstat.geom_3.2-7 
 #> [139] Matrix_1.6-3 RcppHNSW_0.5.0 hms_1.1.3 
 #> [142] patchwork_1.1.3 bit64_4.0.5 future_1.33.0 
 #> [145] ggplot2_3.4.4 KEGGREST_1.40.1 shiny_1.7.5.1 
 #> [148] highr_0.10 ROCR_1.0-11 igraph_1.5.1 
 #> [151] memoise_2.0.1 bslib_0.5.1 bit_4.0.5