Contents

1 Theory

See slides.

A recent tweet provides a nice summary of efforts to benchmark gene set enrichement analysis methods using the GSEABenchmarkR package.

2 Practice

Data input and massage

library(airway)
data(airway)
airway$dex <- relevel(airway$dex, "untrt")

Differential expression analysis

library(DESeq2)
des <- DESeqDataSet(airway, design = ~ cell + dex)
des <- DESeq(des)
res <- results(des)

Transition to tidy data

library(dplyr)
library(tibble)
tbl <- res %>%
 as.data.frame() %>%
 as_tibble(rownames = "ENSEMBL")
tbl
## # A tibble: 64,102 x 7
## ENSEMBL baseMean log2FoldChange lfcSE stat pvalue padj
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 ENSG0000000... 709. -0.381 0.101 -3.79 1.52e-4 1.28e-3
## 2 ENSG0000000... 0 NA NA NA NA NA 
## 3 ENSG0000000... 520. 0.207 0.112 1.84 6.53e-2 1.97e-1
## 4 ENSG0000000... 237. 0.0379 0.143 0.264 7.92e-1 9.11e-1
## 5 ENSG0000000... 57.9 -0.0882 0.287 -0.307 7.59e-1 8.95e-1
## 6 ENSG0000000... 0.318 -1.38 3.50 -0.394 6.94e-1 NA 
## 7 ENSG0000000... 5817. 0.426 0.0883 4.83 1.38e-6 1.82e-5
## 8 ENSG0000000... 1282. -0.241 0.0887 -2.72 6.58e-3 3.28e-2
## 9 ENSG0000000... 610. -0.0476 0.167 -0.286 7.75e-1 9.03e-1
## 10 ENSG0000000... 369. -0.500 0.121 -4.14 3.48e-5 3.42e-4
## # ... with 64,092 more rows

2.1 Example: hypergeometric test using limma ::goana()

Requires ENTREZ identifiers

library(org.Hs.eg.db)
tbl <- tbl %>%
 mutate(
 ENTREZID = mapIds(org.Hs.eg.db, ENSEMBL, "ENTREZID", "ENSEMBL")
 ) %>%
 dplyr::select(ENSEMBL, ENTREZID, everything())
tbl
## # A tibble: 64,102 x 8
## ENSEMBL ENTREZID baseMean log2FoldChange lfcSE stat pvalue
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 ENSG00... 7105 709. -0.381 0.101 -3.79 1.52e-4
## 2 ENSG00... 64102 0 NA NA NA NA 
## 3 ENSG00... 8813 520. 0.207 0.112 1.84 6.53e-2
## 4 ENSG00... 57147 237. 0.0379 0.143 0.264 7.92e-1
## 5 ENSG00... 55732 57.9 -0.0882 0.287 -0.307 7.59e-1
## 6 ENSG00... 2268 0.318 -1.38 3.50 -0.394 6.94e-1
## 7 ENSG00... 3075 5817. 0.426 0.0883 4.83 1.38e-6
## 8 ENSG00... 2519 1282. -0.241 0.0887 -2.72 6.58e-3
## 9 ENSG00... 2729 610. -0.0476 0.167 -0.286 7.75e-1
## 10 ENSG00... 4800 369. -0.500 0.121 -4.14 3.48e-5
## # ... with 64,092 more rows, and 1 more variable: padj <dbl>

Universe – must be testable for DE

tbl <- tbl %>%
 filter(!is.na(padj), !is.na(ENTREZID))
tbl
## # A tibble: 14,550 x 8
## ENSEMBL ENTREZID baseMean log2FoldChange lfcSE stat pvalue padj
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 ENSG0000... 7105 709. -0.381 0.101 -3.79 1.52e-4 1.28e-3
## 2 ENSG0000... 8813 520. 0.207 0.112 1.84 6.53e-2 1.97e-1
## 3 ENSG0000... 57147 237. 0.0379 0.143 0.264 7.92e-1 9.11e-1
## 4 ENSG0000... 55732 57.9 -0.0882 0.287 -0.307 7.59e-1 8.95e-1
## 5 ENSG0000... 3075 5817. 0.426 0.0883 4.83 1.38e-6 1.82e-5
## 6 ENSG0000... 2519 1282. -0.241 0.0887 -2.72 6.58e-3 3.28e-2
## 7 ENSG0000... 2729 610. -0.0476 0.167 -0.286 7.75e-1 9.03e-1
## 8 ENSG0000... 4800 369. -0.500 0.121 -4.14 3.48e-5 3.42e-4
## 9 ENSG0000... 90529 183. -0.124 0.180 -0.689 4.91e-1 7.24e-1
## 10 ENSG0000... 57185 2814. -0.0411 0.103 -0.400 6.89e-1 8.57e-1
## # ... with 14,540 more rows

limma ::goana() – Hypergeometric

library(limma)
go <-
 goana(tbl$ENTREZID[tbl$padj < .05], tbl$ENTREZID, "Hs") %>%
 as_tibble(rownames = "GOID") %>%
 dplyr::select(GOID, Ont, everything())
go
## # A tibble: 20,656 x 6
## GOID Ont Term N DE P.DE
## <chr> <chr> <chr> <dbl> <dbl> <dbl>
## 1 GO:00017... BP cell activation 939 333 6.80e-12
## 2 GO:00022... BP immune effector process 808 251 2.98e- 4
## 3 GO:00022... BP cell activation involved in immune... 498 148 2.43e- 2
## 4 GO:00022... BP myeloid leukocyte activation 463 147 2.01e- 3
## 5 GO:00022... BP myeloid cell activation involved i... 399 118 4.61e- 2
## 6 GO:00022... BP neutrophil activation involved in ... 361 108 4.08e- 2
## 7 GO:00023... BP leukocyte activation involved in i... 494 148 1.86e- 2
## 8 GO:00023... BP immune system process 1958 652 8.14e-16
## 9 GO:00024... BP leukocyte mediated immunity 538 165 5.33e- 3
## 10 GO:00024... BP myeloid leukocyte mediated immunity 408 124 1.89e- 2
## # ... with 20,646 more rows

What GO id’s are most differentially expressed? (Why do these gene sets seem to have a large size, N?)

go %>% arrange(P.DE)
## # A tibble: 20,656 x 6
## GOID Ont Term N DE P.DE
## <chr> <chr> <chr> <dbl> <dbl> <dbl>
## 1 GO:0071944 CC cell periphery 3081 1104 8.45e-45
## 2 GO:0050896 BP response to stimulus 5799 1859 8.77e-45
## 3 GO:0032502 BP developmental process 4168 1410 9.73e-44
## 4 GO:0048856 BP anatomical structure development 3891 1330 3.45e-43
## 5 GO:0005886 CC plasma membrane 3013 1078 3.83e-43
## 6 GO:0032501 BP multicellular organismal process 4678 1546 1.83e-42
## 7 GO:0048731 BP system development 3202 1128 7.52e-42
## 8 GO:0023052 BP signaling 4060 1372 7.66e-42
## 9 GO:0007275 BP multicellular organism development 3579 1231 1.46e-40
## 10 GO:0007154 BP cell communication 4076 1371 1.57e-40
## # ... with 20,646 more rows

Sanity check?

go %>%
 filter(grepl("glucocorticoid", Term)) %>%
 arrange(P.DE)
## # A tibble: 22 x 6
## GOID Ont Term N DE P.DE
## <chr> <chr> <chr> <dbl> <dbl> <dbl>
## 1 GO:0051... BP response to glucocorticoid 92 43 1.11e-5
## 2 GO:0071... BP cellular response to glucocorticoid... 42 21 6.42e-4
## 3 GO:2000... BP positive regulation of glucocortico... 2 2 6.64e-2
## 4 GO:2000... BP regulation of glucocorticoid recept... 8 4 1.25e-1
## 5 GO:0006... BP glucocorticoid biosynthetic process 8 4 1.25e-1
## 6 GO:0043... BP glucocorticoid mediated signaling p... 3 2 1.65e-1
## 7 GO:0008... BP glucocorticoid metabolic process 13 5 2.26e-1
## 8 GO:0004... MF glucocorticoid receptor activity 1 1 2.58e-1
## 9 GO:0031... BP negative regulation of glucocortico... 4 2 2.75e-1
## 10 GO:0031... BP negative regulation of glucocortico... 4 2 2.75e-1
## # ... with 12 more rows

What genes in set?

genesets <-
 AnnotationDbi::select(org.Hs.eg.db, tbl$ENTREZID, "GOALL", "ENTREZID") %>%
 as_tibble() %>%
 dplyr::select(GOID = GOALL, Ont = ONTOLOGYALL, ENTREZID) %>%
 distinct() %>%
 arrange(Ont, GOID)
genesets
## # A tibble: 1,574,080 x 3
## GOID Ont ENTREZID
## <chr> <chr> <chr> 
## 1 GO:0000002 BP 3980 
## 2 GO:0000002 BP 1890 
## 3 GO:0000002 BP 50484 
## 4 GO:0000002 BP 4205 
## 5 GO:0000002 BP 9093 
## 6 GO:0000002 BP 56652 
## 7 GO:0000002 BP 10891 
## 8 GO:0000002 BP 55186 
## 9 GO:0000002 BP 4358 
## 10 GO:0000002 BP 10000 
## # ... with 1,574,070 more rows

3 Provenance

sessionInfo()
## R version 3.6.0 (2019年04月26日)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Linux Mint 19
## 
## Matrix products: default
## BLAS: /home/msmith/Applications/R/R-3.6.0/lib/libRblas.so
## LAPACK: /home/msmith/Applications/R/R-3.6.0/lib/libRlapack.so
## 
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C 
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8 
## [5] LC_MONETARY=de_DE.UTF-8 LC_MESSAGES=en_US.UTF-8 
## [7] LC_PAPER=de_DE.UTF-8 LC_NAME=C 
## [9] LC_ADDRESS=C LC_TELEPHONE=C 
## [11] LC_MEASUREMENT=de_DE.UTF-8 LC_IDENTIFICATION=C 
## 
## attached base packages:
## [1] parallel stats4 stats graphics grDevices utils datasets 
## [8] methods base 
## 
## other attached packages:
## [1] tibble_2.1.3 limma_3.40.2 
## [3] GO.db_3.8.2 org.Hs.eg.db_3.8.2 
## [5] AnnotationDbi_1.46.0 dplyr_0.8.3 
## [7] airway_1.4.0 DESeq2_1.24.0 
## [9] SummarizedExperiment_1.14.0 DelayedArray_0.10.0 
## [11] BiocParallel_1.18.0 matrixStats_0.54.0 
## [13] Biobase_2.44.0 GenomicRanges_1.36.0 
## [15] GenomeInfoDb_1.20.0 IRanges_2.18.1 
## [17] S4Vectors_0.22.0 BiocGenerics_0.30.0 
## [19] BiocStyle_2.12.0 
## 
## loaded via a namespace (and not attached):
## [1] bit64_0.9-7 splines_3.6.0 Formula_1.2-3 
## [4] assertthat_0.2.1 BiocManager_1.30.4 latticeExtra_0.6-28 
## [7] blob_1.2.0 GenomeInfoDbData_1.2.1 yaml_2.2.0 
## [10] pillar_1.4.2 RSQLite_2.1.1 backports_1.1.4 
## [13] lattice_0.20-38 glue_1.3.1 digest_0.6.20 
## [16] RColorBrewer_1.1-2 XVector_0.24.0 checkmate_1.9.4 
## [19] colorspace_1.4-1 htmltools_0.3.6 Matrix_1.2-17 
## [22] XML_3.98-1.20 pkgconfig_2.0.2 genefilter_1.66.0 
## [25] bookdown_0.12 zlibbioc_1.30.0 purrr_0.3.2 
## [28] xtable_1.8-4 scales_1.0.0 htmlTable_1.13.1 
## [31] annotate_1.62.0 ggplot2_3.2.0 nnet_7.3-12 
## [34] lazyeval_0.2.2 cli_1.1.0 survival_2.44-1.1 
## [37] magrittr_1.5 crayon_1.3.4 memoise_1.1.0 
## [40] evaluate_0.14 fansi_0.4.0 foreign_0.8-71 
## [43] tools_3.6.0 data.table_1.12.2 stringr_1.4.0 
## [46] locfit_1.5-9.1 munsell_0.5.0 cluster_2.1.0 
## [49] compiler_3.6.0 rlang_0.4.0 grid_3.6.0 
## [52] RCurl_1.95-4.12 rstudioapi_0.10 htmlwidgets_1.3 
## [55] bitops_1.0-6 base64enc_0.1-3 rmarkdown_1.14 
## [58] codetools_0.2-16 gtable_0.3.0 DBI_1.0.0 
## [61] R6_2.4.0 gridExtra_2.3 knitr_1.23 
## [64] utf8_1.1.4 zeallot_0.1.0 bit_1.1-14 
## [67] Hmisc_4.2-0 stringi_1.4.3 Rcpp_1.0.1 
## [70] geneplotter_1.62.0 vctrs_0.2.0 rpart_4.1-15 
## [73] acepack_1.4.1 tidyselect_0.2.5 xfun_0.8

AltStyle によって変換されたページ (->オリジナル) /