vignettes/quick_start.Rmd
quick_start.Rmd
library(ProjectSVR)
library(Seurat)
library(tidyverse)
options(timeout = max(3600, getOption("timeout")))
`%notin%` <- Negate(`%in%`)
if (!dir.exists("quickstart")) dir.create("quickstart")
# reference data
download.file(url = "https://zenodo.org/record/8147304/files/disco_pbmc_small.seurat.slim.rds", destfile = "quickstart/disco_pbmc_small.seurat.slim.rds")
# query data
download.file(url = "https://zenodo.org/record/8147304/files/query_pbmc_small.seurat.slim.rds", destfile = "quickstart/query_pbmc_small.seurat.slim.rds")
seu.ref <- readRDS("quickstart/disco_pbmc_small.seurat.slim.rds")
DimPlot(seu.ref, pt.size = .4) +
scale_color_manual(values = seu.ref@misc$data.refplot$colors) +
geom_text(inherit.aes = F, data = seu.ref@misc$data.refplot$text.pos,
mapping = aes(x, y, label = label), size = 4)
Here we extract the top25 marker genes for each cell type (ribosomal and mitochondrial genes were removed).
##
## memory CD4 T CD16 NK
## 200 200
## memory B naive CD4 T
## 200 200
## GZMB CD8 T proliferation T/NK
## 200 200
## naive CD8 T cytotoxic CD4 T
## 200 200
## naive B Treg
## 200 200
## GZMK NK GZMK CD8 T
## 200 200
## MAIT CD16 monocyte
## 200 200
## CD14 monocyte platelet
## 200 200
## plasma cell CD14/CD16 monocyte
## 200 200
## pDC cDC
## 200 200
## neutrophil RBC
## 200 200
## proliferation myeloid cell HSC
## 110 108
seu.ref[["RNA"]]@counts <- seu.ref[["RNA"]]@data
seu.ref <- NormalizeData(seu.ref)
all.markers <- mcFindAllMarkers(seu.ref, do.flatten = F, n.cores = 20)
top.genes <- lapply(all.markers, function(xx){
yy <- subset(xx, p_val_adj < 1e-6 & avg_log2FC > log2(1.5))
yy <- subset(yy, Gene.name.uniq %notin% ribo.genes)
yy <- yy[!grepl("^MT-", yy$Gene.name.uniq), ]
head(yy$Gene.name.uniq, 25)
})
sapply(top.genes, length)
## memory CD4 T CD16 NK
## 25 25
## memory B naive CD4 T
## 25 25
## GZMB CD8 T proliferation T/NK
## 25 25
## naive CD8 T cytotoxic CD4 T
## 25 25
## naive B Treg
## 25 25
## GZMK NK GZMK CD8 T
## 25 25
## MAIT CD16 monocyte
## 25 25
## CD14 monocyte platelet
## 25 25
## plasma cell CD14/CD16 monocyte
## 25 25
## pDC cDC
## 25 25
## neutrophil RBC
## 25 25
## proliferation myeloid cell HSC
## 25 25
seu.ref <- ComputeModuleScore(seu.ref, gene.sets = top.genes,
bg.genes = bg.genes, method = "UCell",
cores = 20)
# The signature score matrix is stored in 'SignatureScore' assay
Assays(seu.ref)
## [1] "RNA" "SignatureScore"
DefaultAssay(seu.ref) <- "SignatureScore"
FeaturePlot(seu.ref, features = "feature.1", pt.size = .3)
gss.mat <- FetchData(seu.ref, vars = rownames(seu.ref))
embeddings.df <- FetchData(seu.ref, vars = paste0("UMAP_", 1:2))
batch.size = 4000 # number of subsampled cells for each SVR model
n.models = 5 # number of SVR models trained
umap.model <- FitEnsembleSVM(feature.mat = gss.mat,
emb.mat = embeddings.df,
batch.size = batch.size,
n.models = n.models,
cores = 5)
[optional] colors: for plots
[optional] text.pos: text annotation on the reference plots
meta.data: cell meta data (embeddings + cell type information)
colors <- seu.ref@misc$data.refplot$colors
text.pos <- seu.ref@misc$data.refplot$text.pos
meta.data <- FetchData(seu.ref, vars = c(paste0("UMAP_", 1:2), "cell_type", "cell_subtype"))
reference <- CreateReference(umap.model = umap.model, # fitted SVR model
gene.sets = top.genes, # signatures for calculating gene set score
bg.genes = bg.genes, # background genes for calculating gene set score
meta.data = meta.data, # cell meta data of reference atlas
gss.method = "UCell", # algorithm for calculation of gene set score
colors = colors, # colors map to cell type
text.pos = text.pos) # text annotation on projection plot
class(reference) # return a reference model
## [1] "reference.model"
saveRDS(reference, "quickstart/model.disco_pbmc_quickstart.rds")
reference <- readRDS("quickstart/model.disco_pbmc_quickstart.rds")
seu.q <- readRDS("quickstart/query_pbmc_small.seurat.slim.rds")
seu.q[["RNA"]]@counts <- seu.q[["RNA"]]@data
seu.q <- ProjectSVR::MapQuery(seu.q, reference = reference, add.map.qual = T, ncores = 5)
p1 <- DimPlot(seu.q, reduction = "ref.umap", group.by = "donor")
p2 <- DimPlot(seu.q, reduction = "ref.umap", group.by = "cell_subtype",
label = T)
(p1 + p2) & ggsci::scale_color_d3("category20")
The metric for evaluating the mapping quality is essential for users to identify and discard erroneous projected cells. We introduce the mean kNN distance and demonstrate its utilization for quality control in reference mapping using ProjectSVR.
We believe that a good projection means the local topological
relationship should be kept after projection. Thus we build a nearest
neighbor (NN) graph in feature space (signature score matrix) and
measure the average distance of its K nearest neighbors (called
mean.knn.dist
). A smaller mean.knn.dist
means
a good projection.
To access the p-value of a given mean.knn.dist
, we build
the null distribution by calculating the randomly selected K nearest
neighbors for the given cell and repeat the process by 1000 times. Then
the empirical p values were calculated according to this null
distribution. Adjusted p values were calculated by Benjamini-Hochberg
Procedure.
data.plot <- FetchData(seu.q, vars = c("mean.knn.dist", "mapQ.p.adj"))
ggplot(data.plot, aes(mean.knn.dist, -log10(mapQ.p.adj))) +
geom_point(size = .3)
## cutoff by adjusted p value
MapQCPlot(seu.q, p.adj.cutoff = 1e-3)
## or mean.knn.dist
MapQCPlot(seu.q, map.q.cutoff = 4)
seu.q$cell_subtype.orig <- seu.q$cell_subtype
PlotProjection(seu.q, reference, split.by = "cell_subtype.orig",
ref.color.by = "cell_subtype",
ref.size = .5, ref.alpha = .3, query.size = 1,
query.alpha = .5, n.row = 4)
seu.q <- ProjectSVR::LabelTransfer(seu.q, reference, ref.label.col = "cell_subtype")
DimPlot(seu.q, reduction = "ref.umap", group.by = "knn.pred.celltype") +
scale_color_manual(values = reference$ref.cellmeta$colors)
data.stat <- table(seu.q$cell_subtype, seu.q$knn.pred.celltype)
pheatmap::pheatmap(data.stat, display_numbers = T, number_format = "%.0f",
number_color = "black")
## R version 4.1.2 (2021-11-01)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 22.04.2 LTS
##
## Matrix products: default
## BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.10.0
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.10.0
##
## locale:
## [1] LC_CTYPE=C.UTF-8 LC_NUMERIC=C LC_TIME=C.UTF-8
## [4] LC_COLLATE=C.UTF-8 LC_MONETARY=C.UTF-8 LC_MESSAGES=C.UTF-8
## [7] LC_PAPER=C.UTF-8 LC_NAME=C LC_ADDRESS=C
## [10] LC_TELEPHONE=C LC_MEASUREMENT=C.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] lubridate_1.9.2 forcats_1.0.0 stringr_1.5.0 dplyr_1.1.3
## [5] purrr_1.0.2 readr_2.1.4 tidyr_1.3.0 tibble_3.2.1
## [9] ggplot2_3.4.3 tidyverse_2.0.0 SeuratObject_4.1.3 Seurat_4.3.0.1
## [13] ProjectSVR_0.2.0
##
## loaded via a namespace (and not attached):
## [1] utf8_1.2.3 spatstat.explore_3.2-3 reticulate_1.31
## [4] tidyselect_1.2.0 mlr3learners_0.5.6 htmlwidgets_1.6.2
## [7] BiocParallel_1.28.3 grid_4.1.2 Rtsne_0.16
## [10] mlr3misc_0.12.0 munsell_0.5.0 codetools_0.2-18
## [13] bbotk_0.7.2 ragg_1.2.5 ica_1.0-3
## [16] future_1.33.0 miniUI_0.1.1.1 mlr3verse_0.2.8
## [19] withr_2.5.0 spatstat.random_3.1-6 colorspace_2.1-0
## [22] progressr_0.14.0 highr_0.10 knitr_1.43
## [25] uuid_1.1-1 rstudioapi_0.15.0 stats4_4.1.2
## [28] ROCR_1.0-11 robustbase_0.99-0 tensor_1.5
## [31] listenv_0.9.0 labeling_0.4.3 mlr3tuning_0.19.0
## [34] polyclip_1.10-4 lgr_0.4.4 pheatmap_1.0.12
## [37] farver_2.1.1 rprojroot_2.0.3 parallelly_1.36.0
## [40] vctrs_0.6.3 generics_0.1.3 xfun_0.40
## [43] timechange_0.2.0 diptest_0.76-0 R6_2.5.1
## [46] doParallel_1.0.17 clue_0.3-64 isoband_0.2.7
## [49] flexmix_2.3-19 spatstat.utils_3.0-3 cachem_1.0.8
## [52] promises_1.2.1 scales_1.2.1 nnet_7.3-17
## [55] gtable_0.3.4 globals_0.16.2 goftest_1.2-3
## [58] mlr3hyperband_0.4.5 mlr3mbo_0.2.1 rlang_1.1.1
## [61] systemfonts_1.0.4 GlobalOptions_0.1.2 splines_4.1.2
## [64] lazyeval_0.2.2 paradox_0.11.1 spatstat.geom_3.2-5
## [67] checkmate_2.2.0 yaml_2.3.7 reshape2_1.4.4
## [70] abind_1.4-5 mlr3_0.16.1 backports_1.4.1
## [73] httpuv_1.6.11 tools_4.1.2 ellipsis_0.3.2
## [76] jquerylib_0.1.4 RColorBrewer_1.1-3 BiocGenerics_0.40.0
## [79] ggridges_0.5.4 Rcpp_1.0.11 plyr_1.8.8
## [82] deldir_1.0-9 pbapply_1.7-2 GetoptLong_1.0.5
## [85] cowplot_1.1.1 S4Vectors_0.32.4 zoo_1.8-12
## [88] ggrepel_0.9.3 cluster_2.1.2 here_1.0.1
## [91] fs_1.6.3 magrittr_2.0.3 data.table_1.14.8
## [94] scattermore_1.2 circlize_0.4.15 lmtest_0.9-40
## [97] RANN_2.6.1 fitdistrplus_1.1-11 matrixStats_1.0.0
## [100] hms_1.1.3 patchwork_1.1.3 mime_0.12
## [103] evaluate_0.21 xtable_1.8-4 mclust_6.0.0
## [106] IRanges_2.28.0 gridExtra_2.3 shape_1.4.6
## [109] UCell_1.3.1 compiler_4.1.2 mlr3cluster_0.1.8
## [112] KernSmooth_2.23-20 crayon_1.5.2 htmltools_0.5.6
## [115] tzdb_0.4.0 later_1.3.1 ComplexHeatmap_2.10.0
## [118] rappdirs_0.3.3 MASS_7.3-55 fpc_2.2-10
## [121] mlr3data_0.7.0 Matrix_1.6-1 cli_3.6.1
## [124] parallel_4.1.2 igraph_1.5.1 pkgconfig_2.0.3
## [127] pkgdown_2.0.7 sp_2.0-0 plotly_4.10.2
## [130] spatstat.sparse_3.0-2 foreach_1.5.2 bslib_0.5.1
## [133] mlr3fselect_0.11.0 digest_0.6.33 sctransform_0.3.5
## [136] RcppAnnoy_0.0.21 mlr3filters_0.7.1 spatstat.data_3.0-1
## [139] rmarkdown_2.24 leiden_0.4.3 uwot_0.1.16
## [142] kernlab_0.9-32 shiny_1.7.5 modeltools_0.2-23
## [145] rjson_0.2.21 nlme_3.1-155 lifecycle_1.0.3
## [148] jsonlite_1.8.7 mlr3tuningspaces_0.4.0 desc_1.4.2
## [151] viridisLite_0.4.2 fansi_1.0.4 pillar_1.9.0
## [154] ggsci_3.0.0 lattice_0.20-45 fastmap_1.1.1
## [157] httr_1.4.7 DEoptimR_1.1-2 survival_3.2-13
## [160] glue_1.6.2 mlr3viz_0.6.1 png_0.1-8
## [163] prabclus_2.3-2 iterators_1.0.14 spacefillr_0.3.2
## [166] class_7.3-20 stringi_1.7.12 sass_0.4.7
## [169] mlr3pipelines_0.5.0-1 palmerpenguins_0.1.1 textshaping_0.3.6
## [172] memoise_2.0.1 irlba_2.3.5.1 future.apply_1.11.0