library(ProjectSVR)
library(Seurat)
library(tidyverse)
options(timeout = max(3600, getOption("timeout")))
`%notin%` <- Negate(`%in%`)
if (!dir.exists("quickstart")) dir.create("quickstart")
# reference data
download.file(url = "https://zenodo.org/record/8147304/files/disco_pbmc_small.seurat.slim.rds", destfile = "quickstart/disco_pbmc_small.seurat.slim.rds")
# query data
download.file(url = "https://zenodo.org/record/8147304/files/query_pbmc_small.seurat.slim.rds", destfile = "quickstart/query_pbmc_small.seurat.slim.rds")

Build Reference Model

seu.ref <- readRDS("quickstart/disco_pbmc_small.seurat.slim.rds")

DimPlot(seu.ref, pt.size = .4) +
  scale_color_manual(values = seu.ref@misc$data.refplot$colors) +
  geom_text(inherit.aes = F, data = seu.ref@misc$data.refplot$text.pos,
            mapping = aes(x, y, label = label), size = 4)

Extract signatures

Here we extract the top25 marker genes for each cell type (ribosomal and mitochondrial genes were removed).

data(ribo.genes)

table(Idents(seu.ref))
## 
##               memory CD4 T                    CD16 NK 
##                        200                        200 
##                   memory B                naive CD4 T 
##                        200                        200 
##                 GZMB CD8 T         proliferation T/NK 
##                        200                        200 
##                naive CD8 T            cytotoxic CD4 T 
##                        200                        200 
##                    naive B                       Treg 
##                        200                        200 
##                    GZMK NK                 GZMK CD8 T 
##                        200                        200 
##                       MAIT              CD16 monocyte 
##                        200                        200 
##              CD14 monocyte                   platelet 
##                        200                        200 
##                plasma cell         CD14/CD16 monocyte 
##                        200                        200 
##                        pDC                        cDC 
##                        200                        200 
##                 neutrophil                        RBC 
##                        200                        200 
## proliferation myeloid cell                        HSC 
##                        110                        108
seu.ref[["RNA"]]@counts <- seu.ref[["RNA"]]@data
seu.ref <- NormalizeData(seu.ref)
all.markers <- mcFindAllMarkers(seu.ref, do.flatten = F, n.cores = 20)

top.genes <- lapply(all.markers, function(xx){
  yy <- subset(xx, p_val_adj < 1e-6 & avg_log2FC > log2(1.5))
  yy <- subset(yy, Gene.name.uniq %notin% ribo.genes)
  yy <- yy[!grepl("^MT-", yy$Gene.name.uniq), ]
  head(yy$Gene.name.uniq, 25)
})

sapply(top.genes, length)
##               memory CD4 T                    CD16 NK 
##                         25                         25 
##                   memory B                naive CD4 T 
##                         25                         25 
##                 GZMB CD8 T         proliferation T/NK 
##                         25                         25 
##                naive CD8 T            cytotoxic CD4 T 
##                         25                         25 
##                    naive B                       Treg 
##                         25                         25 
##                    GZMK NK                 GZMK CD8 T 
##                         25                         25 
##                       MAIT              CD16 monocyte 
##                         25                         25 
##              CD14 monocyte                   platelet 
##                         25                         25 
##                plasma cell         CD14/CD16 monocyte 
##                         25                         25 
##                        pDC                        cDC 
##                         25                         25 
##                 neutrophil                        RBC 
##                         25                         25 
## proliferation myeloid cell                        HSC 
##                         25                         25
# rename the gene set
# [Note] Avoid to use '_' or '-' in the gene set names.
names(top.genes) <- paste0("feature.", 1:length(top.genes))
# the background genes are the union of all markers
bg.genes <- do.call(c, top.genes) %>% unique()

Transfer raw count matrix to gene set score matrix

seu.ref <- ComputeModuleScore(seu.ref, gene.sets = top.genes, 
                              bg.genes = bg.genes, method = "UCell", 
                              cores = 20)
# The signature score matrix is stored in 'SignatureScore' assay
Assays(seu.ref)
## [1] "RNA"            "SignatureScore"
DefaultAssay(seu.ref) <- "SignatureScore"
FeaturePlot(seu.ref, features = "feature.1", pt.size = .3)

Training reference model

gss.mat <- FetchData(seu.ref, vars = rownames(seu.ref))
embeddings.df <- FetchData(seu.ref, vars = paste0("UMAP_", 1:2))
batch.size = 4000 # number of subsampled cells for each SVR model 
n.models = 5      # number of SVR models trained
umap.model <- FitEnsembleSVM(feature.mat = gss.mat,
                             emb.mat = embeddings.df,
                             batch.size = batch.size,
                             n.models = n.models,
                             cores = 5)

Save the reference model

  1. [optional] colors: for plots

  2. [optional] text.pos: text annotation on the reference plots

  3. meta.data: cell meta data (embeddings + cell type information)

colors <- seu.ref@misc$data.refplot$colors
text.pos <- seu.ref@misc$data.refplot$text.pos
meta.data <- FetchData(seu.ref, vars = c(paste0("UMAP_", 1:2), "cell_type", "cell_subtype"))

reference <- CreateReference(umap.model = umap.model, # fitted SVR model
                             gene.sets = top.genes, # signatures for calculating gene set score
                             bg.genes = bg.genes,   # background genes for calculating gene set score
                             meta.data = meta.data, # cell meta data of reference atlas
                             gss.method = "UCell",  # algorithm for calculation of gene set score
                             colors = colors,       # colors map to cell type
                             text.pos = text.pos)   # text annotation on projection plot
class(reference) # return a reference model
## [1] "reference.model"
saveRDS(reference, "quickstart/model.disco_pbmc_quickstart.rds")

Map Query to Reference

Reference mapping

reference <- readRDS("quickstart/model.disco_pbmc_quickstart.rds")
seu.q <- readRDS("quickstart/query_pbmc_small.seurat.slim.rds")
seu.q[["RNA"]]@counts <- seu.q[["RNA"]]@data
seu.q <- ProjectSVR::MapQuery(seu.q, reference = reference, add.map.qual = T, ncores = 5)

p1 <- DimPlot(seu.q, reduction = "ref.umap", group.by = "donor")
p2 <- DimPlot(seu.q, reduction = "ref.umap", group.by = "cell_subtype", 
              label = T)
(p1 + p2) & ggsci::scale_color_d3("category20")

Maping quality

The metric for evaluating the mapping quality is essential for users to identify and discard erroneous projected cells. We introduce the mean kNN distance and demonstrate its utilization for quality control in reference mapping using ProjectSVR.

We believe that a good projection means the local topological relationship should be kept after projection. Thus we build a nearest neighbor (NN) graph in feature space (signature score matrix) and measure the average distance of its K nearest neighbors (called mean.knn.dist). A smaller mean.knn.dist means a good projection.

To access the p-value of a given mean.knn.dist, we build the null distribution by calculating the randomly selected K nearest neighbors for the given cell and repeat the process by 1000 times. Then the empirical p values were calculated according to this null distribution. Adjusted p values were calculated by Benjamini-Hochberg Procedure.

data.plot <- FetchData(seu.q, vars = c("mean.knn.dist", "mapQ.p.adj"))

ggplot(data.plot, aes(mean.knn.dist, -log10(mapQ.p.adj))) +
  geom_point(size = .3)

## cutoff by adjusted p value
MapQCPlot(seu.q, p.adj.cutoff = 1e-3)

## or mean.knn.dist
MapQCPlot(seu.q, map.q.cutoff = 4)

seu.q$cell_subtype.orig <- seu.q$cell_subtype

PlotProjection(seu.q, reference, split.by = "cell_subtype.orig", 
               ref.color.by = "cell_subtype",
               ref.size = .5, ref.alpha = .3, query.size = 1, 
               query.alpha = .5, n.row = 4)

Label transfer

seu.q <- ProjectSVR::LabelTransfer(seu.q, reference, ref.label.col = "cell_subtype")

DimPlot(seu.q, reduction = "ref.umap", group.by = "knn.pred.celltype") + 
  scale_color_manual(values = reference$ref.cellmeta$colors)

data.stat <- table(seu.q$cell_subtype, seu.q$knn.pred.celltype)
pheatmap::pheatmap(data.stat, display_numbers = T, number_format = "%.0f", 
                   number_color = "black")

Session Info
## R version 4.1.2 (2021-11-01)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 22.04.2 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.10.0
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.10.0
## 
## locale:
##  [1] LC_CTYPE=C.UTF-8       LC_NUMERIC=C           LC_TIME=C.UTF-8       
##  [4] LC_COLLATE=C.UTF-8     LC_MONETARY=C.UTF-8    LC_MESSAGES=C.UTF-8   
##  [7] LC_PAPER=C.UTF-8       LC_NAME=C              LC_ADDRESS=C          
## [10] LC_TELEPHONE=C         LC_MEASUREMENT=C.UTF-8 LC_IDENTIFICATION=C   
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] lubridate_1.9.2    forcats_1.0.0      stringr_1.5.0      dplyr_1.1.3       
##  [5] purrr_1.0.2        readr_2.1.4        tidyr_1.3.0        tibble_3.2.1      
##  [9] ggplot2_3.4.3      tidyverse_2.0.0    SeuratObject_4.1.3 Seurat_4.3.0.1    
## [13] ProjectSVR_0.2.0  
## 
## loaded via a namespace (and not attached):
##   [1] utf8_1.2.3             spatstat.explore_3.2-3 reticulate_1.31       
##   [4] tidyselect_1.2.0       mlr3learners_0.5.6     htmlwidgets_1.6.2     
##   [7] BiocParallel_1.28.3    grid_4.1.2             Rtsne_0.16            
##  [10] mlr3misc_0.12.0        munsell_0.5.0          codetools_0.2-18      
##  [13] bbotk_0.7.2            ragg_1.2.5             ica_1.0-3             
##  [16] future_1.33.0          miniUI_0.1.1.1         mlr3verse_0.2.8       
##  [19] withr_2.5.0            spatstat.random_3.1-6  colorspace_2.1-0      
##  [22] progressr_0.14.0       highr_0.10             knitr_1.43            
##  [25] uuid_1.1-1             rstudioapi_0.15.0      stats4_4.1.2          
##  [28] ROCR_1.0-11            robustbase_0.99-0      tensor_1.5            
##  [31] listenv_0.9.0          labeling_0.4.3         mlr3tuning_0.19.0     
##  [34] polyclip_1.10-4        lgr_0.4.4              pheatmap_1.0.12       
##  [37] farver_2.1.1           rprojroot_2.0.3        parallelly_1.36.0     
##  [40] vctrs_0.6.3            generics_0.1.3         xfun_0.40             
##  [43] timechange_0.2.0       diptest_0.76-0         R6_2.5.1              
##  [46] doParallel_1.0.17      clue_0.3-64            isoband_0.2.7         
##  [49] flexmix_2.3-19         spatstat.utils_3.0-3   cachem_1.0.8          
##  [52] promises_1.2.1         scales_1.2.1           nnet_7.3-17           
##  [55] gtable_0.3.4           globals_0.16.2         goftest_1.2-3         
##  [58] mlr3hyperband_0.4.5    mlr3mbo_0.2.1          rlang_1.1.1           
##  [61] systemfonts_1.0.4      GlobalOptions_0.1.2    splines_4.1.2         
##  [64] lazyeval_0.2.2         paradox_0.11.1         spatstat.geom_3.2-5   
##  [67] checkmate_2.2.0        yaml_2.3.7             reshape2_1.4.4        
##  [70] abind_1.4-5            mlr3_0.16.1            backports_1.4.1       
##  [73] httpuv_1.6.11          tools_4.1.2            ellipsis_0.3.2        
##  [76] jquerylib_0.1.4        RColorBrewer_1.1-3     BiocGenerics_0.40.0   
##  [79] ggridges_0.5.4         Rcpp_1.0.11            plyr_1.8.8            
##  [82] deldir_1.0-9           pbapply_1.7-2          GetoptLong_1.0.5      
##  [85] cowplot_1.1.1          S4Vectors_0.32.4       zoo_1.8-12            
##  [88] ggrepel_0.9.3          cluster_2.1.2          here_1.0.1            
##  [91] fs_1.6.3               magrittr_2.0.3         data.table_1.14.8     
##  [94] scattermore_1.2        circlize_0.4.15        lmtest_0.9-40         
##  [97] RANN_2.6.1             fitdistrplus_1.1-11    matrixStats_1.0.0     
## [100] hms_1.1.3              patchwork_1.1.3        mime_0.12             
## [103] evaluate_0.21          xtable_1.8-4           mclust_6.0.0          
## [106] IRanges_2.28.0         gridExtra_2.3          shape_1.4.6           
## [109] UCell_1.3.1            compiler_4.1.2         mlr3cluster_0.1.8     
## [112] KernSmooth_2.23-20     crayon_1.5.2           htmltools_0.5.6       
## [115] tzdb_0.4.0             later_1.3.1            ComplexHeatmap_2.10.0 
## [118] rappdirs_0.3.3         MASS_7.3-55            fpc_2.2-10            
## [121] mlr3data_0.7.0         Matrix_1.6-1           cli_3.6.1             
## [124] parallel_4.1.2         igraph_1.5.1           pkgconfig_2.0.3       
## [127] pkgdown_2.0.7          sp_2.0-0               plotly_4.10.2         
## [130] spatstat.sparse_3.0-2  foreach_1.5.2          bslib_0.5.1           
## [133] mlr3fselect_0.11.0     digest_0.6.33          sctransform_0.3.5     
## [136] RcppAnnoy_0.0.21       mlr3filters_0.7.1      spatstat.data_3.0-1   
## [139] rmarkdown_2.24         leiden_0.4.3           uwot_0.1.16           
## [142] kernlab_0.9-32         shiny_1.7.5            modeltools_0.2-23     
## [145] rjson_0.2.21           nlme_3.1-155           lifecycle_1.0.3       
## [148] jsonlite_1.8.7         mlr3tuningspaces_0.4.0 desc_1.4.2            
## [151] viridisLite_0.4.2      fansi_1.0.4            pillar_1.9.0          
## [154] ggsci_3.0.0            lattice_0.20-45        fastmap_1.1.1         
## [157] httr_1.4.7             DEoptimR_1.1-2         survival_3.2-13       
## [160] glue_1.6.2             mlr3viz_0.6.1          png_0.1-8             
## [163] prabclus_2.3-2         iterators_1.0.14       spacefillr_0.3.2      
## [166] class_7.3-20           stringi_1.7.12         sass_0.4.7            
## [169] mlr3pipelines_0.5.0-1  palmerpenguins_0.1.1   textshaping_0.3.6     
## [172] memoise_2.0.1          irlba_2.3.5.1          future.apply_1.11.0