012‐ Batch Sample alignment - rezakj/iCellR GitHub Wiki

Batch correction (sample alignment) methods: 1- CPCA (iCellR)** recommended (faster than CCCA)

2- CCCA (iCellR)* recommended

3- MNN (scran wraper) optional

4- MultiCCA (Seurat wraper) optional

5- CPCA + KNetL based clustering (iCellR)*** recommended for best results!

Run CPCA

For batch alignment, simply run the command below. This will replace your original PCA data with batch-aligned data. However, you will need to re-run processes such as UMAP, KNetL, t-SNE, clustering, and marker detection afterward.

my.obj <- iba(my.obj,dims = 1:30, k = 10,ba.method = "CPCA", method = "gene.model", gene.list = [email protected])

To explore further, here’s an example run using 9 sample datasets:

1- How to perform Combined Principal Component Alignment (CPCA)

Here we analyzed nine PBMC sample datasets provided by the Broad Institute to detect batch differences. These datasets were generated using varying technologies, including 10x Chromium v2 (3 samples), 10x Chromium v3, CEL-Seq2, Drop-seq, inDrop, Seq-Well and SMART-Seq. For more info read: https://www.biorxiv.org/content/10.1101/2020.03.31.019109v1.full

## download an object of 9 PBMC samples 
sample.file.url = "https://genome.med.nyu.edu/results/external/iCellR/data/pbmc_data/my.obj.Robj"

# download the file
download.file(url = sample.file.url,
     destfile = "my.obj.Robj",
     method = "auto")
     
### load iCellR and the object 
library(iCellR)
load("my.obj.Robj")

### run PCA on top 2000 genes 

my.obj <- run.pca(my.obj, top.rank = 2000)

### find best genes for second round PCA or batch alignment

my.obj <- find.dim.genes(my.obj, dims = 1:30,top.pos = 20, top.neg = 20)
length([email protected])

########### Batch alignment (CPCA method)

my.obj <- iba(my.obj,dims = 1:30, k = 10,ba.method = "CPCA", method = "gene.model", gene.list = [email protected])

### impute data 

my.obj <- run.impute(my.obj,dims = 1:10,data.type = "pca", nn = 10)

### tSNE and UMAP
my.obj <- run.pc.tsne(my.obj, dims = 1:10)
my.obj <- run.umap(my.obj, dims = 1:10)

### save object 
save(my.obj, file = "my.obj.Robj")

### plot

 library(gridExtra)
A= cluster.plot(my.obj,plot.type = "umap",interactive = F,cell.size = 0.1)
B= cluster.plot(my.obj,plot.type = "tsne",interactive = F,cell.size = 0.1) 
C= cluster.plot(my.obj,plot.type = "umap",col.by = "conditions",interactive = F,cell.size = 0.1)
D=cluster.plot(my.obj,plot.type = "tsne",col.by = "conditions",interactive = F,cell.size = 0.1)

png('AllClusts.png', width = 12, height = 12, units = 'in', res = 300)
grid.arrange(A,B,C,D)
dev.off()

png('AllConds_clusts.png', width = 15, height = 15, units = 'in', res = 300)
cluster.plot(my.obj,
              cell.size = 0.5,
              plot.type = "umap",
              cell.color = "black",
              back.col = "white",
              cell.transparency = 1,
              clust.dim = 2,
              interactive = F,cond.facet = T)
dev.off()


genelist = c("PPBP","LYZ","MS4A1","GNLY","FCGR3A","NKG7","CD14","S100A9","CD3E","CD8A","CD4","CD19","IL7R","FOXP3","EPCAM")

for(i in genelist){
	MyPlot <- gene.plot(my.obj, gene = i, 
		interactive = F,
		conds.to.plot = NULL,
		cell.size = 0.1,
		data.type = "main",
		plot.data.type = "umap",
		scaleValue = T,
		min.scale = -2.5,max.scale = 2.0,
		cell.transparency = 1)
	NameCol=paste("PL",i,sep="_")
	eval(call("<-", as.name(NameCol), MyPlot))
}

UMAP = cluster.plot(my.obj,plot.type = "umap",interactive = F,cell.size = 0.1, anno.size=5)
library(cowplot)
filenames <- ls(pattern="PL_")
filenames <- c("UMAP", filenames)

png('genes.png',width = 18, height = 15, units = 'in', res = 300)
plot_grid(plotlist=mget(filenames))
dev.off()

2- How to perform Combined Coverage Correction Alignment (CCCA)

# same as above only change the option to CCCA

my.obj <- iba(my.obj,dims = 1:30, k = 10,ba.method = "CCCA", method = "gene.model", gene.list = [email protected])

3- How to perform mutual nearest neighbor (MNN) sample alignment

# same as above only use run.mnn function instead of iba.
###### Run MNN 
# This would automatically run all the samples in your experiment 

library(scran)
my.obj <- run.mnn(my.obj, k=20, d=50, method = "gene.model", gene.list = [email protected])

# detach the scran pacakge after MNN as it masks some of the functions 
detach("package:scran", unload=TRUE)

4- How to perform Seurat's MultiCCA sample alignment

# same as above only use run.anchor function instead of iba.
###### Run Anchor 
# This would automatically run all the samples in your experiment 

library(Seurat)
my.obj <- run.anchor(my.obj,
    normalization.method = "SCT",
    scale.factor = 10000,
    selection.method = "vst",
    nfeatures = 2000,
    dims = 1:20)

5- How to perform CPCA + KNetL based clustering for sample alignment/integration

## download an object of 9 PBMC samples 
sample.file.url = "https://genome.med.nyu.edu/results/external/iCellR/example2/my.obj.Robj"

# download the file
download.file(url = sample.file.url,
     destfile = "my.obj.Robj",
     method = "auto")
     
### load iCellR and the object 
library(iCellR)
load("my.obj.Robj")

### run PCA on top 2000 genes 

my.obj <- run.pca(my.obj, top.rank = 2000)

### find best genes for second round PCA or batch alignment

my.obj <- find.dim.genes(my.obj, dims = 1:30,top.pos = 20, top.neg = 20)
length([email protected])

########### Batch alignment (CPCA method)

my.obj <- iba(my.obj,dims = 1:30, k = 10,ba.method = "CPCA", method = "gene.model", gene.list = [email protected])

### impute data 

my.obj <- run.impute(my.obj,dims = 1:10,data.type = "pca", nn = 10)

### tSNE and UMAP
my.obj <- run.pc.tsne(my.obj, dims = 1:10)
my.obj <- run.umap(my.obj, dims = 1:10)
### run KNetL 
my.obj <- run.knetl(my.obj, dims = 1:20, k = 400)

### cluster based on KNetL coordinates 
# The object is already clustered but here is an example: 
# my.obj <- iclust(my.obj, k = 300, data.type = "knetl")

### save object 
save(my.obj, file = "my.obj.Robj")

### plot 1 
A= cluster.plot(my.obj,plot.type = "pca",interactive = F,cell.size = 0.5,cell.transparency = 1, anno.clust=F)
B= cluster.plot(my.obj,plot.type = "umap",interactive = F,cell.size = 0.5,cell.transparency = 1,anno.clust=F)
C= cluster.plot(my.obj,plot.type = "tsne",interactive = F,cell.size = 0.5,cell.transparency = 1,anno.clust=F)
D= cluster.plot(my.obj,plot.type = "knetl",interactive = F,cell.size = 0.5,cell.transparency = 1,anno.clust=F)

library(gridExtra)
grid.arrange(A,B,C,D)

### plot 2
cluster.plot(my.obj,
              cell.size = 0.5,
              plot.type = "knetl",
              cell.color = "black",
              back.col = "white",
              cell.transparency = 1,
              clust.dim = 2,
              interactive = F,cond.facet = T)
	      
### plot 3	      	      
genelist = c("LYZ","MS4A1","GNLY","FCGR3A","NKG7","CD14","S100A9","CD3E","CD8A","CD4","CD19","KLRB1","LTB","IL7R","GZMH","CD68","CCR7","CD68","CD69","CXCR4","IFITM3","IL32","JCHAIN","VCAN","PPBP")	      


rm(list = ls(pattern="PL_"))
for(i in genelist){
    MyPlot <- gene.plot(my.obj, gene = i,
        interactive = F,
        cell.size = 0.1,
        plot.data.type = "knetl",
        data.type = "main",
        scaleValue = T,
        min.scale = -2.5,max.scale = 2.0,
        cell.transparency = 1)
    NameCol=paste("PL",i,sep="_")
    eval(call("<-", as.name(NameCol), MyPlot))
}

library(cowplot)
filenames <- ls(pattern="PL_")

B <- cluster.plot(my.obj,plot.type = "knetl",interactive = F,cell.size = 0.1,cell.transparency = 1,anno.clust=T)
filenames <- c("B",filenames)

plot_grid(plotlist=mget(filenames))