016‐ scVDJ‐seq data - rezakj/iCellR GitHub Wiki
How to analyze scVDJ-seq data using iCellR
Here is an example of how to add VDJ data.
###### an example file
my.vdj <- read.csv(file = system.file('extdata', 'all_contig_annotations.csv',
package = 'iCellR'),
as.is = TRUE)
###
head(my.vdj)
# barcode is_cell contig_id high_confidence length
#1 AAACCTGTCCGAACGC-1 True AAACCTGTCCGAACGC-1_contig_1 True 654
#2 AAACCTGTCCGAACGC-1 True AAACCTGTCCGAACGC-1_contig_2 True 697
#3 AAACCTGTCCGAACGC-1 True AAACCTGTCCGAACGC-1_contig_3 False 496
#4 AAACCTGTCCGAACGC-1 True AAACCTGTCCGAACGC-1_contig_4 True 539
#5 AAACCTGTCGATGAGG-1 True AAACCTGTCGATGAGG-1_contig_1 True 705
#6 AAACCTGTCGATGAGG-1 True AAACCTGTCGATGAGG-1_contig_2 True 491
# chain v_gene d_gene j_gene c_gene full_length productive cdr3
#1 TRB TRBV4-1 None TRBJ2-7 TRBC2 True True CASSQGVEQYF
#2 TRA TRAV8-1 None TRAJ42 TRAC True True CAVKGGSQGNLIF
#3 TRB None None TRBJ1-4 TRBC1 False None None
#4 Multi None None TRAJ10 TRBC1 False None None
#5 TRB TRBV5-5 TRBD1 TRBJ2-7 TRBC1 True True CASSLVSGGNEQYF
#6 TRB None None TRBJ1-2 TRBC1 False None None
# cdr3_nt reads umis raw_clonotype_id
#1 TGCGCCAGCAGCCAAGGGGTCGAGCAGTACTTC 42610 19 clonotype150
#2 TGTGCCGTGAAGGGAGGAAGCCAAGGAAATCTCATCTTT 12297 4 clonotype150
#3 None 4314 1 clonotype150
#4 None 2212 1 clonotype150
#5 TGTGCCAGCAGCTTGGTCTCAGGGGGAAACGAGCAGTACTTC 21148 8 clonotype2
#6 None 17717 16 clonotype2
# raw_consensus_id
#1 clonotype150_consensus_1
#2 clonotype150_consensus_2
#3 None
#4 None
#5 clonotype2_consensus_1
#6 None
#### Prepare the vdj file
My.VDJ <- prep.vdj(vdj.data = my.vdj, cond.name = "NULL")
###
head(My.VDJ)
# raw_clonotype_id barcode is_cell contig_id
#1 clonotype1 ACGCCAGCAAGCGCTC.1 True ACGCCAGCAAGCGCTC-1_contig_2
#2 clonotype1 AACGTTGAGTACGATA.1 True AACGTTGAGTACGATA-1_contig_2
#3 clonotype1 AACTCTTGTCAAAGCG.1 True AACTCTTGTCAAAGCG-1_contig_1
#4 clonotype1 AACGTTGAGTACGATA.1 True AACGTTGAGTACGATA-1_contig_1
#5 clonotype1 ACGCCAGCAAGCGCTC.1 True ACGCCAGCAAGCGCTC-1_contig_1
#6 clonotype1 ACGATGTTCTGGTATG.1 True ACGATGTTCTGGTATG-1_contig_2
# high_confidence length chain v_gene d_gene j_gene c_gene full_length
#1 True 571 TRA TRAV27 None TRAJ37 TRAC True
#2 True 730 TRA TRAV27 None TRAJ37 TRAC True
#3 True 722 TRB TRBV6-3 TRBD2 TRBJ1-1 TRBC1 True
#4 True 723 TRB TRBV6-3 TRBD2 TRBJ1-1 TRBC1 True
#5 True 722 TRB TRBV6-3 TRBD2 TRBJ1-1 TRBC1 True
#6 True 726 TRA TRAV27 None TRAJ37 TRAC True
# productive cdr3 cdr3_nt reads
#1 True CAGGRSSNTGKLIF TGTGCAGGAGGACGCTCTAGCAACACAGGCAAACTAATCTTT 14241
#2 True CAGGRSSNTGKLIF TGTGCAGGAGGACGCTCTAGCAACACAGGCAAACTAATCTTT 27679
#3 True CASRTGAGATEAFF TGTGCCAGCAGGACCGGGGCGGGAGCCACTGAAGCTTTCTTT 51844
#4 True CASRTGAGATEAFF TGTGCCAGCAGGACCGGGGCGGGAGCCACTGAAGCTTTCTTT 38120
#5 True CASRTGAGATEAFF TGTGCCAGCAGGACCGGGGCGGGAGCCACTGAAGCTTTCTTT 24635
#6 True CAGGRSSNTGKLIF TGTGCAGGAGGACGCTCTAGCAACACAGGCAAACTAATCTTT 13720
# umis raw_consensus_id my.raw_clonotype_id clonotype.Freq proportion
#1 8 clonotype1_consensus_2 clonotype1 43 0.1572212
#2 10 clonotype1_consensus_2 clonotype1 43 0.1572212
#3 24 clonotype1_consensus_1 clonotype1 43 0.1572212
#4 23 clonotype1_consensus_1 clonotype1 43 0.1572212
#5 11 clonotype1_consensus_1 clonotype1 43 0.1572212
#6 7 clonotype1_consensus_2 clonotype1 43 0.1572212
# total.colonotype
#1 109
#2 109
#3 109
#4 109
#5 109
#6 109
####
png('vdj.stats.png',width = 16, height = 8, units = 'in', res = 300)
vdj.stats(My.VDJ)
dev.off()
### add vdj data to you object
my.obj <- add.vdj(demo.obj, vdj.data = My.VDJ)
Another example with multiple files
# First read the vdj data
File="all_contig_annotations.csv"
my.vdj.data <- read.csv(File)
# then see the conditions
my.obj
# For each condition (WT,KO, ...) subset from the VDJ data
Get="WT"
#######
dat <- colnames([email protected])
name <- paste(Get,".tsv",sep="")
do <- grep(Get,dat, value=T)
do <- as.character(as.matrix(data.frame(do.call('rbind', strsplit(as.character(do),'_',fixed=TRUE)))[2]))
do <- gsub("\\.","-",do)
do <- subset(my.vdj.data, my.vdj.data$barcode %in% do)
write.table((do),file=name,sep="\t", row.names =F)
#######
Get="KO"
#######
dat <- colnames([email protected])
name <- paste(Get,".tsv",sep="")
do <- grep(Get,dat, value=T)
do <- as.character(as.matrix(data.frame(do.call('rbind', strsplit(as.character(do),'_',fixed=TRUE)))[2]))
do <- gsub("\\.","-",do)
do <- subset(my.vdj.data, my.vdj.data$barcode %in% do)
write.table((do),file=name,sep="\t", row.names =F)
#######
#### read and prep all conditions
Get="WT"
name <- paste(Get,".tsv",sep="")
do <- read.table(name, header=T)
WT <- prep.vdj(vdj.data = do, cond.name = Get)
Get="KO"
name <- paste(Get,".tsv",sep="")
do <- read.table(name, header=T)
KO <- prep.vdj(vdj.data = do, cond.name = Get)
# concatenate all the conditions
my.vdj.data <- rbind(WT, KO)
# see head of the file
head(my.vdj.data)
# raw_clonotype_id barcode is_cell contig_id
#1 clonotype1 WT_AAACCTGAGCTAACTC-1 True AAACCTGAGCTAACTC-1_contig_1
#2 clonotype1 WT_AAACCTGAGCTAACTC-1 True AAACCTGAGCTAACTC-1_contig_2
#3 clonotype1 WT_AGTTGGTTCTCGCATC-1 True AGTTGGTTCTCGCATC-1_contig_3
#4 clonotype1 WT_TGACAACCAACTGCTA-1 True TGACAACCAACTGCTA-1_contig_1
#5 clonotype1 WT_TGTCCCAGTCAAACTC-1 True TGTCCCAGTCAAACTC-1_contig_1
#6 clonotype1 WT_TGTCCCAGTCAAACTC-1 True TGTCCCAGTCAAACTC-1_contig_2
# high_confidence length chain v_gene d_gene j_gene c_gene full_length
#1 True 693 TRA TRAV8-1 None TRAJ21 TRAC True
#2 True 744 TRB TRBV28 TRBD1 TRBJ2-1 TRBC2 True
#3 True 647 TRA TRAV8-1 None TRAJ21 TRAC True
#4 True 508 TRB TRBV28 TRBD1 TRBJ2-1 TRBC2 True
#5 True 660 TRA TRAV8-1 None TRAJ21 TRAC True
#6 True 770 TRB TRBV28 TRBD1 TRBJ2-1 TRBC2 True
# productive cdr3 cdr3_nt
#1 True CAVKDFNKFYF TGTGCCGTGAAAGACTTCAACAAATTTTACTTT
#2 True CASSLFSGTGTNEQFF TGTGCCAGCAGTTTATTTTCCGGGACAGGGACGAATGAGCAGTTCTTC
#3 True CAVKDFNKFYF TGTGCCGTGAAAGACTTCAACAAATTTTACTTT
#4 True CASSLFSGTGTNEQFF TGTGCCAGCAGTTTATTTTCCGGGACAGGGACGAATGAGCAGTTCTTC
#5 True CAVKDFNKFYF TGTGCCGTGAAAGACTTCAACAAATTTTACTTT
#6 True CASSLFSGTGTNEQFF TGTGCCAGCAGTTTATTTTCCGGGACAGGGACGAATGAGCAGTTCTTC
# reads umis raw_consensus_id my.raw_clonotype_id clonotype.Freq
#1 1241 2 clonotype1_consensus_1 clonotype1 120
#2 2400 4 clonotype1_consensus_2 clonotype1 120
#3 1090 2 clonotype1_consensus_1 clonotype1 120
#4 2455 4 clonotype1_consensus_2 clonotype1 120
#5 1346 2 clonotype1_consensus_1 clonotype1 120
#6 3073 8 clonotype1_consensus_2 clonotype1 120
# proportion total.colonotype
#1 0.04098361 1292
#2 0.04098361 1292
#3 0.04098361 1292
#4 0.04098361 1292
#5 0.04098361 1292
#6 0.04098361 1292
# add it to iCellR object
my.obj <- add.vdj(my.obj, vdj.data = my.vdj.data)
How to plot clonotypes
# once you have imported your clonotype data to your iCellR object, in order to plot them you need to have the following parapmeters:
# -1 clonotype name (e.g. clono = "clonotype1")
# -2 which column number has the clonotype names (e.g. clonotype.column = 2)
# -3 which column number has the cell barcode names (e.g. barcode.column = 1)
# In order to plot you need 2 things a- cell barcodes that match the barcodes in UMAP,PCA,tSNE or KNetL data and b- clonotype names.
# to check your clonotype data do this (example):
head([email protected])
# raw_clonotype_id_SampleID MyBarcodes V1
#1 S5_clonotype98 Nor2.A_AAACCTGAGACAGACC.1 AAACCTGAGACAGACC.1
#2 S5_clonotype98 Nor2.A_AAACCTGAGACAGACC.1 AAACCTGAGACAGACC.1
#3 S4_clonotype100 Nor2.B_AAACCTGAGAGACTAT.1 AAACCTGAGAGACTAT.1
#4 S4_clonotype100 Nor2.B_AAACCTGAGAGACTAT.1 AAACCTGAGAGACTAT.1
#5 S3_clonotype3 Nor1.B_AAACCTGAGAGTCGGT.1 AAACCTGAGAGTCGGT.1
#6 S5_clonotype99 Nor2.A_AAACCTGAGATATGGT.1 AAACCTGAGATATGGT.1
# barcode SampleID raw_clonotype_id is_cell
#1 S5_AAACCTGAGACAGACC.1 5 clonotype98 True
#2 S5_AAACCTGAGACAGACC.1 5 clonotype98 True
#3 S4_AAACCTGAGAGACTAT.1 4 clonotype100 True
#4 S4_AAACCTGAGAGACTAT.1 4 clonotype100 True
#5 S3_AAACCTGAGAGTCGGT.1 3 clonotype3 True
#6 S5_AAACCTGAGATATGGT.1 5 clonotype99 True
# contig_id high_confidence length chain v_gene d_gene
#1 AAACCTGAGACAGACC-1_contig_2 True 514 TRB TRBV14 None
#2 AAACCTGAGACAGACC-1_contig_1 True 495 TRB TRBV20-1 None
#3 AAACCTGAGAGACTAT-1_contig_2 True 496 TRB TRBV9 None
#4 AAACCTGAGAGACTAT-1_contig_1 True 529 TRA TRAV26-1 None
#5 AAACCTGAGAGTCGGT-1_contig_1 True 512 TRB TRBV6-5 None
#6 AAACCTGAGATATGGT-1_contig_2 True 544 TRA TRAV12-2 None
# j_gene c_gene full_length productive cdr3
#1 TRBJ1-5 TRBC1 True True CASSFEGGSTQPQHF
#2 TRBJ2-7 TRBC2 True True CSARVRGRSSYEQYF
#3 TRBJ2-2 TRBC2 True True CASSVGVNTGELFF
#4 TRAJ52 TRAC True True CIVRGAGGTSYGKLTF
#5 TRBJ1-1 TRBC1 True True CASSYRPNTEAFF
#6 TRAJ33 TRAC True True CAVKRDSNYQLIW
# cdr3_nt reads umis
#1 TGTGCCAGCAGTTTTGAGGGGGGATCGACTCAGCCCCAGCATTTT 886 1
#2 TGCAGTGCTAGAGTAAGGGGACGGAGCTCCTACGAGCAGTACTTC 1912 3
#3 TGTGCCAGCAGCGTGGGCGTAAACACCGGGGAGCTGTTTTTT 10804 12
#4 TGCATCGTCAGGGGGGCTGGTGGTACTAGCTATGGAAAGCTGACATTT 960 4
#5 TGTGCCAGCAGTTACCGCCCGAACACTGAAGCTTTCTTT 4286 6
#6 TGTGCCGTGAAAAGGGATAGCAACTATCAGTTAATCTGG 1244 2
# raw_consensus_id my.raw_clonotype_id clonotype.Freq proportion
#1 clonotype98_consensus_1 S5_clonotype98 1 0.0001983930
#2 clonotype98_consensus_2 S5_clonotype98 1 0.0001983930
#3 clonotype100_consensus_2 S4_clonotype100 1 0.0001923817
#4 clonotype100_consensus_1 S4_clonotype100 1 0.0001923817
#5 clonotype3_consensus_1 S3_clonotype3 49 0.0070635721
#6 clonotype99_consensus_1 S5_clonotype99 1 0.0001983930
# total.colonotype
#1 5096
#2 5096
#3 5280
#4 5280
#5 5943
#6 5096
# In this example column number 1 and 2 have the clonotype and barcode info needed to plot.
# Sort clonotype names with highset frequency:
clonotype.frequency <- as.data.frame(sort(table(as.character(as.matrix(([email protected])[1]))),decreasing = TRUE))
head(clonotype.frequency)
# Var1 Freq
#1 S2_clonotype1 306
#2 S1_clonotype1 242
#3 S3_clonotype1 232
#4 S4_clonotype1 216
#5 S5_clonotype1 210
#6 S2_clonotype2 113
# let's plot S1_clonotype1 which is seen in 242 cells in all the conditions.
# if you want to plot only in one condtion or few conditions use this option "conds.to.plot" (e.g. conds.to.plot = c("WT","KO"))
# If conds.to.plot = NULL it would plot all of them (all 242 cells).
# Plot colonotype 1
clono.plot(my.obj, plot.data.type = "knetl",
clonotype.column = 1,
barcode.column = 2,
clono = "S1_clonotype1",
conds.to.plot = NULL,
cell.transparency = 1,
clust.dim = 2,
interactive = F)
# plot multiple clonotypes
ordered.clonotypes <- as.character(as.matrix((clonotype.frequency)[1]))
# let's plot top 19 clonotypes with highest frequency:
clonolist <- (ordered.clonotypes)[1:19]
clonolist
# [1] "S2_clonotype1" "S1_clonotype1" "S3_clonotype1" "S4_clonotype1"
# [5] "S5_clonotype1" "S2_clonotype2" "S3_clonotype2" "S1_clonotype2"
# [9] "S2_clonotype4" "S1_clonotype4" "S3_clonotype4" "S2_clonotype3"
#[13] "S4_clonotype2" "S1_clonotype3" "S4_clonotype3" "S5_clonotype2"
#[17] "S3_clonotype3" "S2_clonotype9" "S3_clonotype6"
rm(list = ls(pattern="PL_"))
for(i in clonolist){
MyPlot <- clono.plot(my.obj, plot.data.type = "knetl",
clonotype.column = 1,
barcode.column = 2,
clono = i,
conds.to.plot = NULL,
cell.transparency = 1,
clust.dim = 2,
interactive = F)
NameCol=paste("PL",i,sep="_")
eval(call("<-", as.name(NameCol), MyPlot))
}
library(cowplot)
filenames <- ls(pattern="PL_")
B= cluster.plot(my.obj,plot.type = "knetl",interactive = F,cell.size = 0.5,cell.transparency = 1,anno.clust=TRUE)
filenames <- c("B",filenames)
png("19_clonotypes.png",width = 20, height = 20, units = 'in', res = 300)
plot_grid(plotlist=mget(filenames))
dev.off()