016‐ scVDJ‐seq data - rezakj/iCellR GitHub Wiki

How to analyze scVDJ-seq data using iCellR

Here is an example of how to add VDJ data.

###### an example file 
my.vdj <- read.csv(file = system.file('extdata', 'all_contig_annotations.csv',
              package = 'iCellR'),
              as.is = TRUE)
          
###
head(my.vdj)
#             barcode is_cell                   contig_id high_confidence length
#1 AAACCTGTCCGAACGC-1    True AAACCTGTCCGAACGC-1_contig_1            True    654
#2 AAACCTGTCCGAACGC-1    True AAACCTGTCCGAACGC-1_contig_2            True    697
#3 AAACCTGTCCGAACGC-1    True AAACCTGTCCGAACGC-1_contig_3           False    496
#4 AAACCTGTCCGAACGC-1    True AAACCTGTCCGAACGC-1_contig_4            True    539
#5 AAACCTGTCGATGAGG-1    True AAACCTGTCGATGAGG-1_contig_1            True    705
#6 AAACCTGTCGATGAGG-1    True AAACCTGTCGATGAGG-1_contig_2            True    491
#  chain  v_gene d_gene  j_gene c_gene full_length productive           cdr3
#1   TRB TRBV4-1   None TRBJ2-7  TRBC2        True       True    CASSQGVEQYF
#2   TRA TRAV8-1   None  TRAJ42   TRAC        True       True  CAVKGGSQGNLIF
#3   TRB    None   None TRBJ1-4  TRBC1       False       None           None
#4 Multi    None   None  TRAJ10  TRBC1       False       None           None
#5   TRB TRBV5-5  TRBD1 TRBJ2-7  TRBC1        True       True CASSLVSGGNEQYF
#6   TRB    None   None TRBJ1-2  TRBC1       False       None           None
#                                     cdr3_nt reads umis raw_clonotype_id
#1          TGCGCCAGCAGCCAAGGGGTCGAGCAGTACTTC 42610   19     clonotype150
#2    TGTGCCGTGAAGGGAGGAAGCCAAGGAAATCTCATCTTT 12297    4     clonotype150
#3                                       None  4314    1     clonotype150
#4                                       None  2212    1     clonotype150
#5 TGTGCCAGCAGCTTGGTCTCAGGGGGAAACGAGCAGTACTTC 21148    8       clonotype2
#6                                       None 17717   16       clonotype2
#          raw_consensus_id
#1 clonotype150_consensus_1
#2 clonotype150_consensus_2
#3                     None
#4                     None
#5   clonotype2_consensus_1
#6                     None

#### Prepare the vdj file
    My.VDJ <- prep.vdj(vdj.data = my.vdj, cond.name = "NULL")
###
head(My.VDJ)
#  raw_clonotype_id            barcode is_cell                   contig_id
#1       clonotype1 ACGCCAGCAAGCGCTC.1    True ACGCCAGCAAGCGCTC-1_contig_2
#2       clonotype1 AACGTTGAGTACGATA.1    True AACGTTGAGTACGATA-1_contig_2
#3       clonotype1 AACTCTTGTCAAAGCG.1    True AACTCTTGTCAAAGCG-1_contig_1
#4       clonotype1 AACGTTGAGTACGATA.1    True AACGTTGAGTACGATA-1_contig_1
#5       clonotype1 ACGCCAGCAAGCGCTC.1    True ACGCCAGCAAGCGCTC-1_contig_1
#6       clonotype1 ACGATGTTCTGGTATG.1    True ACGATGTTCTGGTATG-1_contig_2
#  high_confidence length chain  v_gene d_gene  j_gene c_gene full_length
#1            True    571   TRA  TRAV27   None  TRAJ37   TRAC        True
#2            True    730   TRA  TRAV27   None  TRAJ37   TRAC        True
#3            True    722   TRB TRBV6-3  TRBD2 TRBJ1-1  TRBC1        True
#4            True    723   TRB TRBV6-3  TRBD2 TRBJ1-1  TRBC1        True
#5            True    722   TRB TRBV6-3  TRBD2 TRBJ1-1  TRBC1        True
#6            True    726   TRA  TRAV27   None  TRAJ37   TRAC        True
#  productive           cdr3                                    cdr3_nt reads
#1       True CAGGRSSNTGKLIF TGTGCAGGAGGACGCTCTAGCAACACAGGCAAACTAATCTTT 14241
#2       True CAGGRSSNTGKLIF TGTGCAGGAGGACGCTCTAGCAACACAGGCAAACTAATCTTT 27679
#3       True CASRTGAGATEAFF TGTGCCAGCAGGACCGGGGCGGGAGCCACTGAAGCTTTCTTT 51844
#4       True CASRTGAGATEAFF TGTGCCAGCAGGACCGGGGCGGGAGCCACTGAAGCTTTCTTT 38120
#5       True CASRTGAGATEAFF TGTGCCAGCAGGACCGGGGCGGGAGCCACTGAAGCTTTCTTT 24635
#6       True CAGGRSSNTGKLIF TGTGCAGGAGGACGCTCTAGCAACACAGGCAAACTAATCTTT 13720
#  umis       raw_consensus_id my.raw_clonotype_id clonotype.Freq proportion
#1    8 clonotype1_consensus_2          clonotype1             43  0.1572212
#2   10 clonotype1_consensus_2          clonotype1             43  0.1572212
#3   24 clonotype1_consensus_1          clonotype1             43  0.1572212
#4   23 clonotype1_consensus_1          clonotype1             43  0.1572212
#5   11 clonotype1_consensus_1          clonotype1             43  0.1572212
#6    7 clonotype1_consensus_2          clonotype1             43  0.1572212
#  total.colonotype
#1              109
#2              109
#3              109
#4              109
#5              109
#6              109

####
png('vdj.stats.png',width = 16, height = 8, units = 'in', res = 300)
vdj.stats(My.VDJ)
dev.off()

### add vdj data to you object 
my.obj <- add.vdj(demo.obj, vdj.data = My.VDJ)

Another example with multiple files

# First read the vdj data

File="all_contig_annotations.csv"
my.vdj.data <- read.csv(File)

# then see the conditions
my.obj

# For each condition (WT,KO, ...) subset from the VDJ data

Get="WT"
#######
dat <- colnames([email protected])
name <- paste(Get,".tsv",sep="")
do <- grep(Get,dat, value=T)
do <- as.character(as.matrix(data.frame(do.call('rbind', strsplit(as.character(do),'_',fixed=TRUE)))[2]))
do <- gsub("\\.","-",do)
do <- subset(my.vdj.data, my.vdj.data$barcode %in% do)
write.table((do),file=name,sep="\t", row.names =F)
#######

Get="KO"
#######
dat <- colnames([email protected])
name <- paste(Get,".tsv",sep="")
do <- grep(Get,dat, value=T)
do <- as.character(as.matrix(data.frame(do.call('rbind', strsplit(as.character(do),'_',fixed=TRUE)))[2]))
do <- gsub("\\.","-",do)
do <- subset(my.vdj.data, my.vdj.data$barcode %in% do)
write.table((do),file=name,sep="\t", row.names =F)
#######

#### read and prep all conditions
Get="WT"
name <- paste(Get,".tsv",sep="")
do <- read.table(name, header=T)
WT <- prep.vdj(vdj.data = do, cond.name = Get)

Get="KO"
name <- paste(Get,".tsv",sep="")
do <- read.table(name, header=T)
KO <- prep.vdj(vdj.data = do, cond.name = Get)

# concatenate all the conditions
my.vdj.data <- rbind(WT, KO)

# see head of the file
head(my.vdj.data)
#  raw_clonotype_id               barcode is_cell                   contig_id
#1       clonotype1 WT_AAACCTGAGCTAACTC-1    True AAACCTGAGCTAACTC-1_contig_1
#2       clonotype1 WT_AAACCTGAGCTAACTC-1    True AAACCTGAGCTAACTC-1_contig_2
#3       clonotype1 WT_AGTTGGTTCTCGCATC-1    True AGTTGGTTCTCGCATC-1_contig_3
#4       clonotype1 WT_TGACAACCAACTGCTA-1    True TGACAACCAACTGCTA-1_contig_1
#5       clonotype1 WT_TGTCCCAGTCAAACTC-1    True TGTCCCAGTCAAACTC-1_contig_1
#6       clonotype1 WT_TGTCCCAGTCAAACTC-1    True TGTCCCAGTCAAACTC-1_contig_2
#  high_confidence length chain  v_gene d_gene  j_gene c_gene full_length
#1            True    693   TRA TRAV8-1   None  TRAJ21   TRAC        True
#2            True    744   TRB  TRBV28  TRBD1 TRBJ2-1  TRBC2        True
#3            True    647   TRA TRAV8-1   None  TRAJ21   TRAC        True
#4            True    508   TRB  TRBV28  TRBD1 TRBJ2-1  TRBC2        True
#5            True    660   TRA TRAV8-1   None  TRAJ21   TRAC        True
#6            True    770   TRB  TRBV28  TRBD1 TRBJ2-1  TRBC2        True
#  productive             cdr3                                          cdr3_nt
#1       True      CAVKDFNKFYF                TGTGCCGTGAAAGACTTCAACAAATTTTACTTT
#2       True CASSLFSGTGTNEQFF TGTGCCAGCAGTTTATTTTCCGGGACAGGGACGAATGAGCAGTTCTTC
#3       True      CAVKDFNKFYF                TGTGCCGTGAAAGACTTCAACAAATTTTACTTT
#4       True CASSLFSGTGTNEQFF TGTGCCAGCAGTTTATTTTCCGGGACAGGGACGAATGAGCAGTTCTTC
#5       True      CAVKDFNKFYF                TGTGCCGTGAAAGACTTCAACAAATTTTACTTT
#6       True CASSLFSGTGTNEQFF TGTGCCAGCAGTTTATTTTCCGGGACAGGGACGAATGAGCAGTTCTTC
#  reads umis       raw_consensus_id my.raw_clonotype_id clonotype.Freq
#1  1241    2 clonotype1_consensus_1          clonotype1            120
#2  2400    4 clonotype1_consensus_2          clonotype1            120
#3  1090    2 clonotype1_consensus_1          clonotype1            120
#4  2455    4 clonotype1_consensus_2          clonotype1            120
#5  1346    2 clonotype1_consensus_1          clonotype1            120
#6  3073    8 clonotype1_consensus_2          clonotype1            120
#  proportion total.colonotype
#1 0.04098361             1292
#2 0.04098361             1292
#3 0.04098361             1292
#4 0.04098361             1292
#5 0.04098361             1292
#6 0.04098361             1292

# add it to iCellR object
my.obj <- add.vdj(my.obj, vdj.data = my.vdj.data)

How to plot clonotypes

# once you have imported your clonotype data to your iCellR object, in order to plot them you need to have the following parapmeters:
# -1 clonotype name (e.g. clono = "clonotype1")
# -2 which column number has the clonotype names (e.g. clonotype.column = 2)
# -3 which column number has the cell barcode names (e.g. barcode.column = 1)

# In order to plot you need 2 things a- cell barcodes that match the barcodes in UMAP,PCA,tSNE or KNetL data and b- clonotype names.

# to check your clonotype data do this (example):

head([email protected])

#  raw_clonotype_id_SampleID                MyBarcodes                 V1
#1            S5_clonotype98 Nor2.A_AAACCTGAGACAGACC.1 AAACCTGAGACAGACC.1
#2            S5_clonotype98 Nor2.A_AAACCTGAGACAGACC.1 AAACCTGAGACAGACC.1
#3           S4_clonotype100 Nor2.B_AAACCTGAGAGACTAT.1 AAACCTGAGAGACTAT.1
#4           S4_clonotype100 Nor2.B_AAACCTGAGAGACTAT.1 AAACCTGAGAGACTAT.1
#5             S3_clonotype3 Nor1.B_AAACCTGAGAGTCGGT.1 AAACCTGAGAGTCGGT.1
#6            S5_clonotype99 Nor2.A_AAACCTGAGATATGGT.1 AAACCTGAGATATGGT.1
#                barcode SampleID raw_clonotype_id is_cell
#1 S5_AAACCTGAGACAGACC.1        5      clonotype98    True
#2 S5_AAACCTGAGACAGACC.1        5      clonotype98    True
#3 S4_AAACCTGAGAGACTAT.1        4     clonotype100    True
#4 S4_AAACCTGAGAGACTAT.1        4     clonotype100    True
#5 S3_AAACCTGAGAGTCGGT.1        3       clonotype3    True
#6 S5_AAACCTGAGATATGGT.1        5      clonotype99    True
#                    contig_id high_confidence length chain   v_gene d_gene
#1 AAACCTGAGACAGACC-1_contig_2            True    514   TRB   TRBV14   None
#2 AAACCTGAGACAGACC-1_contig_1            True    495   TRB TRBV20-1   None
#3 AAACCTGAGAGACTAT-1_contig_2            True    496   TRB    TRBV9   None
#4 AAACCTGAGAGACTAT-1_contig_1            True    529   TRA TRAV26-1   None
#5 AAACCTGAGAGTCGGT-1_contig_1            True    512   TRB  TRBV6-5   None
#6 AAACCTGAGATATGGT-1_contig_2            True    544   TRA TRAV12-2   None
#   j_gene c_gene full_length productive             cdr3
#1 TRBJ1-5  TRBC1        True       True  CASSFEGGSTQPQHF
#2 TRBJ2-7  TRBC2        True       True  CSARVRGRSSYEQYF
#3 TRBJ2-2  TRBC2        True       True   CASSVGVNTGELFF
#4  TRAJ52   TRAC        True       True CIVRGAGGTSYGKLTF
#5 TRBJ1-1  TRBC1        True       True    CASSYRPNTEAFF
#6  TRAJ33   TRAC        True       True    CAVKRDSNYQLIW
#                                           cdr3_nt reads umis
#1    TGTGCCAGCAGTTTTGAGGGGGGATCGACTCAGCCCCAGCATTTT   886    1
#2    TGCAGTGCTAGAGTAAGGGGACGGAGCTCCTACGAGCAGTACTTC  1912    3
#3       TGTGCCAGCAGCGTGGGCGTAAACACCGGGGAGCTGTTTTTT 10804   12
#4 TGCATCGTCAGGGGGGCTGGTGGTACTAGCTATGGAAAGCTGACATTT   960    4
#5          TGTGCCAGCAGTTACCGCCCGAACACTGAAGCTTTCTTT  4286    6
#6          TGTGCCGTGAAAAGGGATAGCAACTATCAGTTAATCTGG  1244    2
#          raw_consensus_id my.raw_clonotype_id clonotype.Freq   proportion
#1  clonotype98_consensus_1      S5_clonotype98              1 0.0001983930
#2  clonotype98_consensus_2      S5_clonotype98              1 0.0001983930
#3 clonotype100_consensus_2     S4_clonotype100              1 0.0001923817
#4 clonotype100_consensus_1     S4_clonotype100              1 0.0001923817
#5   clonotype3_consensus_1       S3_clonotype3             49 0.0070635721
#6  clonotype99_consensus_1      S5_clonotype99              1 0.0001983930
#  total.colonotype
#1             5096
#2             5096
#3             5280
#4             5280
#5             5943
#6             5096


# In this example column number 1 and 2 have the clonotype and barcode info needed to plot. 

# Sort clonotype names with highset frequency:

clonotype.frequency <- as.data.frame(sort(table(as.character(as.matrix(([email protected])[1]))),decreasing = TRUE))

head(clonotype.frequency)
#           Var1 Freq
#1 S2_clonotype1  306
#2 S1_clonotype1  242
#3 S3_clonotype1  232
#4 S4_clonotype1  216
#5 S5_clonotype1  210
#6 S2_clonotype2  113

# let's plot S1_clonotype1 which is seen in 242 cells in all the conditions. 
# if you want to plot only in one condtion or few conditions use this option "conds.to.plot" (e.g. conds.to.plot = c("WT","KO"))
# If conds.to.plot = NULL it would plot all of them (all 242 cells). 

# Plot colonotype 1
clono.plot(my.obj, plot.data.type = "knetl",
   clonotype.column = 1,
   barcode.column = 2,
   clono = "S1_clonotype1",
   conds.to.plot = NULL,
   cell.transparency = 1,
   clust.dim = 2,
   interactive = F)
   
# plot multiple clonotypes 

ordered.clonotypes <- as.character(as.matrix((clonotype.frequency)[1]))

# let's plot top 19 clonotypes with highest frequency:
clonolist <- (ordered.clonotypes)[1:19]
clonolist
# [1] "S2_clonotype1" "S1_clonotype1" "S3_clonotype1" "S4_clonotype1"
# [5] "S5_clonotype1" "S2_clonotype2" "S3_clonotype2" "S1_clonotype2"
# [9] "S2_clonotype4" "S1_clonotype4" "S3_clonotype4" "S2_clonotype3"
#[13] "S4_clonotype2" "S1_clonotype3" "S4_clonotype3" "S5_clonotype2"
#[17] "S3_clonotype3" "S2_clonotype9" "S3_clonotype6"


rm(list = ls(pattern="PL_"))
for(i in clonolist){
   MyPlot <- clono.plot(my.obj, plot.data.type = "knetl",
   clonotype.column = 1,
   barcode.column = 2,
   clono = i,
   conds.to.plot = NULL,
   cell.transparency = 1,
   clust.dim = 2,
   interactive = F)
   NameCol=paste("PL",i,sep="_")
   eval(call("<-", as.name(NameCol), MyPlot))
}

library(cowplot)
filenames <- ls(pattern="PL_")

B= cluster.plot(my.obj,plot.type = "knetl",interactive = F,cell.size = 0.5,cell.transparency = 1,anno.clust=TRUE)
filenames <- c("B",filenames)

png("19_clonotypes.png",width = 20, height = 20, units = 'in', res = 300)
plot_grid(plotlist=mget(filenames))
dev.off()