013‐ Demultiplex hashtag oligos (HTOs) - rezakj/iCellR GitHub Wiki

Read this example file

my.hto <- read.table(file = system.file('extdata', 'dense_umis.tsv', package = 'iCellR'), as.is = TRUE)

# or 
my.data <- load10x("filtered_feature_bc_matrix",gene.name = 2)

Your HTOs are usually in the end data. Use the tail function to check their names

tail(row.names(my.data),5)

# [1] "TotalSeq.C0254_anti.human_Hashtag_4_Antibody"
# [2] "TotalSeq.C0255_anti.human_Hashtag_5_Antibody"
# [3] "TotalSeq.C0256_anti.human_Hashtag_6_Antibody"
# [4] "TotalSeq.C0257_anti.human_Hashtag_7_Antibody"
# [5] "TotalSeq.C0258_anti.human_Hashtag_8_Antibody" 

# your HTOs are usually in the matrix and have names that are different than gene names

In the example above, the word TotalSeq appears at the beginning of all the HTOs. Use the grep function to identify them and store the results in an object.

# your HTO names 
HTOs <- grep("^TotalSeq",row.names(my.data),value=T)

Separate the gene names from the HTO names.

# your gene names 
RNAs <- subset(row.names(my.data), !(row.names(my.data) %in% HTOs))

Separate the gene count matrix from the HTO count matrix.

MyHTOs <- subset(my.data, row.names(my.data) %in% HTOs)
MyRNAs <- subset(my.data, row.names(my.data) %in% RNAs)

dim(MyHTOs)
dim(MyRNAs)

Run iCellR's hto.anno tool.

data <- hto.anno(hto.data = MyHTOs, cov.thr = 3, assignment.thr = 80)

Write the data out

data <- (cbind(ID = rownames(data),data))
write.table((data),"HTOs_annotated_HSThigh.tsv",sep="\t", row.names =F)

head(data)
#                 Hashtag1-GTCAACTCTTTAGCG Hashtag2-TGATGGCCTATTGGG
#TGACAACAGGGCTCTC                        3                       18
#AAGGAGCGTCATTAGC                        7                       24
#AGTGAGGAGACTGTAA                        7                     1761
#ATCCACCCATGTTCCC                      753                       20
#AAACGGGCAGGACCCT                      728                       24
#ATGTGTGAGTCTTGCA                        4                       25
#                 Hashtag3-TTCCGCCTCTCTTTG Hashtag4-AGTAAGTTCAGCGTA
#TGACAACAGGGCTCTC                        7                        0
#AAGGAGCGTCATTAGC                        8                        0
#AGTGAGGAGACTGTAA                        5                        0
#ATCCACCCATGTTCCC                        3                        0
#AAACGGGCAGGACCCT                        3                        0
#ATGTGTGAGTCTTGCA                      370                        0
#                 Hashtag5-AAGTATCGTTTCGCA Hashtag7-TGTCTTTCCTGCCAG unmapped
#TGACAACAGGGCTCTC                      890                        5       17
#AAGGAGCGTCATTAGC                        2                        3        3
#AGTGAGGAGACTGTAA                       11                        3       87
#ATCCACCCATGTTCCC                        5                        6       18
#AAACGGGCAGGACCCT                        9                        3       16
#ATGTGTGAGTCTTGCA                        9                     1011       25
#                    assignment.annotation percent.match coverage low.cov
#TGACAACAGGGCTCTC Hashtag5-AAGTATCGTTTCGCA      94.68085      940   FALSE
#AAGGAGCGTCATTAGC Hashtag2-TGATGGCCTATTGGG      51.06383       47    TRUE
#AGTGAGGAGACTGTAA Hashtag2-TGATGGCCTATTGGG      93.97012     1874   FALSE
#ATCCACCCATGTTCCC Hashtag1-GTCAACTCTTTAGCG      93.54037      805   FALSE
#AAACGGGCAGGACCCT Hashtag1-GTCAACTCTTTAGCG      92.97573      783   FALSE
#ATGTGTGAGTCTTGCA Hashtag7-TGTCTTTCCTGCCAG      70.01385     1444   FALSE
#                 assignment.threshold
#TGACAACAGGGCTCTC      good.assignment
#AAGGAGCGTCATTAGC               unsure
#AGTGAGGAGACTGTAA      good.assignment
#ATCCACCCATGTTCCC      good.assignment
#AAACGGGCAGGACCCT      good.assignment
#ATGTGTGAGTCTTGCA               unsure

Plot

A = ggplot(data, aes(assignment.annotation,percent.match)) +
	geom_jitter(alpha = 0.25, color = "blue") +
	geom_boxplot(alpha = 0.5) + 
	theme_bw() + 
	theme(axis.text.x=element_text(angle=90))

B = ggplot(data, aes(low.cov,percent.match)) +
	geom_jitter(alpha = 0.25, color = "blue") +
	geom_boxplot(alpha = 0.5) + 
	theme_bw() + 
	theme(axis.text.x=element_text(angle=90))

library(gridExtra)
Name="HTO_stats.png"
png(Name, width = 8, height = 8, units = 'in', res = 300)
grid.arrange(A,B,ncol=2)
dev.off()
  • Filtering HTOs
# let's see how many cells are there
dim(data)

# let's say you want to have the cells that are above 80 % likelihood of belonging to an HTO
data <- subset(data, percent.match > 80)

# let's say you want to have the cells that are above 100 in total HTO coverage
data <- subset(data, coverage > 100)

# let's see how many cells are left
dim(data)

Take the HTO IDs that passed filtering

 bestHTOs <- as.character(unique(data$assignment.annotation))

create new files (matrices) for each HTO (with number of cells added to the folder names)

library(Matrix)

# run this function. This will generate and write each sample out. 

for(i in bestHTOs){
	sample <- row.names(subset(data,data$assignment.annotation == i))
	message(paste(" getting sample",i,"..."))
	sample <- MyRNAs[ , which(names(MyRNAs) %in% sample)]
	message(paste(" number of cells",dim(sample)[2]))
	Name=paste("RNAs",i,dim(sample)[2],sep="_")
	message(paste(" writing sample",i,"..."))
dir.create(Name)
COLs <- colnames(sample)
ROWs <- row.names(sample)
colnames(sample) <- NULL
row.names(sample) <- NULL
sparse.gbm <- Matrix(as.matrix.data.frame(sample), sparse = T )
Name1=paste(Name,"matrix.mtx",sep="/")
writeMM(obj = sparse.gbm, file=Name1)
Name1=paste(Name,"barcodes.tsv.gz",sep="/")
write.table((COLs),gzfile(Name1), row.names =FALSE, quote = FALSE, col.names = FALSE)
MY.ROWs <- cbind(ROWs,ROWs)
Name1=paste(Name,"genes.tsv.gz",sep="/")
write.table((MY.ROWs),gzfile(Name1),sep="\t", row.names =F, quote = FALSE, col.names = FALSE)
}