013‐ Demultiplex hashtag oligos (HTOs) - rezakj/iCellR GitHub Wiki
Read this example file
my.hto <- read.table(file = system.file('extdata', 'dense_umis.tsv', package = 'iCellR'), as.is = TRUE)
# or
my.data <- load10x("filtered_feature_bc_matrix",gene.name = 2)
Your HTOs are usually in the end data. Use the tail function to check their names
tail(row.names(my.data),5)
# [1] "TotalSeq.C0254_anti.human_Hashtag_4_Antibody"
# [2] "TotalSeq.C0255_anti.human_Hashtag_5_Antibody"
# [3] "TotalSeq.C0256_anti.human_Hashtag_6_Antibody"
# [4] "TotalSeq.C0257_anti.human_Hashtag_7_Antibody"
# [5] "TotalSeq.C0258_anti.human_Hashtag_8_Antibody"
# your HTOs are usually in the matrix and have names that are different than gene names
In the example above, the word TotalSeq appears at the beginning of all the HTOs. Use the grep function to identify them and store the results in an object.
# your HTO names
HTOs <- grep("^TotalSeq",row.names(my.data),value=T)
Separate the gene names from the HTO names.
# your gene names
RNAs <- subset(row.names(my.data), !(row.names(my.data) %in% HTOs))
Separate the gene count matrix from the HTO count matrix.
MyHTOs <- subset(my.data, row.names(my.data) %in% HTOs)
MyRNAs <- subset(my.data, row.names(my.data) %in% RNAs)
dim(MyHTOs)
dim(MyRNAs)
Run iCellR's hto.anno tool.
data <- hto.anno(hto.data = MyHTOs, cov.thr = 3, assignment.thr = 80)
Write the data out
data <- (cbind(ID = rownames(data),data))
write.table((data),"HTOs_annotated_HSThigh.tsv",sep="\t", row.names =F)
head(data)
# Hashtag1-GTCAACTCTTTAGCG Hashtag2-TGATGGCCTATTGGG
#TGACAACAGGGCTCTC 3 18
#AAGGAGCGTCATTAGC 7 24
#AGTGAGGAGACTGTAA 7 1761
#ATCCACCCATGTTCCC 753 20
#AAACGGGCAGGACCCT 728 24
#ATGTGTGAGTCTTGCA 4 25
# Hashtag3-TTCCGCCTCTCTTTG Hashtag4-AGTAAGTTCAGCGTA
#TGACAACAGGGCTCTC 7 0
#AAGGAGCGTCATTAGC 8 0
#AGTGAGGAGACTGTAA 5 0
#ATCCACCCATGTTCCC 3 0
#AAACGGGCAGGACCCT 3 0
#ATGTGTGAGTCTTGCA 370 0
# Hashtag5-AAGTATCGTTTCGCA Hashtag7-TGTCTTTCCTGCCAG unmapped
#TGACAACAGGGCTCTC 890 5 17
#AAGGAGCGTCATTAGC 2 3 3
#AGTGAGGAGACTGTAA 11 3 87
#ATCCACCCATGTTCCC 5 6 18
#AAACGGGCAGGACCCT 9 3 16
#ATGTGTGAGTCTTGCA 9 1011 25
# assignment.annotation percent.match coverage low.cov
#TGACAACAGGGCTCTC Hashtag5-AAGTATCGTTTCGCA 94.68085 940 FALSE
#AAGGAGCGTCATTAGC Hashtag2-TGATGGCCTATTGGG 51.06383 47 TRUE
#AGTGAGGAGACTGTAA Hashtag2-TGATGGCCTATTGGG 93.97012 1874 FALSE
#ATCCACCCATGTTCCC Hashtag1-GTCAACTCTTTAGCG 93.54037 805 FALSE
#AAACGGGCAGGACCCT Hashtag1-GTCAACTCTTTAGCG 92.97573 783 FALSE
#ATGTGTGAGTCTTGCA Hashtag7-TGTCTTTCCTGCCAG 70.01385 1444 FALSE
# assignment.threshold
#TGACAACAGGGCTCTC good.assignment
#AAGGAGCGTCATTAGC unsure
#AGTGAGGAGACTGTAA good.assignment
#ATCCACCCATGTTCCC good.assignment
#AAACGGGCAGGACCCT good.assignment
#ATGTGTGAGTCTTGCA unsure
Plot
A = ggplot(data, aes(assignment.annotation,percent.match)) +
geom_jitter(alpha = 0.25, color = "blue") +
geom_boxplot(alpha = 0.5) +
theme_bw() +
theme(axis.text.x=element_text(angle=90))
B = ggplot(data, aes(low.cov,percent.match)) +
geom_jitter(alpha = 0.25, color = "blue") +
geom_boxplot(alpha = 0.5) +
theme_bw() +
theme(axis.text.x=element_text(angle=90))
library(gridExtra)
Name="HTO_stats.png"
png(Name, width = 8, height = 8, units = 'in', res = 300)
grid.arrange(A,B,ncol=2)
dev.off()
- Filtering HTOs
# let's see how many cells are there
dim(data)
# let's say you want to have the cells that are above 80 % likelihood of belonging to an HTO
data <- subset(data, percent.match > 80)
# let's say you want to have the cells that are above 100 in total HTO coverage
data <- subset(data, coverage > 100)
# let's see how many cells are left
dim(data)
Take the HTO IDs that passed filtering
bestHTOs <- as.character(unique(data$assignment.annotation))
create new files (matrices) for each HTO (with number of cells added to the folder names)
library(Matrix)
# run this function. This will generate and write each sample out.
for(i in bestHTOs){
sample <- row.names(subset(data,data$assignment.annotation == i))
message(paste(" getting sample",i,"..."))
sample <- MyRNAs[ , which(names(MyRNAs) %in% sample)]
message(paste(" number of cells",dim(sample)[2]))
Name=paste("RNAs",i,dim(sample)[2],sep="_")
message(paste(" writing sample",i,"..."))
dir.create(Name)
COLs <- colnames(sample)
ROWs <- row.names(sample)
colnames(sample) <- NULL
row.names(sample) <- NULL
sparse.gbm <- Matrix(as.matrix.data.frame(sample), sparse = T )
Name1=paste(Name,"matrix.mtx",sep="/")
writeMM(obj = sparse.gbm, file=Name1)
Name1=paste(Name,"barcodes.tsv.gz",sep="/")
write.table((COLs),gzfile(Name1), row.names =FALSE, quote = FALSE, col.names = FALSE)
MY.ROWs <- cbind(ROWs,ROWs)
Name1=paste(Name,"genes.tsv.gz",sep="/")
write.table((MY.ROWs),gzfile(Name1),sep="\t", row.names =F, quote = FALSE, col.names = FALSE)
}