Transcriptomics - ASBioinfo/Utils-hub GitHub Wiki
1. Transcriptome pipeline (Using Salmon)
- Run fastp
fastp --in1 27_R1_001.fastq.gz --in2 27_R2_001.fastq.gz --out1 27_trim_R1.fastq.gz --out2 27_trim_R2.fastq.gz --adapter_fasta adapter_seq.fa --thread 16 -p --overrepresentation_analysis -l 50 -q 20 -u 20 -h 27_report.html -j 27_report.json -R 27_report --trim_front1 2 --trim_tail1 2 --trim_front2 2 --trim_tail2 2 --dont_overwrite
- Run sortme RNA
There is four database is present but choose only smr_v4.3_default_db.fasta as it contain all the features dont go for other until unless looking for a specific type of rRNA
sortmerna --ref rRNA_databases_v4.3.6/smr_v4.3_default_db.fasta --reads 31_trim_R1.fastq --reads 31_trim_R2.fastq --threads 35 --workdir ./31/ --aligned 31_rRNA_reads --fastx --other 31_non_rRNA_reads --paired_in --out2
[!IMPORTANT] Keep only desired RNA types (eg., only protein, tRNA etc.) in .gtf file of reference using script and use in the next step.
- Get fasta using gffread for salmon quantifications
gffread -w Bos_indicus_hybrid_salmon.fasta -g Bos_indicus_hybrid.UOA_Brahman_1.dna_sm.toplevel.fa Bos_indicus_hybrid.UOA_Brahman_non_RNA.gtf
- Indexing using salmon
salmon index --threads 40 -t salmon_Bos_indicus_hybrid.fa -i salmon_index -k 31
Here,
-t -> input
-i -> output
Further run next command in respective sample folder
salmon quant -i ../assembly/salmon_index_protein_coding --threads 40 -l A -1 dis_04R_no_rRNA_R1.fq -2 dis_04R_no_rRNA_R2.fq --validateMappings -o dis_04R
[!NOTE] Then get quant.sf as the output for all the sample in their respective folder
Further below script will be executed in R/Rscript
library(tximport)
library(DESeq2)
library(pheatmap)
library(RColorBrewer)
library(ggplot2)
library(apeglm)
library(vsn)
library(EnhancedVolcano)
library(ashr)
##Starting of tximport after quantify using salmon,chk that each directory have the quant.sf should be there##
samples<-read.csv("design_matrix.txt",header=T) ##design_matrix contains matrix for tximport not to be confused with deseq2 design matrix
tx2gene<-read.csv("tx2gene_protein_coding",sep=",") ##Matrix of two column on contains the transcriptId and one with geneId,derived from gff file
files <- file.path("/home/sarwar/sp058/rnaseq/re_analysis/salmon/protein_coding_quant/", samples$sample, "quant.sf")
samples$sample ->names(files)
txi<- tximport(files, type = "salmon", tx2gene = tx2gene)
write.csv(as.data.frame(txi$counts), file = "tx2gene_protein_coding_counts.csv") ##Save counts from tximport if required neither move to deseq2
##Starting of Deseq2##
col_data <- read.csv('design_matrix.txt', sep=',', row.names=1) #here matrix of diseased and healthy shold be given
#counts <- as.matrix(read.csv('./tximport_deg_only_PC_lnc_combined_counts.csv', sep=',', row.names='gene_id'))
dds <- DESeqDataSetFromTximport(txi = txi,colData = col_data,design= ~ condition) ##as data imported from tximport DESeqDataSetFromTximport should be provided neither provide the DESeqDataSetFromMatrix if taken from featurecounts or prepde.py
#dds <- DESeqDataSetFromMatrix(countData =round(counts), colData = col_data, design = ~condition)
keep <- rowSums(counts(dds)) >= 10
dds <- dds[keep,]
dds$condition <- relevel(dds$condition, ref='healthy')
dds$condition
des <- DESeq(dds)
resultsNames(des)
res <- results(des, alpha = 0.05) ###here to remove outlier lfcThreshold can be given,but not recommendet, not to confuse with log2fc
summary(res)
#for lfcshrinkage of ashr
#reslfcashr <- lfcShrink(des, res=res,coef="condition_diseased_vs_healthy",type="ashr" )
#for lfcshrinkage apeglm
#reslfcapeglm <- lfcShrink(des, res=res,coef="condition_diseased_vs_healthy",type="apeglm" )
#get vst
vsd <- vst(des, blind = F)
#vsd <- vst(des, blind = T,nsub = 3000, fitType = "parametric")
vsc <- assay (vsd) ##vsc just a variable for saving the vst result
write.csv(as.data.frame(vsc), file="protein_coding_vst_full.csv")
#plot sample PCA using vsd
plotPCA(vsd, intgroup = c("condition"))+geom_text(aes(label=name,fontface = "bold"),vjust=2,check_overlap = TRUE)+scale_color_manual(values = c("diseased" = "red", "healthy" = "darkgreen"))+theme(axis.text = element_text(size = 12,face = "bold"),axis.title= element_text(size = 16,face = "bold"),legend.text=element_text(size =14,face = "bold"),legend.title = element_text(size = 14, face = "bold"))
##Volcanoplot
EnhancedVolcano(reslfcashr,lab = rownames(res),x = 'log2FoldChange',y = 'pvalue',pCutoff = 0.05,FCcutoff = 2,pointSize = 2.8,labSize = 4.0)
#MA plot with lfcshrinkage accordingly ashr or apeglm,removed outliers
plotMA(reslfcashr, ylim=c(-5,5))
#standard deviation of the transformed data
meanSdPlot(assay(vsd))
#Heatmap of the sample-to-sample distances
sampleDists <- dist(t(assay(vsd)))
sampleDistMatrix <- as.matrix(sampleDists)
rownames(sampleDistMatrix) <- paste(vsd$label)
colnames(sampleDistMatrix) <- NULL
colors <- colorRampPalette( rev(brewer.pal(9, "Blues")) )(255)
pheatmap(sampleDistMatrix,
clustering_distance_rows=sampleDists,
clustering_distance_cols=sampleDists,
col=colors)
# Heatmap of differentially expressed genes
pal <- colorRampPalette(c("#67001F", 'white', "#053061"))
selected <- order(res$padj, decreasing = F)[1:100]
pheatmap(counts(des)[selected,], cluster_rows = T, show_rownames = TRUE, cluster_cols = FALSE, annotation_col = col_data, breaks = c(seq(1, 10000, by = 100)))
#p-value graph
use <- res$baseMean > metadata(res)$filterThreshold
h1 <- hist(res$pvalue[!use], breaks=0:50/50, plot=FALSE)
h2 <- hist(res$pvalue[use], breaks=0:50/50, plot=FALSE)
colori <- c(`do not pass`="khaki", `pass`="powderblue")
barplot(height = rbind(h1$counts, h2$counts), beside = FALSE,col = colori, space = 0, main = "", ylab="frequency")
text(x = c(0, length(h1$counts)), y = 0, label = paste(c(0,1)),adj = c(0.5,1.7), xpd=NA)
legend("topright", fill=rev(colori), legend=rev(names(colori)))
#boxplot
par(mar=c(8,5,2,2))
boxplot(log10(assays(des)["cooks"](/ASBioinfo/Utils-hub/wiki/"cooks")), range=0, las=2)
#Gene dispersion plot
plotDispEsts(des)
#save upreguated in healthy
healthy_res <- res[(res$padj < 0.05) & (res$log2FoldChange < -2) & !(is.na(res$padj)),] #log2fc is -2
healthy_des <- des[rownames(healthy_res),]
# Select 100 top most DE genes according adjust p-value
healthy_selected <- order(healthy_res$padj, decreasing = F)[1:100]
counts(healthy_des[healthy_selected,])
# Plot heatmap of top 100 genes
pheatmap(counts(healthy_des)[healthy_selected,], cluster_rows = T, show_rownames = TRUE, cluster_cols = FALSE, annotation_col = col_data, breaks = c(seq(1, 10000, by = 100)))
#save upreguated in diseased
diseased_res <- res[(res$padj < 0.05) & (res$log2FoldChange > 2) & !(is.na(res$padj)),] #log2fc is 2
diseased_des <- des[rownames(diseased_res),]
diseased_selected <- order(diseased_res$padj, decreasing = F)[1:100]
counts(diseased_des[diseased_selected,])
pheatmap(counts(diseased_des)[diseased_selected,], cluster_rows = T, show_rownames = TRUE, cluster_cols = FALSE, annotation_col = col_data, breaks = c(seq(1, 10000, by = 100)))
#save both the file
write.table(as.data.frame(healthy_res), file = 'healthy_upregulated.csv', row.names = T, quote = F, sep = ",")
write.table(as.data.frame(diseased_res), file = 'diseased_upregulated.csv', row.names = T, quote = F, sep = ",")
[!TIP] Other important commands..
- STAR aligner
STAR --runThreadN 50 --runMode alignReads --sjdbGTFfile ./assembly/Bos_indicus_hybrid.UOA_Brahman_1.110.gtf --genomeDir ./assembly/ --outSAMstrandField intronMotif --readFilesIn ../sortme_rna/247_non_rRNA_R1.fq ../sortme_rna/247_non_rRNA_R2.fq --outSAMtype BAM SortedByCoordinate --outSAMunmapped None --outFileNamePrefix 247_dis --twopassMode Basic --outFilterIntronMotifs RemoveNoncanonical --alignIntronMin 20 --alignIntronMax 1000000 --alignMatesGapMax 1000000 --outFilterMultimapNmax 20 --alignSJoverhangMin 8 --chimSegmentMin 12 --chimJunctionOverhangMin 12 --chimSegmentReadGapMax 3
- Making of org.db annotation
library(dbplyr)
library(BiocFileCache)
library(AnnotationHub)
library(ensembldb)
ah <- AnnotationHub()
dbFile <- ensDbFromAH(ah["AH110632"])
edb <- EnsDb(dbFile)
makeEnsembldbPackage(ensdb=dbFile, version="0.0.1")
ens.str <- substr(rownames(reslfcashr))
rownames(reslfcashr)
ens.str <-rownames(reslfcashr)
reslfcashr$symbol<-mapIds(edb,keys=ens.str,column="SYMBOL",keytype="GENEID",multiVals="first")