Transcriptomics - ASBioinfo/Utils-hub GitHub Wiki

1. Transcriptome pipeline (Using Salmon)

Run fastp

fastp --in1 27_R1_001.fastq.gz --in2 27_R2_001.fastq.gz --out1 27_trim_R1.fastq.gz --out2 27_trim_R2.fastq.gz --adapter_fasta adapter_seq.fa --thread 16 -p --overrepresentation_analysis -l 50 -q 20 -u 20 -h 27_report.html -j 27_report.json -R 27_report --trim_front1 2 --trim_tail1 2 --trim_front2 2 --trim_tail2 2 --dont_overwrite

Run sortme RNA

There is four database is present but choose only smr_v4.3_default_db.fasta as it contain all the features dont go for other until unless looking for a specific type of rRNA

sortmerna --ref rRNA_databases_v4.3.6/smr_v4.3_default_db.fasta --reads 31_trim_R1.fastq --reads 31_trim_R2.fastq --threads 35 --workdir ./31/ --aligned 31_rRNA_reads --fastx --other 31_non_rRNA_reads --paired_in --out2

[!IMPORTANT] Keep only desired RNA types (eg., only protein, tRNA etc.) in .gtf file of reference using script and use in the next step.

Get fasta using gffread for salmon quantifications

gffread -w Bos_indicus_hybrid_salmon.fasta -g Bos_indicus_hybrid.UOA_Brahman_1.dna_sm.toplevel.fa Bos_indicus_hybrid.UOA_Brahman_non_RNA.gtf

Indexing using salmon

salmon index --threads 40 -t salmon_Bos_indicus_hybrid.fa -i salmon_index -k 31

Here,

-t -> input

-i -> output

Further run next command in respective sample folder

salmon quant -i ../assembly/salmon_index_protein_coding --threads 40 -l A -1 dis_04R_no_rRNA_R1.fq -2 dis_04R_no_rRNA_R2.fq --validateMappings -o dis_04R

[!NOTE] Then get quant.sf as the output for all the sample in their respective folder

Further below script will be executed in R/Rscript

library(tximport)
library(DESeq2)
library(pheatmap)
library(RColorBrewer)
library(ggplot2)
library(apeglm)
library(vsn)
library(EnhancedVolcano)
library(ashr)

##Starting of tximport after quantify using salmon,chk that each directory have the quant.sf should be there##

samples<-read.csv("design_matrix.txt",header=T)									##design_matrix contains matrix for tximport not to be confused with deseq2 design matrix
tx2gene<-read.csv("tx2gene_protein_coding",sep=",")								##Matrix of two column on contains the transcriptId and one with geneId,derived from gff file
files <- file.path("/home/sarwar/sp058/rnaseq/re_analysis/salmon/protein_coding_quant/", samples$sample, "quant.sf")
samples$sample ->names(files)
txi<- tximport(files, type = "salmon", tx2gene = tx2gene)
write.csv(as.data.frame(txi$counts), file = "tx2gene_protein_coding_counts.csv")							##Save counts from tximport if required neither move to deseq2

##Starting of Deseq2##

col_data <- read.csv('design_matrix.txt', sep=',', row.names=1)		#here matrix of diseased and healthy shold be given
#counts <- as.matrix(read.csv('./tximport_deg_only_PC_lnc_combined_counts.csv', sep=',', row.names='gene_id'))

dds <- DESeqDataSetFromTximport(txi = txi,colData = col_data,design= ~ condition)						##as data imported from tximport DESeqDataSetFromTximport should be provided neither provide the DESeqDataSetFromMatrix if taken from featurecounts or prepde.py
#dds <- DESeqDataSetFromMatrix(countData =round(counts), colData = col_data, design = ~condition)
keep <- rowSums(counts(dds)) >= 10                         
dds <- dds[keep,]
dds$condition <- relevel(dds$condition, ref='healthy')
dds$condition
des <- DESeq(dds)
resultsNames(des)
res <- results(des, alpha = 0.05)							###here to remove outlier lfcThreshold can be given,but not recommendet, not to confuse with log2fc
summary(res)
#for lfcshrinkage of ashr
#reslfcashr <- lfcShrink(des, res=res,coef="condition_diseased_vs_healthy",type="ashr" )

#for lfcshrinkage apeglm
#reslfcapeglm <- lfcShrink(des, res=res,coef="condition_diseased_vs_healthy",type="apeglm" )

#get vst
vsd <- vst(des, blind = F)
#vsd <- vst(des, blind = T,nsub = 3000, fitType = "parametric")
vsc <- assay (vsd)                                                                                                                    ##vsc just a variable for saving the vst result
write.csv(as.data.frame(vsc), file="protein_coding_vst_full.csv")

#plot sample PCA using vsd

plotPCA(vsd, intgroup = c("condition"))+geom_text(aes(label=name,fontface = "bold"),vjust=2,check_overlap = TRUE)+scale_color_manual(values = c("diseased" = "red", "healthy" = "darkgreen"))+theme(axis.text = element_text(size = 12,face = "bold"),axis.title= element_text(size = 16,face = "bold"),legend.text=element_text(size =14,face = "bold"),legend.title = element_text(size = 14, face = "bold"))

##Volcanoplot

EnhancedVolcano(reslfcashr,lab = rownames(res),x = 'log2FoldChange',y = 'pvalue',pCutoff = 0.05,FCcutoff = 2,pointSize = 2.8,labSize = 4.0)

#MA plot with lfcshrinkage accordingly ashr or apeglm,removed outliers
plotMA(reslfcashr, ylim=c(-5,5))

#standard deviation of the transformed data
meanSdPlot(assay(vsd))
#Heatmap of the sample-to-sample distances
sampleDists <- dist(t(assay(vsd)))
sampleDistMatrix <- as.matrix(sampleDists)
rownames(sampleDistMatrix) <- paste(vsd$label)
colnames(sampleDistMatrix) <- NULL
colors <- colorRampPalette( rev(brewer.pal(9, "Blues")) )(255)
pheatmap(sampleDistMatrix,
         clustering_distance_rows=sampleDists,
         clustering_distance_cols=sampleDists,
         col=colors)
# Heatmap of differentially expressed genes
pal <- colorRampPalette(c("#67001F", 'white', "#053061"))
selected <- order(res$padj, decreasing = F)[1:100]
pheatmap(counts(des)[selected,], cluster_rows = T, show_rownames = TRUE, cluster_cols = FALSE, annotation_col = col_data, breaks = c(seq(1, 10000, by = 100)))

#p-value graph
use <- res$baseMean > metadata(res)$filterThreshold
h1 <- hist(res$pvalue[!use], breaks=0:50/50, plot=FALSE)
h2 <- hist(res$pvalue[use], breaks=0:50/50, plot=FALSE)
colori <- c(`do not pass`="khaki", `pass`="powderblue")

barplot(height = rbind(h1$counts, h2$counts), beside = FALSE,col = colori, space = 0, main = "", ylab="frequency")
text(x = c(0, length(h1$counts)), y = 0, label = paste(c(0,1)),adj = c(0.5,1.7), xpd=NA)
legend("topright", fill=rev(colori), legend=rev(names(colori)))

#boxplot
par(mar=c(8,5,2,2))
boxplot(log10(assays(des)["cooks"](/ASBioinfo/Utils-hub/wiki/"cooks")), range=0, las=2)

#Gene dispersion plot
plotDispEsts(des)

#save upreguated in healthy
healthy_res <- res[(res$padj < 0.05) & (res$log2FoldChange < -2) & !(is.na(res$padj)),]                                                               #log2fc is -2
healthy_des <- des[rownames(healthy_res),]
# Select 100 top most DE genes according adjust p-value
healthy_selected <- order(healthy_res$padj, decreasing = F)[1:100]
counts(healthy_des[healthy_selected,])
# Plot heatmap of top 100 genes
pheatmap(counts(healthy_des)[healthy_selected,], cluster_rows = T, show_rownames = TRUE, cluster_cols = FALSE, annotation_col = col_data, breaks = c(seq(1, 10000, by = 100)))

#save upreguated in diseased
diseased_res <- res[(res$padj < 0.05) & (res$log2FoldChange > 2) & !(is.na(res$padj)),]                                                              #log2fc is  2
diseased_des <- des[rownames(diseased_res),]
diseased_selected <- order(diseased_res$padj, decreasing = F)[1:100]
counts(diseased_des[diseased_selected,])
pheatmap(counts(diseased_des)[diseased_selected,], cluster_rows = T, show_rownames = TRUE, cluster_cols = FALSE, annotation_col = col_data, breaks = c(seq(1, 10000, by = 100)))

#save both the file
write.table(as.data.frame(healthy_res), file = 'healthy_upregulated.csv', row.names = T, quote = F, sep = ",")
write.table(as.data.frame(diseased_res), file = 'diseased_upregulated.csv', row.names = T, quote = F, sep = ",")

[!TIP] Other important commands..

STAR aligner

STAR --runThreadN 50 --runMode alignReads --sjdbGTFfile ./assembly/Bos_indicus_hybrid.UOA_Brahman_1.110.gtf --genomeDir ./assembly/ --outSAMstrandField intronMotif --readFilesIn ../sortme_rna/247_non_rRNA_R1.fq ../sortme_rna/247_non_rRNA_R2.fq --outSAMtype BAM SortedByCoordinate --outSAMunmapped None --outFileNamePrefix 247_dis --twopassMode Basic --outFilterIntronMotifs RemoveNoncanonical --alignIntronMin 20 --alignIntronMax 1000000 --alignMatesGapMax 1000000 --outFilterMultimapNmax 20 --alignSJoverhangMin 8 --chimSegmentMin 12 --chimJunctionOverhangMin 12 --chimSegmentReadGapMax 3

Making of org.db annotation

library(dbplyr)
library(BiocFileCache)
library(AnnotationHub)
library(ensembldb)

ah <- AnnotationHub()
dbFile <- ensDbFromAH(ah["AH110632"])
edb <- EnsDb(dbFile)
makeEnsembldbPackage(ensdb=dbFile, version="0.0.1")

ens.str <- substr(rownames(reslfcashr))
rownames(reslfcashr)
ens.str <-rownames(reslfcashr)
reslfcashr$symbol<-mapIds(edb,keys=ens.str,column="SYMBOL",keytype="GENEID",multiVals="first")