RNAseq_Analysis_Pipeline - DR-genomics/Genomics-pipelines GitHub Wiki

Workflow - RNAseq Analysis using Salmon & DEseq2

Quantification of transcripts in various tissue samples using salmon

Create index of the transcriptome

salmon index -t PO1735_Microstegium_vimineum.transcript.fasta -i Mvim_index

Quantify the samples all at once using the following script

#!/bin/bash
for i in  /data/dhanu/JS_assembly_files/RNAseq_data/*_R1_001.fastq.gz
do
        j=${i%%_R1_001.fastq.gz}"_R2_001.fastq.gz"
        describer=$(echo ${i} | sed 's/_R1_001.fastq.gz//')
        echo "Processing sample ${describer}"

#       salmon quant -i Sorghum_index -l A -1 $i -2 $j -p 8 --validateMappings --gcBias -o quants/${describer}_Sbi_quant
        salmon quant -i Mvim_index -l A -1 $i -2 $j -p 8 --validateMappings --gcBias -o quants/${describer}_Mvim_quant
done
~

Import reads using tximport in R To see which genes are differentially expressed, we need to link transcript names to the gene names. Using the gtf annotation file and GenomicFeature package, follow steps below:

if (!requireNamespace("BiocManager", quietly = TRUE))
+     install.packages("BiocManager")

BiocManager::install("tximport")
BiocManager::install("GenomicFeatures")
BiocManager::install("readr")

library(tximport)
library(GenomicFeatures)
library(readr)

setwd('/Path_to_salmon_output/quants/')
txdb <- makeTxDbFromGFF("PO1735_Microstegium_vimineum.annotation.gtf")
k <- keys(txdb, keytype="TXNAME")
tx2gene <- select(txdb, k, "GENEID", "TXNAME")
head(tx2gene)

Import the salmon quantification

samples <- read.table("samples.txt", header = TRUE)
files <- file.path("quants_Mvim_ref", samples$sample, "quant.sf")
names(files) <- paste0(samples$sample)
txi <- tximport(files, type = "salmon", tx2gene = tx2gene)
head(txi$counts)

txi object is simply a list of matrices

names(txi)
#> [1] "abundance" "counts" "length"
#> [4] "countsFromAbundance"

txi$counts[1:3,1:3]
 1-leaf_Mvim_quant 2-chasmogamous-inflorescence_Mvim_quant
ANN00001                81                                 382.341
ANN00002                 2                                  12.000
ANN00003                 3                                  55.996
         3-cleistogamous-inflorescence_Mvim_quant
ANN00001                                  309.002
ANN00002                                    8.000
ANN00003                                   32.001

txi$abundance
  1-leaf_Mvim_quant 2-chasmogamous-inflorescence_Mvim_quant
ANN00001          5.655971                               16.023565
ANN00002          1.079559                                3.984520
ANN00003          0.679264                                7.631967
         3-cleistogamous-inflorescence_Mvim_quant
ANN00001                                14.006744
ANN00002                                 2.780923
ANN00003                                 4.556543

Diff. Expression using DESeq2

BiocManager::install("DESeq2")
library(DESeq2)

Generate result table from DESeq Data set

dds <- DESeqDataSetFromTximport(txi, samples, ~condition)
dds <- DESeq(dds)
res <- results(dds)
summary(res)

Plotting

Dispersion plot

plotDispEsts(dds, main="Dispersion plot")

Clustering & heatmaps

rld <- rlogTransformation(dds)
library(RColorBrewer)
library(gplots)

(mycols <- brewer.pal(8, "Dark2")[1:length(unique(samples$condition))])
sampleDists <- as.matrix(dist(t(assay(rld))))
heatmap.2(as.matrix(sampleDists), key=F, trace="none",
          col=colorpanel(100, "black", "white"),
          ColSideColors=mycols[samples$condition],
          RowSideColors=mycols[samples$condition],
          margin=c(10, 10), main="Sample Distance Matrix")

PCA plot

DESeq2::plotPCA(rld, intgroup="condition")

To look at p-values

table(res$padj<0.05)
res <- res[order(res$padj), ]
resdata <- merge(as.data.frame(res), as.data.frame(counts(dds, normalized=TRUE)), by="row.names", sort=FALSE)
names(resdata)[1] <- "Gene"
head(resdata)

Examine plot of p-values, the MA plot and the Volcano Plot: MA plot: By using DESeq2 builtin function [, it throws: Error in results(object, ...) : unused argument (cex = 1)

Volcano plot

with(res, plot(log2FoldChange, -log10(pvalue), pch=20, main="Volcano plot", xlim=c(-2.5,2)))
with(subset(res, padj<.05 ), points(log2FoldChange, -log10(pvalue), pch=20, col="red"))

Annotation