RNAseq_Analysis_Pipeline - DR-genomics/Genomics-pipelines GitHub Wiki
Workflow - RNAseq Analysis using Salmon & DEseq2
Quantification of transcripts in various tissue samples using salmon
Create index of the transcriptome
salmon index -t PO1735_Microstegium_vimineum.transcript.fasta -i Mvim_index
Quantify the samples all at once using the following script
#!/bin/bash
for i in /data/dhanu/JS_assembly_files/RNAseq_data/*_R1_001.fastq.gz
do
j=${i%%_R1_001.fastq.gz}"_R2_001.fastq.gz"
describer=$(echo ${i} | sed 's/_R1_001.fastq.gz//')
echo "Processing sample ${describer}"
# salmon quant -i Sorghum_index -l A -1 $i -2 $j -p 8 --validateMappings --gcBias -o quants/${describer}_Sbi_quant
salmon quant -i Mvim_index -l A -1 $i -2 $j -p 8 --validateMappings --gcBias -o quants/${describer}_Mvim_quant
done
~
Import reads using tximport in R To see which genes are differentially expressed, we need to link transcript names to the gene names. Using the gtf annotation file and GenomicFeature package, follow steps below:
if (!requireNamespace("BiocManager", quietly = TRUE))
+ install.packages("BiocManager")
BiocManager::install("tximport")
BiocManager::install("GenomicFeatures")
BiocManager::install("readr")
library(tximport)
library(GenomicFeatures)
library(readr)
setwd('/Path_to_salmon_output/quants/')
txdb <- makeTxDbFromGFF("PO1735_Microstegium_vimineum.annotation.gtf")
k <- keys(txdb, keytype="TXNAME")
tx2gene <- select(txdb, k, "GENEID", "TXNAME")
head(tx2gene)
Import the salmon quantification
samples <- read.table("samples.txt", header = TRUE)
files <- file.path("quants_Mvim_ref", samples$sample, "quant.sf")
names(files) <- paste0(samples$sample)
txi <- tximport(files, type = "salmon", tx2gene = tx2gene)
head(txi$counts)
txi object is simply a list of matrices
names(txi)
#> [1] "abundance" "counts" "length"
#> [4] "countsFromAbundance"
txi$counts[1:3,1:3]
1-leaf_Mvim_quant 2-chasmogamous-inflorescence_Mvim_quant
ANN00001 81 382.341
ANN00002 2 12.000
ANN00003 3 55.996
3-cleistogamous-inflorescence_Mvim_quant
ANN00001 309.002
ANN00002 8.000
ANN00003 32.001
txi$abundance
1-leaf_Mvim_quant 2-chasmogamous-inflorescence_Mvim_quant
ANN00001 5.655971 16.023565
ANN00002 1.079559 3.984520
ANN00003 0.679264 7.631967
3-cleistogamous-inflorescence_Mvim_quant
ANN00001 14.006744
ANN00002 2.780923
ANN00003 4.556543
Diff. Expression using DESeq2
BiocManager::install("DESeq2")
library(DESeq2)
Generate result table from DESeq Data set
dds <- DESeqDataSetFromTximport(txi, samples, ~condition)
dds <- DESeq(dds)
res <- results(dds)
summary(res)
Plotting
Dispersion plot
plotDispEsts(dds, main="Dispersion plot")
Clustering & heatmaps
rld <- rlogTransformation(dds)
library(RColorBrewer)
library(gplots)
(mycols <- brewer.pal(8, "Dark2")[1:length(unique(samples$condition))])
sampleDists <- as.matrix(dist(t(assay(rld))))
heatmap.2(as.matrix(sampleDists), key=F, trace="none",
col=colorpanel(100, "black", "white"),
ColSideColors=mycols[samples$condition],
RowSideColors=mycols[samples$condition],
margin=c(10, 10), main="Sample Distance Matrix")
PCA plot
DESeq2::plotPCA(rld, intgroup="condition")
To look at p-values
table(res$padj<0.05)
res <- res[order(res$padj), ]
resdata <- merge(as.data.frame(res), as.data.frame(counts(dds, normalized=TRUE)), by="row.names", sort=FALSE)
names(resdata)[1] <- "Gene"
head(resdata)
Examine plot of p-values, the MA plot and the Volcano Plot: MA plot: By using DESeq2 builtin function [, it throws: Error in results(object, ...) : unused argument (cex = 1)
Volcano plot
with(res, plot(log2FoldChange, -log10(pvalue), pch=20, main="Volcano plot", xlim=c(-2.5,2)))
with(subset(res, padj<.05 ), points(log2FoldChange, -log10(pvalue), pch=20, col="red"))
Annotation