Amplicon Analysis - iffatAGheyas/bioinformatics-tutorial-wiki GitHub Wiki
In this section we’ll process amplicon data to infer exact sequence variants (ASVs) with DADA2, run a full workflow in QIIME 2, and assign taxonomy using reference databases (SILVA, Greengenes).
Installation
if (!requireNamespace("BiocManager", quietly=TRUE))
install.packages("BiocManager")
BiocManager::install("dada2")
library(dada2)
# 1. Define file paths (after trimming)
path <- "trimmed/"
fnFs <- sort(list.files(path, pattern="_R1.trimmed.fastq.gz", full.names=TRUE))
fnRs <- sort(list.files(path, pattern="_R2.trimmed.fastq.gz", full.names=TRUE))
# 2. Filter & trim (optional, if not already done)
filtFs <- file.path(path, "filtered", basename(fnFs))
filtRs <- file.path(path, "filtered", basename(fnRs))
out <- filterAndTrim(fnFs, filtFs, fnRs, filtRs,
truncLen=c(240,200), maxN=0, maxEE=c(2,2),
truncQ=2, rm.phix=TRUE, compress=TRUE, multithread=TRUE)
# 3. Learn error rates
errF <- learnErrors(filtFs, multithread=TRUE)
errR <- learnErrors(filtRs, multithread=TRUE)
# 4. Dereplicate
derepFs <- derepFastq(filtFs, verbose=TRUE)
derepRs <- derepFastq(filtRs, verbose=TRUE)
# 5. Sample inference (core DADA algorithm)
dadaFs <- dada(derepFs, err=errF, multithread=TRUE)
dadaRs <- dada(derepRs, err=errR, multithread=TRUE)
# 6. Merge paired reads
mergers <- mergePairs(dadaFs, derepFs, dadaRs, derepRs, verbose=TRUE)
# 7. Create ASV count table
seqtab <- makeSequenceTable(mergers)
# 8. Remove chimeras
seqtab.nochim <- removeBimeraDenovo(seqtab, method="consensus",
multithread=TRUE, verbose=TRUE)
# 9. Inspect dimensions
dim(seqtab.nochim) # samples × ASVs
asv_seqs <- colnames(seqtab.nochim)
asv_headers <- paste0(">ASV", seq_len(length(asv_seqs)))
asv_fasta <- c(rbind(asv_headers, asv_seqs))
write(asv_fasta, file="results/ASVs.fa")
write.csv(seqtab.nochim, file="results/ASV_counts.csv", row.names=TRUE)
conda create -n qiime2 -c qiime2 -c conda-forge qiime2
conda activate qiime2
# 1) Import trimmed FASTQs
qiime tools import \
--type 'SampleData[PairedEndSequencesWithQuality]' \
--input-path trimmed/ \
--input-format CasavaOneEightSingleLanePerSampleDirFmt \
--output-path demux.qza
# 2) Visualize quality
qiime demux summarize \
--i-data demux.qza \
--o-visualization demux.qzv
# 3) DADA2 denoise
qiime dada2 denoise-paired \
--i-demultiplexed-seqs demux.qza \
--p-trunc-len-f 240 \
--p-trunc-len-r 200 \
--o-table table.qza \
--o-representative-sequences rep-seqs.qza \
--o-denoising-stats denoising-stats.qza
# 4) View results
qiime feature-table summarize \
--i-table table.qza \
--o-visualization table.qzv
qiime metadata tabulate \
--m-input-file denoising-stats.qza \
--o-visualization denoising-stats.qzv
- DADA2 assignTaxonomy Download a pretrained SILVA database (e.g. SILVA v138):
wget -O silva_train_set.fa.gz \
https://zenodo.org/record/4587955/files/silva_nr_v138_train_set.fa.gz
gunzip silva_train_set.fa.gz
wget -O silva_species_assignment.fa.gz \
https://zenodo.org/record/4587955/files/silva_species_assignment_v138.fa.gz
gunzip silva_species_assignment.fa.gz
In R:
taxa <- assignTaxonomy(seqtab.nochim,
"silva_nr_v138_train_set.fa",
multithread=TRUE)
taxa <- addSpecies(taxa, "silva_species_assignment_v138.fa")
write.csv(taxa, "results/ASV_taxonomy.csv", row.names=TRUE)
- QIIME 2 feature-classifier
# 1) Import reference
qiime tools import \
--type 'FeatureData[Sequence]' \
--input-path silva_138_99_seqs.fa \
--output-path silva_ref_seqs.qza
qiime tools import \
--type 'FeatureData[Taxonomy]' \
--input-path silva_138_7_taxonomy.txt \
--output-path silva_ref_tax.qza
# 2) Train classifier (only once)
qiime feature-classifier fit-classifier-naive-bayes \
--i-reference-reads silva_ref_seqs.qza \
--i-reference-taxonomy silva_ref_tax.qza \
--o-classifier silva_classifier.qza
# 3) Classify ASVs
qiime feature-classifier classify-sklearn \
--i-classifier silva_classifier.qza \
--i-reads rep-seqs.qza \
--o-classification taxonomy.qza
qiime metadata tabulate \
--m-input-file taxonomy.qza \
--o-visualization taxonomy.qzv
Next: move on to 6.2.5 Shotgun Taxonomic Profiling or begin diversity analysis (Section 6.2.7).