Run an R script on HiPerGator
If you have >100 samples from your Illumina amplicon sequencing run, you will most likely need to run dada2 (to pick ASVs and assign taxonomy) on HiPerGator instead of your laptop. In addition to a directory of paired fastq files that have already been through cutadapt to remove primers and Illumina adapters, you will need two files: a bash script and an R script.
Example of a bash script (using the burst QOS) to run an R script:
#!/bin/bash
#SBATCH --job-name=qualityfilter
#SBATCH --output=qualityfilter_%j.log
#SBATCH --error=qualityfilter_%j.err
#SBATCH --mail-type=BEGIN,END,FAIL
#SBATCH --mail-user [email protected]
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=2
#SBATCH --mem-per-cpu=60gb
#SBATCH --time=4-00:00:00
#SBATCH --account=juliemeyer
#SBATCH --qos=juliemeyer-b
#
pwd; hostname; date
cd $SLURM_SUBMIT_DIR
module load R
Rscript dada2_hipergator.R
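With both scripts saved in the same HiPerGator directory as your cutadapt-trimmed fastq files, submit the job from that directory. The bash script file name below is an assumption (the wiki only names the R script); substitute whatever you called yours.
sbatch dada2_hipergator.sh
squeue -u your_gatorlink    # check that the job is pending or running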
You will also need the file that contains the R code named in the bash script (the example below would be named "dada2_hipergator.R"). An example of the beginning of an R script is shown below. Don't forget to update the path name for the directory that holds all of your fastq files.
library(dada2)
library(ggplot2)
writeLines(capture.output(sessionInfo()), "sessionInfo.txt")
#### 1st sequencing run
path <- "blue/juliemeyer/juliemeyer/cutadapt_NS2107_BS2_August"
list.files(path)
fnFs <- sort(list.files(path, pattern="_R1_cut.fastq.gz", full.names = TRUE))
fnRs <- sort(list.files(path, pattern="_R2_cut.fastq.gz", full.names = TRUE))
sample.names <- sapply(strsplit(basename(fnFs), "_"), `[`, 1)
# Perform filtering and trimming
filt_path <- file.path(path, "filtered")
filtFs <- file.path(filt_path, paste0(sample.names, "_F_filt.fastq.gz"))
filtRs <- file.path(filt_path, paste0(sample.names, "_R_filt.fastq.gz"))
out <- filterAndTrim(fnFs, filtFs, fnRs, filtRs, truncLen=c(150,150),
maxN=0, maxEE=c(2,2), truncQ=2, rm.phix=TRUE,
compress=TRUE, multithread=TRUE)
head(out)
# Learn the Error Rates, it TAKES TIME!
errF <- learnErrors(filtFs, multithread=TRUE)
errR <- learnErrors(filtRs, multithread=TRUE)
plotErrors(errF, nominalQ=TRUE)
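# Note: when this script is run non-interactively with Rscript, the plot above goes to the
# default Rplots.pdf in the working directory. If you prefer a named file, one option (an
# addition, not in the original script; the file name is just an example) is ggsave() from
# ggplot2, which works because plotErrors() returns a ggplot object:
ggsave("errF_plot.pdf", plotErrors(errF, nominalQ=TRUE))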
# Dereplicate the filtered fastq files
derepFs <- derepFastq(filtFs, verbose=TRUE)
derepRs <- derepFastq(filtRs, verbose=TRUE)
names(derepFs) <- sample.names
names(derepRs) <- sample.names
# Infer the sequence variants in each sample
dadaFs <- dada(derepFs, err=errF, multithread=TRUE)
dadaRs <- dada(derepRs, err=errR, multithread=TRUE)
# Inspecting the dada-class object returned by dada:
dadaFs[[1]]
# Merge the denoised forward and reverse reads:
mergers <- mergePairs(dadaFs, derepFs, dadaRs, derepRs, verbose=TRUE)
# Inspect the merger data.frame from the first sample
head(mergers[[1]])
# Construct sequence table
seqtab <- makeSequenceTable(mergers)
dim(seqtab)
# Inspect distribution of sequence lengths
table(nchar(getSequences(seqtab)))
getN <- function(x) sum(getUniques(x))
track <- cbind(out, sapply(dadaFs, getN), sapply(mergers, getN), rowSums(seqtab))
colnames(track) <- c("input", "filtered", "denoised", "merged", "tabled")
rownames(track) <- sample.names
head(track)
write.table(track, "dada_read_stats1.txt",sep="\t",col.names=NA)
saveRDS(seqtab, "~/Desktop/BS2s/cutadapt_NS2107_BS2_August/seqtab.rds")
I would recommend running the dada2 steps through taxonomy assignment, and fixing the NAs in the taxa table, on HiPerGator (shown below). Make sure you have the taxonomy training set in your directory (adjust the path name). As of summer 2021, the most current taxonomy database you should be using is SILVA v138.1 (available in /blue/juliemeyer/share). Be sure to save your R data (.rds file) and text files at the end.
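The taxonomy step below takes seqtab.nochim, the chimera-free sequence table, which comes from dada2's removeBimeraDenovo() and is not shown in the script above. A minimal sketch of that step is given here; the mergeSequenceTables() line is only needed if you processed more than one sequencing run (as the "1st sequencing run" header suggests), and the .rds file names in it are examples, not from the original script.
# Combine per-run sequence tables first if you have more than one run (file names are examples):
# seqtab <- mergeSequenceTables(readRDS("seqtab_run1.rds"), readRDS("seqtab_run2.rds"))
# Remove chimeras to produce the seqtab.nochim object used for taxonomy assignment below
seqtab.nochim <- removeBimeraDenovo(seqtab, method="consensus", multithread=TRUE, verbose=TRUE)
dim(seqtab.nochim)
sum(seqtab.nochim)/sum(seqtab)   # fraction of reads kept after chimera removal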
taxa <- assignTaxonomy(seqtab.nochim, "/blue/juliemeyer/juliemeyer/silva_nr99_v138.1_train_set.fa.gz", multithread=TRUE)
# FIX the NAs in the taxa table
taxon <- as.data.frame(taxa,stringsAsFactors=FALSE)
taxon$Phylum[is.na(taxon$Phylum)] <- taxon$Kingdom[is.na(taxon$Phylum)]
taxon$Class[is.na(taxon$Class)] <- taxon$Phylum[is.na(taxon$Class)]
taxon$Order[is.na(taxon$Order)] <- taxon$Class[is.na(taxon$Order)]
taxon$Family[is.na(taxon$Family)] <- taxon$Order[is.na(taxon$Family)]
taxon$Genus[is.na(taxon$Genus)] <- taxon$Family[is.na(taxon$Genus)]
write.table(taxon,"silva_taxa_table.txt",sep="\t",col.names=NA)
write.table(seqtab.nochim, "silva_otu_table.txt",sep="\t",col.names=NA)
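To follow the advice above about saving your R data at the end of the HiPerGator run, a couple of saveRDS() calls like the following will do; the file names are just examples.
# Save the chimera-free sequence table and the corrected taxonomy table for later sessions
saveRDS(seqtab.nochim, "seqtab_nochim.rds")
saveRDS(taxon, "silva_taxa.rds")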
Now you are ready to open the ASV table and taxonomy table, along with your metadata, on your local computer and create a phyloseq object. I use phyloseq to remove chloroplasts and mitochondria, then export the ASV and taxonomy tables again, along with the ASV table converted to relative abundance, for future reference.
# Create phyloseq object from the OTU and taxonomy tables from dada2, along with the sample metadata.
library(phyloseq)
otu <- read.table("silva_otu_table.txt",sep="\t",header=TRUE,row.names=1)
taxon <- read.table("silva_taxa_table.txt",sep="\t",header=TRUE,row.names=1)
samples <- read.table("BS2_metadata.txt",sep="\t",header=TRUE,row.names=1)
OTU = otu_table(otu, taxa_are_rows=FALSE)
taxon <- as.matrix(taxon)
TAX = tax_table(taxon)
sampledata = sample_data(samples)
ps <- phyloseq(OTU, sampledata, TAX)
ps
# remove chloroplasts and mitochondria and Eukaryota
get_taxa_unique(ps, "Family")
get_taxa_unique(ps, "Order")
get_taxa_unique(ps, "Kingdom")
ps <- subset_taxa(ps, Family !="Mitochondria")
ps <- subset_taxa(ps, Order !="Chloroplast")
ps <- subset_taxa(ps, Kingdom !="Eukaryota")
ps <- subset_taxa(ps, Kingdom !="NA")
get_taxa_unique(ps, "Family")
get_taxa_unique(ps, "Order")
get_taxa_unique(ps, "Kingdom")
ps
# Now export cleaned otu and taxa tables from phyloseq for future reference
otu = as(otu_table(ps), "matrix")
taxon = as(tax_table(ps), "matrix")
metadata = as(sample_data(ps), "matrix")
write.table(otu,"silva_nochloronomito_otu_table.txt",sep="\t",col.names=NA)
write.table(taxon,"silva_nochloronomito_taxa_table.txt",sep="\t",col.names=NA)
# export ASV table as relative abundance
ps_ra<-transform_sample_counts(ps, function(OTU) OTU/sum(OTU))
otu_ra = as(otu_table(ps_ra), "matrix")
write.table(otu_ra,"silva_nochloronomito_otu_table_RA.txt",sep="\t",col.names=NA)
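If you want to pick up downstream analyses later without rebuilding the phyloseq object, one option (an addition, not part of the original workflow; the file name is an example) is to save the cleaned object itself as an .rds file.
# Save the cleaned phyloseq object so later analyses can start from it directly
saveRDS(ps, "ps_nochloronomito.rds")
# reload later with: ps <- readRDS("ps_nochloronomito.rds")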