Input files for the work below can be found and downloaded from here.
Load required libraries
library(dplyr); packageVersion("dplyr") # '0.8.5'
library(vegan); packageVersion("vegan") # '2.5.6'
library(RColorBrewer); packageVersion("RColorBrewer") # '1.1.2'
library(ggplot2); packageVersion("ggplot2") # '3.3.0'
library(readxl); packageVersion("readxl") # '1.3.1'
library(ggpubr); packageVersion("ggpubr") # '0.3.0'
library(knitr);packageVersion("knitr") # '1.28'
library(reshape2); packageVersion("reshape2") # provides melt() and dcast() used below
Load custom functions
# Custom function for melting bc distances
#
# Converts a distance object (or any matrix) to long format with one row per
# matrix cell, then optionally discards redundant cells.
#
# Arguments:
#   x             A matrix, or an object as.matrix() can convert (e.g. "dist").
#   only_identity Only consulted when omit_identity is TRUE. FALSE (default)
#                 drops the diagonal and everything below it, so each pair of
#                 items appears once; TRUE drops the diagonal only.
#   omit_identity If FALSE, the complete melted table is returned unfiltered.
#
# Returns the long-format data frame produced by reshape2::melt(x), minus the
# dropped rows. Original row names of the melted table are retained.
melt.dist <- function(x, only_identity = FALSE, omit_identity = TRUE) {
  if (!is.matrix(x)) {
    x <- as.matrix(x)
  }
  y <- reshape2::melt(x)
  if (omit_identity) {
    # melt() lists cells in column-major order, which is also the order in
    # which row(x)/col(x) flatten, so the mask lines up with the rows of y.
    if (only_identity == FALSE) {
      keep <- row(x) < col(x)
    } else {
      keep <- row(x) != col(x)
    }
    y <- y[as.vector(keep), ]
  }
  return(y)
}
# Custom function for importing report files
#
# Arguments:
#   x  Path to a tab-delimited report file.
#   n  Number of leading characters of the file name (directory removed) to
#      keep as the sample label.
#   h  TRUE if the first line of the file holds column names, FALSE otherwise.
#
# Returns the file contents as a data frame with an extra leading "sample"
# column holding the label.
my_read_txt <- function(x, n, h) {
  out <- read.delim(x, sep = "\t", quote = "", stringsAsFactors = FALSE, header = h)
  # basename() removes the directory; substr() keeps the first n characters
  sample <- substr(basename(x), 1, n)
  # Repeat the label once per row so a report without data rows yields an
  # empty table instead of a row-count mismatch error in cbind()
  cbind(sample = rep(sample, nrow(out)), out)
}
Input Zymo mmc data
# Reference table for the Zymo MMC, read from a tab-delimited file
zymo_mmc_levels <- read.delim(
  "zymo_mmc_logII.txt",
  sep = "\t",
  quote = "",
  stringsAsFactors = FALSE
)
# Copy carrying the generic column names, for combining with the melted
# per-tool tables down the line
zymo_pct <- setNames(zymo_mmc_levels, c("Var1", "Var2", "value"))
Input sample metadata
# Metadata table; its name, sample and type columns are looked up further down
meta <- read_excel(path = "pipeline_comp_meta.xlsx")
# Description of each sample, rendered as a table
sample_desc <- read_excel(path = "sample_desc.xlsx")
kable(x = sample_desc, caption = "Sample Details")
Sample Details
name |
desc |
trimmed_reads |
sample1 |
Zymo_mmc_05_1_undil |
4283096 |
sample2 |
Zymo_mmc_05_2_undil |
4327888 |
sample3 |
Zymo_mmc_05_1_undilcopy |
4283096 |
sample4 |
Zymo_mmc_06_1_diluted |
5270348 |
sample5 |
ML_even_hiseq_sim |
4998731 |
sample6 |
ML_even_perfect_sim |
4998731 |
sample7 |
ML_uneven_hiseq_sim |
4998734 |
sample8 |
ML_uneven_perfect_sim |
4998734 |
We have 4 samples which are from Zymo Model Microbial Communities (MMC).
Sample 1 and Sample 2 are two distinct MMC samples. Each is a known mix of 10 species of bacteria and fungi, so for these samples we know exactly which species to expect.
Sample 3 is a technical replicate of sample 1; we wanted to see how well the software performs with two exact replicas.
Sample 4 is also an MMC sample, but it has been diluted with water, which introduces some contaminants, so we expect to find more than just the 10 listed species there.
Samples 5-8 are synthetic metagenomes.
These samples were artificially created by cutting genomes into pieces computationally. We know exactly what species, and how much of each, are present.
Samples 5 & 6 have an even distribution of the constituent species.
Samples 7 and 8 have a staggered distribution.
Samples 6 & 8 are "perfect" samples, while 5 & 7 are made to replicate what sequencing errors might look like.
Read synthetic metagenome sample (sample 5:8) original data
# Input synthetic reads: table used to look up a species name per accession
syn.tax <- read.delim("target-genome-info.tsv", sep = "\t", quote = "", stringsAsFactors = FALSE)
# List all .tsv files in the synthetic_samples directory.
# list.files() takes a regular expression, not a shell glob, so match the
# literal extension at the end of the file name.
syn_files <- list.files(path = "synthetic_samples", pattern = "\\.tsv$", full.names = TRUE)
# Load tables & clean columns
# Apply the custom txt reading function to each file (label = first 8
# characters of the file name, no header row), and bind the list by row
syn.m <- bind_rows(lapply(syn_files, my_read_txt, 8, FALSE))
colnames(syn.m) <- c("sample", "accession", "pct")
# Remove the decorations as literal text; as a regex, ".fa" would also delete
# any other character that happens to be followed by "fa"
syn.m$accession <- gsub(".fa", "", syn.m$accession, fixed = TRUE)
syn.m$accession <- gsub("-trimmed", "", syn.m$accession, fixed = TRUE)
syn.m$species <- syn.tax$specific_name[match(syn.m$accession, syn.tax$accession)]
# Make the sample-by-species table of pct values for the synthetic samples
syn_pct <- dcast(syn.m, sample ~ species, sum, value.var = "pct")
row.names(syn_pct) <- syn_pct$sample
syn_pct$sample <- NULL
syn_pct[is.na(syn_pct)] <- 0
# dim(syn_pct) # 4 48
# Order by abundance
syn_pct <- syn_pct[, order(colMeans(syn_pct), decreasing = TRUE)]
# Display table
kable(syn_pct[, c(1:6)])
|
Alcaligenes faecalis |
Mandrillus leucophaeus |
Mesorhizobium australicum |
Sphingobacterium sp. 1.A.4 |
Staphylococcus epidermidis |
Ralstonia mannitolilytica |
even_err |
0.0333333 |
0.0166667 |
0.0166667 |
0.0166667 |
0.0333333 |
0.0333333 |
even_per |
0.0333333 |
0.0166667 |
0.0166667 |
0.0166667 |
0.0333333 |
0.0333333 |
stag_err |
0.0592251 |
0.0711611 |
0.0691178 |
0.0663078 |
0.0434145 |
0.0408736 |
stag_per |
0.0592251 |
0.0711611 |
0.0691178 |
0.0663078 |
0.0434145 |
0.0408736 |
Import Bracken (& Kraken2) results
# Input Kraken2/Bracken results
# List all .txt files in the bracken_out directory. list.files() takes a
# regular expression, not a shell glob, so match the literal extension at the
# end of the file name.
brep_files <- list.files(path = "bracken_out", pattern = "\\.txt$", full.names = TRUE)
# Apply the custom txt reading function to each file (label = first 7
# characters of the file name, header row present), and bind the list by row
brep.m <- bind_rows(lapply(brep_files, my_read_txt, 7, TRUE))
# Make species counts table for Bracken estimates
brep_counts <- dcast(brep.m, sample ~ name, sum, value.var = "new_est_reads")
row.names(brep_counts) <- brep_counts$sample
brep_counts$sample <- NULL
brep_counts[is.na(brep_counts)] <- 0
# Tag the rows with the tool so they stay distinct once all tools are combined
row.names(brep_counts) <- paste0(row.names(brep_counts), "_br")
dim(brep_counts) # 8 5366
## [1] 8 5366
# Convert to Relative Abundance
brep_pct <- decostand(brep_counts, method = "total")
# Order by abundance
brep_pct <- brep_pct[, order(colMeans(brep_pct), decreasing = TRUE)]
# Check data
kable(brep_pct[, c(1:6)])
|
Homo sapiens |
Lactobacillus fermentum |
Salmonella enterica |
Escherichia coli |
Pseudomonas aeruginosa |
Listeria monocytogenes |
sample1_br |
0.0722376 |
0.1643045 |
0.1354720 |
0.1187102 |
0.0623060 |
0.1010660 |
sample2_br |
0.0182907 |
0.1760192 |
0.1529426 |
0.1389259 |
0.0699399 |
0.1071486 |
sample3_br |
0.0722376 |
0.1643045 |
0.1354720 |
0.1187102 |
0.0623060 |
0.1010660 |
sample4_br |
0.0604164 |
0.1232946 |
0.1344693 |
0.1045271 |
0.0616600 |
0.0872228 |
sample5_br |
0.1191714 |
0.0000003 |
0.0000389 |
0.0000950 |
0.0652800 |
0.0000044 |
sample6_br |
0.1190907 |
0.0000000 |
0.0000227 |
0.0000901 |
0.0675425 |
0.0000047 |
sample7_br |
0.1685095 |
0.0000003 |
0.0000346 |
0.0000517 |
0.0153391 |
0.0000037 |
sample8_br |
0.1677246 |
0.0000000 |
0.0000187 |
0.0000540 |
0.0168036 |
0.0000029 |
Import Ganon results
# List all .txt files in the ganon_out directory. list.files() takes a regular
# expression, not a shell glob, so match the literal extension at the end of
# the file name.
gan_files <- list.files(path = "ganon_out", pattern = "\\.txt$", full.names = TRUE)
gan_files
## [1] "ganon_out/sample1_ganon_species.txt" "ganon_out/sample2_ganon_species.txt" "ganon_out/sample3_ganon_species.txt" "ganon_out/sample4_ganon_species.txt"
## [5] "ganon_out/sample5_ganon_species.txt" "ganon_out/sample6_ganon_species.txt" "ganon_out/sample7_ganon_species.txt" "ganon_out/sample8_ganon_species.txt"
# Read all files and add sample name column
# Apply the custom txt reading function to each file (label = first 7
# characters of the file name, no header row), and bind the list by row
gan.m <- bind_rows(lapply(gan_files, my_read_txt, 7, FALSE))
colnames(gan.m) <- c("sample", "rank", "target", "taxid_lineage", "target_scientific_name", "unique_assignments", "reads_assigned",
                     "cumulative_assignments", "cumulative_assignments_percent")
# Keep species-rank rows and the first six columns; %in% never returns NA, so
# a missing rank cannot turn into an all-NA row the way `==` would
gan.m <- gan.m[gan.m$rank %in% "species", c(1:6)]
gan.m$unique_assignments <- as.numeric(gan.m$unique_assignments)
# Make species counts table for Ganon unique assignments
gan_counts <- dcast(gan.m, sample ~ target_scientific_name, sum, value.var = "unique_assignments")
row.names(gan_counts) <- gan_counts$sample
gan_counts$sample <- NULL
gan_counts[is.na(gan_counts)] <- 0
# Tag the rows with the tool so they stay distinct once all tools are combined
row.names(gan_counts) <- paste0(row.names(gan_counts), "_gan")
dim(gan_counts) # 8 3450
## [1] 8 3450
# Convert to Relative Abundance
gan_pct <- decostand(gan_counts, method = "total")
# Order by abundance
gan_pct <- gan_pct[, order(colMeans(gan_pct), decreasing = TRUE)]
# Check data
kable(gan_pct[, c(1:6)])
|
Lactobacillus fermentum |
Listeria monocytogenes |
Salmonella enterica |
Enterococcus faecalis |
Alcaligenes faecalis |
Moraxella osloensis |
sample1_gan |
0.2881820 |
0.1743708 |
0.1580712 |
0.1140167 |
0.0002860 |
0.0000221 |
sample2_gan |
0.2856433 |
0.1708909 |
0.1693516 |
0.1051977 |
0.0003631 |
0.0001242 |
sample3_gan |
0.2881820 |
0.1743708 |
0.1580712 |
0.1140167 |
0.0002860 |
0.0000221 |
sample4_gan |
0.2259734 |
0.1576569 |
0.1609495 |
0.0971979 |
0.0005950 |
0.0000757 |
sample5_gan |
0.0000000 |
0.0000004 |
0.0000065 |
0.0000000 |
0.0690810 |
0.1055499 |
sample6_gan |
0.0000000 |
0.0000000 |
0.0000078 |
0.0000000 |
0.0691389 |
0.1056169 |
sample7_gan |
0.0000000 |
0.0000000 |
0.0000112 |
0.0000004 |
0.1194713 |
0.0675964 |
sample8_gan |
0.0000000 |
0.0000000 |
0.0000151 |
0.0000000 |
0.1195075 |
0.0676703 |
Import Centrifuge results
# List all .txt files in the cent_out directory. list.files() takes a regular
# expression, not a shell glob, so match the literal extension at the end of
# the file name.
cent_files <- list.files(path = "cent_out", pattern = "\\.txt$", full.names = TRUE)
cent_files
## [1] "cent_out/sample1_centrifuge_reformatted_out.txt" "cent_out/sample2_centrifuge_reformatted_out.txt" "cent_out/sample3_centrifuge_reformatted_out.txt"
## [4] "cent_out/sample4_centrifuge_reformatted_out.txt" "cent_out/sample5_centrifuge_reformatted_out.txt" "cent_out/sample6_centrifuge_reformatted_out.txt"
## [7] "cent_out/sample7_centrifuge_reformatted_out.txt" "cent_out/sample8_centrifuge_reformatted_out.txt"
# Read all files and add sample name column
# Apply the custom txt reading function to each file (label = first 7
# characters of the file name, no header row), and bind the list by row
cent.m <- bind_rows(lapply(cent_files, my_read_txt, 7, FALSE))
colnames(cent.m) <- c("sample", "percent_frag", "numb_frag_clade", "numb_frag_taxon", "tax_rank", "taxid", "name")
cent.m$name <- trimws(cent.m$name) # Remove leading/trailing whitespace
# Subset for species ranks; %in% never returns NA, so a missing rank cannot
# turn into an all-NA row the way `==` would
cent.m <- cent.m[cent.m$tax_rank %in% "S", ]
cent.m$numb_frag_taxon <- as.numeric(cent.m$numb_frag_taxon)
# Make species counts table for Centrifuge fragment counts
cent_counts <- dcast(cent.m, sample ~ name, sum, value.var = "numb_frag_taxon")
row.names(cent_counts) <- cent_counts$sample
cent_counts$sample <- NULL
cent_counts[is.na(cent_counts)] <- 0
# Tag the rows with the tool so they stay distinct once all tools are combined
row.names(cent_counts) <- paste0(row.names(cent_counts), "_cent")
dim(cent_counts) # 8 6119
## [1] 8 6119
# Convert to Relative Abundance
cent_pct <- decostand(cent_counts, method = "total")
# Order by abundance
cent_pct <- cent_pct[, order(colMeans(cent_pct), decreasing = TRUE)]
# Check data
kable(cent_pct[, c(1:6)])
|
Homo sapiens |
Lactobacillus fermentum |
Listeria monocytogenes |
Salmonella enterica |
Enterococcus faecalis |
Alcaligenes faecalis |
sample1_cent |
0.1224718 |
0.2622438 |
0.1608562 |
0.1163070 |
0.1052551 |
0.0003150 |
sample2_cent |
0.0344582 |
0.2962357 |
0.1789922 |
0.1405057 |
0.1105359 |
0.0004731 |
sample3_cent |
0.1224718 |
0.2622438 |
0.1608562 |
0.1163070 |
0.1052551 |
0.0003150 |
sample4_cent |
0.1030041 |
0.1931445 |
0.1366424 |
0.1097844 |
0.0842205 |
0.0007813 |
sample5_cent |
0.1531038 |
0.0000126 |
0.0000126 |
0.0000345 |
0.0000053 |
0.0533062 |
sample6_cent |
0.1534780 |
0.0000103 |
0.0000134 |
0.0000404 |
0.0000073 |
0.0535282 |
sample7_cent |
0.2284607 |
0.0000121 |
0.0000089 |
0.0000339 |
0.0000089 |
0.0993442 |
sample8_cent |
0.2280644 |
0.0000115 |
0.0000065 |
0.0000348 |
0.0000111 |
0.0995740 |
Combine relative abundance data from all samples for comparison
# Melt each wide abundance table to long format, stack them in the order
# synthetic, Bracken, Ganon, Centrifuge, and append the already-long Zymo table
all_samples_comp.m <- do.call(
  rbind,
  c(
    lapply(
      list(syn_pct, brep_pct, gan_pct, cent_pct),
      function(tbl) melt(as.matrix(tbl))
    ),
    list(zymo_pct)
  )
)
names(all_samples_comp.m) <- c("sample", "species", "relabs")
unique(all_samples_comp.m$sample) # 29 distinct sample names
## [1] even_err even_per stag_err stag_per sample1_br sample2_br sample3_br sample4_br sample5_br sample6_br sample7_br sample8_br
## [13] sample1_gan sample2_gan sample3_gan sample4_gan sample5_gan sample6_gan sample7_gan sample8_gan sample1_cent sample2_cent sample3_cent sample4_cent
## [25] sample5_cent sample6_cent sample7_cent sample8_cent zymo_mmc
## 29 Levels: even_err even_per stag_err stag_per sample1_br sample2_br sample3_br sample4_br sample5_br sample6_br sample7_br sample8_br ... zymo_mmc
Distance Based Analysis
# Rebuild one wide sample-by-species table of relative abundances from the
# combined long table
all_samples_comp_pct <- dcast(all_samples_comp.m, sample ~ species, sum, value.var = "relabs")
row.names(all_samples_comp_pct) <- all_samples_comp_pct$sample
all_samples_comp_pct$sample <- NULL
all_samples_comp_pct[is.na(all_samples_comp_pct)] <- 0
dim(all_samples_comp_pct) # 29 6779
## [1] 29 6779
# Calculate distance
all_samples.dist <- vegdist(all_samples_comp_pct, method = "bray")
all_samples.dist.m <- melt.dist(all_samples.dist)
colnames(all_samples.dist.m) <- c("samp1", "samp2", "dist")
# Add sample info
all_samples.dist.m$s1 <- meta$sample[match(all_samples.dist.m$samp1, meta$name)]
all_samples.dist.m$s2 <- meta$sample[match(all_samples.dist.m$samp2, meta$name)]
# Subset for same samples. which() discards comparisons that evaluate to NA
# (a name missing from meta) rather than returning them as all-NA rows.
same_samp_dist <- all_samples.dist.m[which(all_samples.dist.m$s1 == all_samples.dist.m$s2), ]
# Add type
same_samp_dist$type1 <- meta$type[match(same_samp_dist$samp1, meta$name)]
same_samp_dist$type2 <- meta$type[match(same_samp_dist$samp2, meta$name)]
# Subset synth (NA-safe for the same reason as above)
same_samp_dist_synth <- same_samp_dist[which(same_samp_dist$type1 == "synthetic"), ]
# Second "_"-separated token of samp1; vapply() always returns a character
# vector, even when no rows are left
same_samp_dist_synth$syn_type <- vapply(strsplit(as.character(same_samp_dist_synth$samp1), "_"), `[`, character(1), 2)
same_samp_dist_synth$syn_type <- gsub("err", "HiSeq Error", same_samp_dist_synth$syn_type)
same_samp_dist_synth$syn_type <- gsub("per", "Perfect", same_samp_dist_synth$syn_type)
# Add even info: first "_"-separated token of samp1
same_samp_dist_synth$even <- vapply(strsplit(as.character(same_samp_dist_synth$samp1), "_"), `[`, character(1), 1)
# Clean names
same_samp_dist_synth$even <- gsub("even", "Even Distribution", same_samp_dist_synth$even)
same_samp_dist_synth$even <- gsub("stag", "Staggered Distribution", same_samp_dist_synth$even)
# Set factor levels
same_samp_dist_synth$type2 <- factor(same_samp_dist_synth$type2, levels = c("Centrifuge", "Ganon", "K2-Bracken"))
Box plot of distances between expected and predicted microbial communities for Synthetic Metagenomes
# Box plot of the per-tool distances with the individual points jittered on top
# Seed the RNG before the jitter layer is created so point positions repeat
set.seed(1234)
ggplot(same_samp_dist_synth, aes(type2, dist)) +
  geom_boxplot(lwd = 0.5) +
  scale_y_continuous(limits = c(0.48, 0.72)) +
  geom_jitter(
    aes(fill = even),
    height = 0, width = 0.2, alpha = 0.6, pch = 21, stroke = 0.2, size = 5
  ) +
  stat_compare_means(label.x.npc = 0.4, col = "firebrick4") +
  labs(
    fill = "Syn. Metagenome Type",
    x = "",
    y = "Bray-Curtis Dissimilarities\n",
    title = "Accuracy Comparison: Synthetic Metagenomes\n"
  ) +
  theme_classic() +
  theme(
    axis.text.x = element_text(size = 12, face = "bold", color = "black"),
    legend.position = "right",
    panel.grid = element_blank(),
    axis.text.y = element_text(size = 10, face = "bold", color = "black"),
    plot.title = element_text(size = 14, face = "bold", hjust = 0.5),
    axis.title = element_text(size = 14, color = "black", face = "bold"),
    legend.key.size = unit(0.2, "cm")
  )
# Write the plot just built to an 8 x 6 inch PDF
ggsave(filename = "pipeline_accuracy_comp.pdf", width = 8, height = 6, units = "in")
Compare the 4 MMC Samples
# Sample 1, 2 and 3 are zymo MMC undiluted. 3 is replicate of 1.
# Keep the rows whose name maps to samples 1-4 in meta
sample1234_pct <- all_samples_comp_pct[row.names(all_samples_comp_pct) %in% meta$name[meta$sample %in% c(1:4)], ]
# Drop species absent from all of these rows. drop = FALSE keeps a data frame
# even if a single column survives any of the column subsets below.
sample1234_pct <- sample1234_pct[, colSums(sample1234_pct) > 0, drop = FALSE]
# Order species by abundance
sample1234_pct <- sample1234_pct[, order(colMeans(sample1234_pct), decreasing = TRUE), drop = FALSE]
# Zymo sp only; everything else is summed into a single "Others" column
is_zymo_sp <- colnames(sample1234_pct) %in% zymo_mmc_levels$species
sample1234_pct_zym <- sample1234_pct[, is_zymo_sp, drop = FALSE]
sample1234_pct_zym$Others <- rowSums(sample1234_pct[, !is_zymo_sp, drop = FALSE])
# Melt
sample1234_pct_zym.m <- melt(as.matrix(sample1234_pct_zym))
colnames(sample1234_pct_zym.m) <- c("name", "species", "pct")
# Fractions to percentages
sample1234_pct_zym.m$pct <- sample1234_pct_zym.m$pct * 100
# Add sample info
sample1234_pct_zym.m$sample <- meta$sample[match(sample1234_pct_zym.m$name, meta$name)]
# Add type info
sample1234_pct_zym.m$type <- meta$type[match(sample1234_pct_zym.m$name, meta$name)]
# Add sample details
sample1_4_desc <- read_excel("sample1_4_desc.xlsx")
sample1234_pct_zym.m$desc <- sample1_4_desc$details[match(sample1234_pct_zym.m$sample, sample1_4_desc$sample)]
# For labels: the "Others" rows only
sample1234_pct_zym.m_oth <- sample1234_pct_zym.m[sample1234_pct_zym.m$species == "Others", ]
Bar Plot of relative abundances of the MMC samples
# Palette generator interpolating the 12-colour "Paired" Brewer palette
posctrl_colors <- colorRampPalette(brewer.pal(12, "Paired"))
# Stacked bar plot of relative abundance, one facet per sample description.
# Every column except the last gets a palette colour; the last ("Others") is grey.
ggplot(sample1234_pct_zym.m, aes(type, pct)) +
  facet_grid(~desc, scales = "free_x", space = "free_x") +
  geom_bar(
    stat = "identity",
    aes(fill = species),
    lwd = 0.1, width = 0.8, color = "black"
  ) +
  scale_fill_manual(
    values = c(posctrl_colors(ncol(sample1234_pct_zym) - 1), "grey50")
  ) +
  labs(
    x = "",
    y = "Relative Abundance%\n",
    fill = "",
    title = "Taxonomic Comparison: MMC Samples\n"
  ) +
  scale_y_continuous(expand = c(0, 1)) +
  guides(fill = guide_legend(ncol = 1)) +
  theme_bw() +
  # Print the "Others" percentage at y = 2 of each bar
  geom_text(
    data = sample1234_pct_zym.m_oth,
    aes(label = paste0(round(pct, 1), "%"), y = 2),
    size = 3, color = "white"
  ) +
  theme(
    axis.text.x = element_text(
      size = 12, face = "bold", color = "black",
      angle = 90, hjust = 1, vjust = 0.5
    ),
    legend.position = "right",
    panel.grid = element_blank(),
    legend.text = element_text(face = "italic"),
    axis.text.y = element_text(size = 10, face = "bold", color = "black"),
    plot.title = element_text(size = 14, face = "bold", hjust = 0.5),
    panel.spacing = unit(1, "lines"),
    legend.key.size = unit(0.6, "cm"),
    axis.title = element_text(size = 14, color = "black", face = "bold")
  )
# Write the plot just built to a 12 x 6 inch PDF
ggsave(filename = "sample1234_relabs_sp_zym.pdf", width = 12, height = 6, units = "in")
Resource Usage
# Resource usage per tool, melted to long format (one row per tool and metric)
resource_comp <- read_excel("resource_comp.xlsx")
resource_comp.m <- melt(resource_comp)
## Using Tool as id variables
# One facet per metric, each with its own y scale. The legend is hidden with
# "none", the documented value; the previous "Na" is not a valid
# legend.position.
ggplot(resource_comp.m, aes(Tool, value)) +
  geom_bar(stat = "identity", position = "dodge", aes(fill = Tool), col = "black", lwd = 0.2) +
  facet_wrap(~variable, scales = "free_y") +
  labs(x = "", y = "", fill = "", title = "Resource Usage Comparison\n") +
  scale_y_continuous(expand = c(0.01, 0.01)) +
  theme_bw() +
  theme(
    axis.text.x = element_text(size = 12, face = "bold", color = "black", angle = 90, hjust = 1, vjust = 0.5),
    legend.position = "none",
    panel.grid = element_blank(),
    legend.text = element_text(face = "italic"),
    axis.text.y = element_text(size = 10, face = "bold", color = "black"),
    strip.text = element_text(size = 10, face = "bold", color = "black"),
    plot.title = element_text(size = 14, face = "bold", hjust = 0.5),
    axis.title = element_text(size = 14, color = "black", face = "bold")
  )
# Write the plot just built to a 12 x 6 inch PDF
ggsave(filename = "resource_use_comp.pdf", width = 12, height = 6, units = "in")