ANCOM.R - meyermicrobiolab/Meyer_Lab_Resources GitHub Wiki
ANCOM tests for differentially abundant microbial taxa, using a compositional data approach. You must first run the ANCOM function in this R script and then apply the function to your data set.
Establish ANCOM function
ANCOM is not in an R package, so we have to read the code into R before each use
#ANCOM Function - compare across multiple treatments groups using a compositional appproach
#https://sites.google.com/site/siddharthamandal1985/research
#Need to run this first in order to run ANCOM on data
ancom.W = function(otu_data,var_data,
adjusted,repeated,
main.var,adj.formula,
repeat.var,long,rand.formula,
multcorr,sig){
n_otu=dim(otu_data)[2]-1
otu_ids=colnames(otu_data)[-1]
if(repeated==F){
data_comp=data.frame(merge(otu_data,var_data,by="Sample.ID",all.y=T),row.names=NULL)
#data_comp=data.frame(merge(otu_data,var_data[,c("Sample.ID",main.var)],by="Sample.ID",all.y=T),row.names=NULL)
}else if(repeated==T){
data_comp=data.frame(merge(otu_data,var_data,by="Sample.ID"),row.names=NULL)
# data_comp=data.frame(merge(otu_data,var_data[,c("Sample.ID",main.var,repeat.var)],by="Sample.ID"),row.names=NULL)
}
base.formula = paste0("lr ~ ",main.var)
if(repeated==T){
repeat.formula = paste0(base.formula," | ", repeat.var)
}
if(adjusted==T){
adjusted.formula = paste0(base.formula," + ", adj.formula)
}
if( adjusted == F & repeated == F ){
fformula <- formula(base.formula)
} else if( adjusted == F & repeated == T & long == T ){
fformula <- formula(base.formula)
}else if( adjusted == F & repeated == T & long == F ){
fformula <- formula(repeat.formula)
}else if( adjusted == T & repeated == F ){
fformula <- formula(adjusted.formula)
}else if( adjusted == T & repeated == T ){
fformula <- formula(adjusted.formula)
}else{
stop("Problem with data. Dataset should contain OTU abundances, groups,
and optionally an ID for repeated measures.")
}
if( repeated==FALSE & adjusted == FALSE){
if( length(unique(data_comp[,which(colnames(data_comp)==main.var)]))==2 ){
tfun <- exactRankTests::wilcox.exact
} else{
tfun <- stats::kruskal.test
}
}else if( repeated==FALSE & adjusted == TRUE){
tfun <- stats::aov
}else if( repeated== TRUE & adjusted == FALSE & long == FALSE){
tfun <- stats::friedman.test
}else if( repeated== TRUE & adjusted == FALSE & long == TRUE){
tfun <- nlme::lme
}else if( repeated== TRUE & adjusted == TRUE){
tfun <- nlme::lme
}
logratio.mat <- matrix(NA, nrow=n_otu, ncol=n_otu)
for(ii in 1:(n_otu-1)){
for(jj in (ii+1):n_otu){
data.pair <- data_comp[,which(colnames(data_comp)%in%otu_ids[c(ii,jj)])]
lr <- log((1+as.numeric(data.pair[,1]))/(1+as.numeric(data.pair[,2])))
lr_dat <- data.frame( lr=lr, data_comp,row.names=NULL )
if(adjusted==FALSE&repeated==FALSE){ ## Wilcox, Kruskal Wallis
logratio.mat[ii,jj] <- tfun( formula=fformula, data = lr_dat)$p.value
}else if(adjusted==FALSE&repeated==TRUE&long==FALSE){ ## Friedman's
logratio.mat[ii,jj] <- tfun( formula=fformula, data = lr_dat)$p.value
}else if(adjusted==TRUE&repeated==FALSE){ ## ANOVA
model=tfun(formula=fformula, data = lr_dat,na.action=na.omit)
picker=which(gsub(" ","",row.names(summary(model)[1](/meyermicrobiolab/Meyer_Lab_Resources/wiki/1)))==main.var)
logratio.mat[ii,jj] <- summary(model)[1](/meyermicrobiolab/Meyer_Lab_Resources/wiki/1)["Pr(>F)"](/meyermicrobiolab/Meyer_Lab_Resources/wiki/"Pr(>F)")[picker]
}else if(repeated==TRUE&long==TRUE){ ## GEE
model=tfun(fixed=fformula,data = lr_dat,
random = formula(rand.formula),
correlation=corAR1(),
na.action=na.omit)
picker=which(gsub(" ","",row.names(anova(model)))==main.var)
logratio.mat[ii,jj] <- anova(model)["p-value"](/meyermicrobiolab/Meyer_Lab_Resources/wiki/"p-value")[picker]
}
}
}
ind <- lower.tri(logratio.mat)
logratio.mat[ind] <- t(logratio.mat)[ind]
logratio.mat[which(is.finite(logratio.mat)==FALSE)] <- 1
mc.pval <- t(apply(logratio.mat,1,function(x){
s <- p.adjust(x, method = "BH")
return(s)
}))
a <- logratio.mat[upper.tri(logratio.mat,diag=FALSE)==TRUE]
b <- matrix(0,ncol=n_otu,nrow=n_otu)
b[upper.tri(b)==T] <- p.adjust(a, method = "BH")
diag(b) <- NA
ind.1 <- lower.tri(b)
b[ind.1] <- t(b)[ind.1]
### Conservative
if(multcorr==1){
W <- apply(b,1,function(x){
subp <- length(which(x<sig))
})
### Moderate
} else if(multcorr==2){
W <- apply(mc.pval,1,function(x){
subp <- length(which(x<sig))
})
### No correction
} else if(multcorr==3){
W <- apply(logratio.mat,1,function(x){
subp <- length(which(x<sig))
})
}
return(W)
}
ANCOM.main = function(OTUdat,Vardat,
adjusted,repeated,
main.var,adj.formula,
repeat.var,longitudinal,
random.formula,
multcorr,sig,
prev.cut){
p.zeroes=apply(OTUdat[,-1],2,function(x){
s=length(which(x==0))/length(x)
})
OTUdat.thinned=OTUdat[,c(1,1+which(p.zeroes<prev.cut))]
otu.names=colnames(OTUdat.thinned)[-1]
W.detected <- ancom.W(OTUdat.thinned,Vardat,
adjusted,repeated,
main.var,adj.formula,
repeat.var,longitudinal,random.formula,
multcorr,sig)
W_stat <- W.detected
W_frame = data.frame(otu.names,W_stat,row.names=NULL)
W_frame = W_frame[order(-W_frame$W_stat),]
W_frame$detected_0.9=rep(FALSE,dim(W_frame)[1])
W_frame$detected_0.8=rep(FALSE,dim(W_frame)[1])
W_frame$detected_0.7=rep(FALSE,dim(W_frame)[1])
W_frame$detected_0.6=rep(FALSE,dim(W_frame)[1])
W_frame$detected_0.9[which(W_frame$W_stat>0.9*(dim(OTUdat.thinned[,-1])[2]-1))]=TRUE
W_frame$detected_0.8[which(W_frame$W_stat>0.8*(dim(OTUdat.thinned[,-1])[2]-1))]=TRUE
W_frame$detected_0.7[which(W_frame$W_stat>0.7*(dim(OTUdat.thinned[,-1])[2]-1))]=TRUE
W_frame$detected_0.6[which(W_frame$W_stat>0.6*(dim(OTUdat.thinned[,-1])[2]-1))]=TRUE
final_results=list(W_frame)
names(final_results)=c("W.taxa")
return(final_results)
}
The above script is also found here: ANCOM.R
Format data for ANCOM
In the following example, I am first aggragating my ASVs at the family level, so I will use ANCOM to test differentially abundant bacterial families. You can apply ANCOM at any level of classification that you want (ASV, genus, family), this will depend on your research questions.
# Read in unfiltered ASV table (only chloroplasts and mitochondria removed)
otu <- read.table("silva_nochloronomito_otu_table.txt",sep="\t",header=TRUE, row.names=1)
taxon <- read.table("silva_nochloronomito_taxa_table.txt",sep="\t",header=TRUE,row.names=1)
samples<-read.table("metadata.txt",sep="\t",header=T,row.names=1)
# Create phyloseq object from 3 data tables
OTU = otu_table(otu, taxa_are_rows=FALSE)
taxon<-as.matrix(taxon)
TAX = tax_table(taxon)
sampledata = sample_data(samples)
ps <- phyloseq(otu_table(otu, taxa_are_rows=FALSE),
sample_data(samples),
tax_table(taxon))
ps
get_taxa_unique(ps, "Family")
# Group the ASVs based on family
dat <- tax_glom(ps, taxrank = "Family")
# Melt the data, so it's like a dataframe
datm <- psmelt(dat)
# Cast the new datatable with columns that are of interest. This will need to be modified for your project, example below from Nicole's nursery _Acropora_ project
datc <- data.table::dcast(datm, Sample + Genotype + Colony ~ Family, value.var = 'Abundance', fun.aggregate = sum)
# Get dimensions of the table
dim(datc)
# Adjust column selection according to number of families (557 families plus three columns of metadata=560)
otud <- datc[,c(1,4:560)] #select the first column, and then all of the taxa columns
colnames(otud)[1] <- "Sample.ID" #rename the first column to Sample.ID - this is to match ANCOM syntax
metadat <- sample_data(ps) #get the sample data
metadat <- as.data.frame(as.matrix(metadat)) #make into into a matrix
# at this point, my sample id numbers are the row names, not a separate column, move row names to column with dplylr
metadat <- tibble::rownames_to_column(metadat, "Sample.ID") #make sure the sample names column is called Sample.ID
names(otud) <- make.names(names(otud)) #get the names from the table for
otu_test <- otud #rename otud to otu_test, for syntax in ANCOM
metadat <- select(metadat, c("Sample.ID","Genotype","Colony")) # use select to only use treatment columns of interest
map_test <- metadat #rename map_TEst
Vardat <- map_test #specify that this for Vardat - ANCOM syntax
Perform ANCOM test
Finally, I will apply the ANCOM function to the prepped dataset
# Version 1 - with no adjustment for covariates
#### ANCOM test - not adjusted, more than 2 levels = Kruskal Wallis
comparison_test_treat=ANCOM.main(OTUdat=otu_test, #calling the OTU table
Vardat=map_test, #calling the metadata
adjusted=FALSE, #true if covariates are to be included for adjustment
repeated=FALSE, #repeated measure
main.var="Genotype", #main variable or fator
adj.formula= NULL, #other factors to include
repeat.var=FALSE, #repeated measure
long = FALSE, #longitudinal study
multcorr=2,
sig=0.05, #significance level
prev.cut=0.90) #OTUs with proportion of zeroes greater than prev.cut are not included in the analysis
res <- comparison_test_treat$W.taxa #taxa that significantly vary across factor level of interest
write.table(res,"ANCOM_family_KruskallWallis_Genotype.txt",sep="\t",col.names=NA)
res2 <- res[which(res$detected_0.7==TRUE),]
# Version 2 - adjusted by coral genotype
#### ANCOM test - Adjusted by Coral colony, ANOVA
comparison_test_treat=ANCOM.main(OTUdat=otu_test, #calling the OTU table
Vardat=map_test, #calling the metadata
adjusted=TRUE, #true if covariates are to be included for adjustment
repeated=FALSE, #repeated measure
main.var="Genotype", #main variable or fator
adj.formula= "Colony", #other factors to include
repeat.var=FALSE, #repeated measure
long = FALSE, #longitudinal study
multcorr=2,
sig=0.05, #significance level
prev.cut=0.90) #OTUs with proportion of zeroes greater than prev.cut are not included in the analysis
res3 <- comparison_test_treat$W.taxa #taxa that significantly vary across factor level of interest
write.table(res3,"ANCOM_family_ANOVA_Genotype_adjColony.txt",sep="\t",col.names=NA)
res4 <- res[which(res3$detected_0.7==TRUE),]
sig_sites <- glue::glue_collapse(droplevels(factor(res2$otu.names)), sep = ", ") #this is to get a list of the families that are different
# For plotting, I print out the family names so I can copy into column names
print(sig_sites)
#Alphaproteobacteria, Francisellaceae, Desulfobacteraceae, JGI_0000069.P22, Pirellulaceae, Pseudomonadaceae, Halomonadaceae, Woesearchaeia, Alteromonadaceae, Clade_III, Marinimicrobia_.SAR406_clade., Puniceicoccaceae, S25.593, Fusobacteriaceae, Xenococcaceae, Phycisphaeraceae, PB19
#Calculate relative abundance; remember to adjust number of columns for your dataset
datc_relabund <- sweep(datc[,4:560], 1, rowSums(datc[,4:560]), '/')
datc_relnames <- cbind(datc[,1:3],datc_relabund)
#only select the significant families; note that for family names that include punctuation you need to enclose the name with quotation marks
sig_dis <- select(datc_relnames, Sample, Genotype, Colony, Alphaproteobacteria, Francisellaceae, Desulfobacteraceae, "JGI_0000069-P22", Pirellulaceae, Pseudomonadaceae, Halomonadaceae, Woesearchaeia, Clade_III, "Marinimicrobia_(SAR406_clade)", Puniceicoccaceae, "S25-593", Fusobacteriaceae, Xenococcaceae, Phycisphaeraceae, PB19)
sig_long <- melt(sig_dis, id.vars=c("Sample","Genotype","Colony"),variable.name="Family",value.name="Proportion")
sum_sig <- Rmisc::summarySE(sig_long, measurevar = "Proportion", groupvars = c("Genotype","Family"), na.rm=TRUE)
# here I am setting colors for my dataset
cols<-c("G"="#009E73","R"="#D55E00","Y"="#F0E442")
sum_sig$Genotype<-factor(sum_sig$Genotype, levels=c("G","R","Y"))
# example figure from ANCOM test results
###################### FIGURE 5 Differentially abundant families by coral genotype
pdf("ANCOM_Families_Genotype.pdf",width=8.5)
fams <- ggplot(sum_sig, aes(x=Family, y=Proportion+0.001))+
geom_point(size=4, aes(fill=Genotype,shape=Genotype))+
scale_fill_manual(values=cols)+
scale_shape_manual(values=c(21,22,24))+
coord_flip()+
theme_bw()+
theme(axis.text.x=element_text(size=14))+
theme(axis.text.y=element_text(size=14))+
theme(axis.title.x=element_text(size=14))+
theme(axis.title.y=element_text(size=14))+
theme(legend.justification=c(1,1), legend.position=c(1,1))+
geom_errorbar(aes(ymin=Proportion+0.001-se, ymax=Proportion+0.001+se), width=.1)+
theme(legend.title = element_blank())+
scale_y_log10(breaks = trans_breaks("log10", function(x) 10^x),labels = trans_format("log10", math_format(10^.x)))+
theme(legend.text = element_text(size=12))
fams
dev.off()