ANCOM.R - meyermicrobiolab/Meyer_Lab_Resources GitHub Wiki

ANCOM tests for differentially abundant microbial taxa, using a compositional data approach. You must first run the ANCOM function in this R script and then apply the function to your data set.

Establish ANCOM function

ANCOM is not in an R package, so we have to read the code into R before each use

#ANCOM Function - compare across multiple treatments groups using a compositional appproach
#https://sites.google.com/site/siddharthamandal1985/research

#Need to run this first in order to run ANCOM on data
ancom.W = function(otu_data,var_data,
                   adjusted,repeated,
                   main.var,adj.formula,
                   repeat.var,long,rand.formula,
                   multcorr,sig){

  n_otu=dim(otu_data)[2]-1

  otu_ids=colnames(otu_data)[-1]

  if(repeated==F){
    data_comp=data.frame(merge(otu_data,var_data,by="Sample.ID",all.y=T),row.names=NULL)
    #data_comp=data.frame(merge(otu_data,var_data[,c("Sample.ID",main.var)],by="Sample.ID",all.y=T),row.names=NULL)
  }else if(repeated==T){
    data_comp=data.frame(merge(otu_data,var_data,by="Sample.ID"),row.names=NULL)
    # data_comp=data.frame(merge(otu_data,var_data[,c("Sample.ID",main.var,repeat.var)],by="Sample.ID"),row.names=NULL)
  }

  base.formula = paste0("lr ~ ",main.var)
  if(repeated==T){
    repeat.formula = paste0(base.formula," | ", repeat.var)
  }
  if(adjusted==T){
    adjusted.formula = paste0(base.formula," + ", adj.formula)
  }

  if( adjusted == F & repeated == F ){
    fformula  <- formula(base.formula)
  } else if( adjusted == F & repeated == T & long == T ){
    fformula  <- formula(base.formula)
  }else if( adjusted == F & repeated == T & long == F ){
    fformula  <- formula(repeat.formula)
  }else if( adjusted == T & repeated == F  ){
    fformula  <- formula(adjusted.formula)
  }else if( adjusted == T & repeated == T  ){
    fformula  <- formula(adjusted.formula)
  }else{
    stop("Problem with data. Dataset should contain OTU abundances, groups,
         and optionally an ID for repeated measures.")
  }


  if( repeated==FALSE & adjusted == FALSE){
    if( length(unique(data_comp[,which(colnames(data_comp)==main.var)]))==2 ){
      tfun <- exactRankTests::wilcox.exact
    } else{
      tfun <- stats::kruskal.test
    }
  }else if( repeated==FALSE & adjusted == TRUE){
    tfun <- stats::aov
  }else if( repeated== TRUE & adjusted == FALSE & long == FALSE){
    tfun <- stats::friedman.test
  }else if( repeated== TRUE & adjusted == FALSE & long == TRUE){
    tfun <- nlme::lme
  }else if( repeated== TRUE & adjusted == TRUE){
    tfun <- nlme::lme
  }

  logratio.mat <- matrix(NA, nrow=n_otu, ncol=n_otu)
  for(ii in 1:(n_otu-1)){
    for(jj in (ii+1):n_otu){
      data.pair <- data_comp[,which(colnames(data_comp)%in%otu_ids[c(ii,jj)])]
      lr <- log((1+as.numeric(data.pair[,1]))/(1+as.numeric(data.pair[,2])))

      lr_dat <- data.frame( lr=lr, data_comp,row.names=NULL )

      if(adjusted==FALSE&repeated==FALSE){  ## Wilcox, Kruskal Wallis
        logratio.mat[ii,jj] <- tfun( formula=fformula, data = lr_dat)$p.value
      }else if(adjusted==FALSE&repeated==TRUE&long==FALSE){ ## Friedman's
        logratio.mat[ii,jj] <- tfun( formula=fformula, data = lr_dat)$p.value
      }else if(adjusted==TRUE&repeated==FALSE){ ## ANOVA
        model=tfun(formula=fformula, data = lr_dat,na.action=na.omit)
        picker=which(gsub(" ","",row.names(summary(model)[1](/meyermicrobiolab/Meyer_Lab_Resources/wiki/1)))==main.var)
        logratio.mat[ii,jj] <- summary(model)[1](/meyermicrobiolab/Meyer_Lab_Resources/wiki/1)["Pr(>F)"](/meyermicrobiolab/Meyer_Lab_Resources/wiki/"Pr(>F)")[picker]
      }else if(repeated==TRUE&long==TRUE){ ## GEE
        model=tfun(fixed=fformula,data = lr_dat,
                   random = formula(rand.formula),
                   correlation=corAR1(),
                   na.action=na.omit)
        picker=which(gsub(" ","",row.names(anova(model)))==main.var)
        logratio.mat[ii,jj] <- anova(model)["p-value"](/meyermicrobiolab/Meyer_Lab_Resources/wiki/"p-value")[picker]
      }
    }
  }

  ind <- lower.tri(logratio.mat)
  logratio.mat[ind] <- t(logratio.mat)[ind]

  logratio.mat[which(is.finite(logratio.mat)==FALSE)] <- 1

  mc.pval <- t(apply(logratio.mat,1,function(x){
    s <- p.adjust(x, method = "BH")
    return(s)
  }))

  a <- logratio.mat[upper.tri(logratio.mat,diag=FALSE)==TRUE]

  b <- matrix(0,ncol=n_otu,nrow=n_otu)
  b[upper.tri(b)==T] <- p.adjust(a, method = "BH")
  diag(b)  <- NA
  ind.1    <- lower.tri(b)
  b[ind.1] <- t(b)[ind.1]
 
  ### Conservative
  if(multcorr==1){
    W <- apply(b,1,function(x){
      subp <- length(which(x<sig))
    })
    ### Moderate
  } else if(multcorr==2){
    W <- apply(mc.pval,1,function(x){
      subp <- length(which(x<sig))
    })
    ### No correction
  } else if(multcorr==3){
    W <- apply(logratio.mat,1,function(x){
      subp <- length(which(x<sig))
    })
  }

  return(W)
  }


ANCOM.main = function(OTUdat,Vardat,
                      adjusted,repeated,
                      main.var,adj.formula,
                      repeat.var,longitudinal,
                      random.formula,
                      multcorr,sig,
                      prev.cut){

  p.zeroes=apply(OTUdat[,-1],2,function(x){
    s=length(which(x==0))/length(x)
  })

  OTUdat.thinned=OTUdat[,c(1,1+which(p.zeroes<prev.cut))]

  otu.names=colnames(OTUdat.thinned)[-1]

  W.detected   <- ancom.W(OTUdat.thinned,Vardat,
                          adjusted,repeated,
                          main.var,adj.formula,
                          repeat.var,longitudinal,random.formula,
                          multcorr,sig)

  W_stat       <- W.detected

  W_frame = data.frame(otu.names,W_stat,row.names=NULL)
  W_frame = W_frame[order(-W_frame$W_stat),]

  W_frame$detected_0.9=rep(FALSE,dim(W_frame)[1])
  W_frame$detected_0.8=rep(FALSE,dim(W_frame)[1])
  W_frame$detected_0.7=rep(FALSE,dim(W_frame)[1])
  W_frame$detected_0.6=rep(FALSE,dim(W_frame)[1])

  W_frame$detected_0.9[which(W_frame$W_stat>0.9*(dim(OTUdat.thinned[,-1])[2]-1))]=TRUE
  W_frame$detected_0.8[which(W_frame$W_stat>0.8*(dim(OTUdat.thinned[,-1])[2]-1))]=TRUE
  W_frame$detected_0.7[which(W_frame$W_stat>0.7*(dim(OTUdat.thinned[,-1])[2]-1))]=TRUE
  W_frame$detected_0.6[which(W_frame$W_stat>0.6*(dim(OTUdat.thinned[,-1])[2]-1))]=TRUE

  final_results=list(W_frame)
  names(final_results)=c("W.taxa")
  return(final_results)
}

The above script is also found here: ANCOM.R

Format data for ANCOM

In the following example, I am first aggragating my ASVs at the family level, so I will use ANCOM to test differentially abundant bacterial families. You can apply ANCOM at any level of classification that you want (ASV, genus, family), this will depend on your research questions.

# Read in unfiltered ASV table (only chloroplasts and mitochondria removed)
otu <- read.table("silva_nochloronomito_otu_table.txt",sep="\t",header=TRUE, row.names=1)
taxon <- read.table("silva_nochloronomito_taxa_table.txt",sep="\t",header=TRUE,row.names=1)
samples<-read.table("metadata.txt",sep="\t",header=T,row.names=1)

# Create phyloseq object from 3 data tables
OTU = otu_table(otu, taxa_are_rows=FALSE)
taxon<-as.matrix(taxon)
TAX = tax_table(taxon)
sampledata = sample_data(samples)
ps <- phyloseq(otu_table(otu, taxa_are_rows=FALSE), 
               sample_data(samples), 
               tax_table(taxon))
ps
get_taxa_unique(ps, "Family")

# Group the ASVs based on family
dat <- tax_glom(ps, taxrank = "Family")

# Melt the data, so it's like a dataframe
datm <- psmelt(dat)

# Cast the new datatable with columns that are of interest. This will need to be modified for your project, example below from Nicole's nursery _Acropora_ project
datc <- data.table::dcast(datm, Sample + Genotype + Colony ~ Family, value.var = 'Abundance', fun.aggregate = sum)

# Get dimensions of the table
dim(datc) 

# Adjust column selection according to number of families (557 families plus three columns of metadata=560)
otud <- datc[,c(1,4:560)] #select the first column, and then all of the taxa columns  
colnames(otud)[1] <- "Sample.ID" #rename the first column to Sample.ID - this is to match ANCOM syntax

metadat <- sample_data(ps) #get the sample data
metadat <- as.data.frame(as.matrix(metadat)) #make into into a matrix
# at this point, my sample id numbers are the row names, not a separate column, move row names to column with dplylr

metadat <- tibble::rownames_to_column(metadat, "Sample.ID") #make sure the sample names column is called Sample.ID

names(otud) <- make.names(names(otud)) #get the names from the table for 
otu_test <- otud #rename otud to otu_test, for syntax in ANCOM

metadat <- select(metadat, c("Sample.ID","Genotype","Colony")) # use select to only use treatment columns of interest
map_test <- metadat #rename map_TEst
Vardat <- map_test #specify that this for Vardat - ANCOM syntax

Perform ANCOM test

Finally, I will apply the ANCOM function to the prepped dataset

# Version 1 - with no adjustment for covariates

#### ANCOM test - not adjusted, more than 2 levels = Kruskal Wallis
comparison_test_treat=ANCOM.main(OTUdat=otu_test, #calling the OTU table
                                 Vardat=map_test, #calling the metadata
                                 adjusted=FALSE, #true if covariates are to be included for adjustment
                                 repeated=FALSE, #repeated measure
                                 main.var="Genotype", #main variable or fator
                                 adj.formula= NULL, #other factors to include
                                 repeat.var=FALSE, #repeated measure
                                 long = FALSE, #longitudinal study
                                 multcorr=2,
                                 sig=0.05, #significance level
                                 prev.cut=0.90) #OTUs with proportion of zeroes greater than prev.cut are not included in the analysis

res <- comparison_test_treat$W.taxa #taxa that significantly vary across factor level of interest
write.table(res,"ANCOM_family_KruskallWallis_Genotype.txt",sep="\t",col.names=NA)
res2 <- res[which(res$detected_0.7==TRUE),] 

# Version 2 - adjusted by coral genotype

#### ANCOM test - Adjusted by Coral colony, ANOVA
comparison_test_treat=ANCOM.main(OTUdat=otu_test, #calling the OTU table
                                 Vardat=map_test, #calling the metadata
                                 adjusted=TRUE, #true if covariates are to be included for adjustment
                                 repeated=FALSE, #repeated measure
                                 main.var="Genotype", #main variable or fator
                                 adj.formula= "Colony", #other factors to include
                                 repeat.var=FALSE, #repeated measure
                                 long = FALSE, #longitudinal study
                                 multcorr=2,
                                 sig=0.05, #significance level
                                 prev.cut=0.90) #OTUs with proportion of zeroes greater than prev.cut are not included in the analysis

res3 <- comparison_test_treat$W.taxa #taxa that significantly vary across factor level of interest
write.table(res3,"ANCOM_family_ANOVA_Genotype_adjColony.txt",sep="\t",col.names=NA)
res4 <- res[which(res3$detected_0.7==TRUE),] 

sig_sites <- glue::glue_collapse(droplevels(factor(res2$otu.names)), sep = ", ") #this is to get a list of the families that are different
# For plotting, I print out the family names so I can copy into column names
print(sig_sites)
#Alphaproteobacteria, Francisellaceae, Desulfobacteraceae, JGI_0000069.P22, Pirellulaceae, Pseudomonadaceae, Halomonadaceae, Woesearchaeia, Alteromonadaceae, Clade_III, Marinimicrobia_.SAR406_clade., Puniceicoccaceae, S25.593, Fusobacteriaceae, Xenococcaceae, Phycisphaeraceae, PB19

#Calculate relative abundance; remember to adjust number of columns for your dataset
datc_relabund <-  sweep(datc[,4:560], 1, rowSums(datc[,4:560]), '/')
datc_relnames <- cbind(datc[,1:3],datc_relabund)

#only select the significant families; note that for family names that include punctuation you need to enclose the name with quotation marks
sig_dis <- select(datc_relnames, Sample, Genotype, Colony, Alphaproteobacteria, Francisellaceae, Desulfobacteraceae, "JGI_0000069-P22", Pirellulaceae, Pseudomonadaceae, Halomonadaceae, Woesearchaeia, Clade_III, "Marinimicrobia_(SAR406_clade)", Puniceicoccaceae, "S25-593", Fusobacteriaceae, Xenococcaceae, Phycisphaeraceae, PB19)
sig_long <- melt(sig_dis, id.vars=c("Sample","Genotype","Colony"),variable.name="Family",value.name="Proportion")
sum_sig <- Rmisc::summarySE(sig_long, measurevar = "Proportion", groupvars = c("Genotype","Family"), na.rm=TRUE)

# here I am setting colors for my dataset
cols<-c("G"="#009E73","R"="#D55E00","Y"="#F0E442")
sum_sig$Genotype<-factor(sum_sig$Genotype, levels=c("G","R","Y"))

# example figure from ANCOM test results
###################### FIGURE 5 Differentially abundant families by coral genotype 
  
pdf("ANCOM_Families_Genotype.pdf",width=8.5)
fams <- ggplot(sum_sig, aes(x=Family, y=Proportion+0.001))+
  geom_point(size=4, aes(fill=Genotype,shape=Genotype))+
  scale_fill_manual(values=cols)+
  scale_shape_manual(values=c(21,22,24))+
  coord_flip()+
  theme_bw()+
  theme(axis.text.x=element_text(size=14))+
  theme(axis.text.y=element_text(size=14))+
  theme(axis.title.x=element_text(size=14))+
  theme(axis.title.y=element_text(size=14))+
  theme(legend.justification=c(1,1), legend.position=c(1,1))+
  geom_errorbar(aes(ymin=Proportion+0.001-se, ymax=Proportion+0.001+se), width=.1)+
  theme(legend.title = element_blank())+
  scale_y_log10(breaks = trans_breaks("log10", function(x) 10^x),labels = trans_format("log10", math_format(10^.x)))+
  theme(legend.text = element_text(size=12))
fams
dev.off()