2022 Biol 320 Population genomic analyses with R - barrettlab/2021-Genomics-bootcamp GitHub Wiki
A great resource for population genomics in R by Tom Jenkins
Data file for Lonicera japonica in vcf format
Strata file for delimiting individuals, sites, and regions in .txt format
# install.packages(c("vcfR","poppr","adegenet","ape","mmod","magrittr","hierfstat","pegas","qvalue","ggplot2","plot3D","plotly","igraph","RColorBrewer","grid","R.devices","reshape2"))
library(vcfR); library(poppr); library(adegenet); library(ape); library(mmod);
library(magrittr); library(hierfstat); library(pegas); library(qvalue);
library(ggplot2); library(plot3D); library(plotly); library(igraph);
library(RColorBrewer); library(grid); library(R.devices); library(reshape2)
filtered_vcf <- read.vcfR("imputed.vcf")
### convert to genlight and genind formats so we can analyze in R/adegenet
my_genlight <- vcfR2genlight(filtered_vcf)
my_genind <- vcfR2genind(filtered_vcf)
pop_data <- read.table("ljstrata.txt", sep = "\t", header = FALSE)
### have a look at the pop data: this tells R how your samples are divided up into sites and counties/stataes
head(pop_data)
### C01A-1-1-WV-Mon-Urban urban monogalia-wv C01A-1-1-WV-Mon-Urban_urban_monogalia-wv
### C01A-1-2-WV-Mon-Urban urban monogalia-wv C01A-1-2-WV-Mon-Urban_urban_monogalia-wv
### C01A-1-3-WV-Mon-Urban urban monogalia-wv C01A-1-3-WV-Mon-Urban_urban_monogalia-wv
### C01A-1-4-WV-Mon-Urban urban monogalia-wv C01A-1-4-WV-Mon-Urban_urban_monogalia-wv
### C01A-1-5-WV-Mon-Urban urban monogalia-wv C01A-1-5-WV-Mon-Urban_urban_monogalia-wv
### C01A-1-6-WV-Mon-Urban urban monogalia-wv C01A-1-6-WV-Mon-Urban_urban_monogalia-wv
### C01A-2-10-WV-Mon-Natural natural monogalia-wv C01A-2-10-WV-Mon-Natural_natural_monogalia-wv
### C01A-2-11-WV-Mon-Natural natural monogalia-wv C01A-2-11-WV-Mon-Natural_natural_monogalia-wv
### C01A-2-12-WV-Mon-Natural natural monogalia-wv C01A-2-12-WV-Mon-Natural_natural_monogalia-wv
### C01A-2-7-WV-Mon-Natural natural monogalia-wv C01A-2-7-WV-Mon-Natural_natural_monogalia-wv
### C01A-2-8-WV-Mon-Natural natural monogalia-wv C01A-2-8-WV-Mon-Natural_natural_monogalia-wv
### C01A-2-9-WV-Mon-Natural natural monogalia-wv C01A-2-9-WV-Mon-Natural_natural_monogalia-wv
### make sure they are in the same order as in your SNP data
[email protected]
### [1] "C01A-1-1-WV-Mon-Urban_S1" "C01A-1-2-WV-Mon-Urban_S2"
### [3] "C01A-1-3-WV-Mon-Urban_S3" "C01A-1-4-WV-Mon-Urban_S4"
### [5] "C01A-1-5-WV-Mon-Urban_S5" "C01A-1-6-WV-Mon-Urban_S6"
### [7] "C01A-2-10-WV-Mon-Natural_S10" "C01A-2-11-WV-Mon-Natural_S11"
### [9] "C01A-2-12-WV-Mon-Natural_S12" "C01A-2-7-WV-Mon-Natural_S7"
### [11] "C01A-2-8-WV-Mon-Natural_S8" "C01A-2-9-WV-Mon-Natural_S9"
### [13] "C01B-1-1-WV_S13" "C01B-1-2-WV_S14"
15. Set ploidy to 2 (diploid and assign your groupings of interest for downstream analysis. Here, we are interested in 'sites'
ploidy(my_genlight) <- 2;
ploidy(my_genind) <- 2;
pop(my_genlight) <- pop_data$V2;
pop(my_genind) <- pop_data$V2
samplingstrata <- as.data.frame(pop_data$V4);
strata(my_genlight) <- samplingstrata;
strata(my_genind) <- samplingstrata;
splitStrata(my_genlight) <- ~individual/site/region;
splitStrata(my_genind) <- ~individual/site/region
# individual = the unique sample
# site = your sampling site (e.g. arboretumtop, arboretumbot, arboretummid)
# region = braod scale region (e.g. monongalia-wv, fayette-pa, shelby-tn)
# below, this will be useful in analyzing clonal populations?
my_snpclone <- poppr::as.snpclone(my_genlight)
my_genclone <- poppr::as.genclone(my_genind)
# Check out the my_genclone object
my_genclone
### This is a genclone object
### -------------------------
### Genotype information:
###
### 216 original multilocus genotypes
### 216 diploid individuals
### 220 codominant loci
###
### Population information:
###
### 3 strata - Individual, Locality, Region
### 37 populations defined - urban, natural, midlat, ..., littlefalls, starcity, uffington
#Check out the different sites across all the triad data and choose yours
popNames(my_genclone)
# output:
### [1] "urban" "natural" "midlat" "highlat"
### [5] "lowlat" "ripley" "fairmont" "coopersrock"
### [9] "smithfield" "mosondixon" "arboretumedge" "oktibbeha"
### [13] "arboretumtop" "arboretummid" "arboretumbot" "arboretumhill"
### [17] "arboretum" "wvulsblot" "cheatlake" "rockforge"
### [21] "edinburg" "roadside" "apts" "stillwater"
### [25] "ptmarion" "railtrail" "montgomery" "arboretumundist"
### [29] "railtraildisturbed" "overtonpark" "durhamnc" "alleghenypa"
### [33] "chisholm" "deckerscreek" "littlefalls" "starcity"
### [37] "uffington"
# The command to subset:
arb_gen_sub = popsub(my_genclone, sublist = c("arboretumbot","arboretummid","arboretumtop"))
arb_gen_sub
### This is a genclone object
### -------------------------
### Genotype information:
###
### 18 original multilocus genotypes
### 18 diploid individuals
### 220 codominant loci
###
### Population information:
###
### 3 strata - Individual, Locality, Region
### 3 populations defined - arboretumtop, arboretummid, arboretumbot
# number of alleles per locus
table(arb_gen_sub$loc.fac)
# sample size for each site
summary(arb_gen_sub$pop)
# allelic richness per site (total class data, then go find your sites)
allelic.richness(genind2hierfstat(my_genind))$Ar %>%
apply(MARGIN = 2, FUN = mean) %>%
round(digits = 3)
# observed heterozygosity
basic_arb = basic.stats(arb_gen_sub, diploid = TRUE)
Ho_arb = apply(basic_arb$Ho, MARGIN = 2, FUN = mean, na.rm = TRUE) %>%
round(digits = 2)
Ho_arb
# expected heterozygosity
He_arb = apply(basic_arb$Hs, MARGIN = 2, FUN = mean, na.rm = TRUE) %>%
round(digits = 2)
He_arb
Het_arb_df = data.frame(Site = names(Ho_arb), Ho = Ho_arb, He = He_arb) %>%
melt(id.vars = "Site")
custom_theme = theme(
axis.text.x = element_text(size = 20, angle = 90, vjust = 0.5, face = "bold"),
axis.text.y = element_text(size = 20),
axis.title.y = element_text(size = 20),
axis.title.x = element_blank(),
axis.line.y = element_line(size = 1),
legend.title = element_blank(),
legend.text = element_text(size = 20),
panel.grid = element_blank(),
panel.background = element_blank(),
plot.title = element_text(hjust = 0.5, size = 20, face="bold")
)
# Italic label
hetlab.o = expression(italic("H")[o])
hetlab.e = expression(italic("H")[e])
ggplot(data = Het_arb_df, aes(x = Site, y = value, fill = variable))+
geom_bar(stat = "identity", position = position_dodge(width = 0.6), colour = "black")+
scale_y_continuous(expand = c(0,0), limits = c(0,0.50))+
scale_fill_manual(values = c("dodgerblue", "red"), labels = c(hetlab.o, hetlab.e))+
ylab("Heterozygosity")+
ggtitle("Observed and expected heterozygosity for Lonicera japonica")+
custom_theme
# positive Fis = excess of homozygotes
# negative Fis = excess of heterozygotes
apply(basic_arb$Fis, MARGIN = 2, FUN = mean, na.rm = TRUE) %>%
round(digits = 3)
# 21A. Fst
arb_fst = genet.dist(arb_gen_sub, method = "WC84")
arb_fst
### What is the output? If you see negative numbers, that means Fst ~0
# 21B. AMOVA: how is the variation partitioned across sites, among individuals within sites, and within diploid individuals?
arb_AMOVA <- poppr.amova(my_genclone, ~region/site, within = TRUE)
arbsignif <- randtest(arb_AMOVA, nrepet = 999)
### Get various outputs
striata_AMOVA$results
striata_AMOVA$componentsofcovariance
striata_AMOVA$statphi
arbsignif
### plot the null distributions vs. observed values for each level of variation and their significance
plot(arbsignif)
# Variations within samples = variation within individuals
# Variations between samples = variation among individuals within sites
# Variations between sites = variation among sites
# Convert dist object to data.frame
fst.matrix = as.matrix(arb_fst)
ind = which( upper.tri(fst.matrix), arr.ind = TRUE)
fst.df = data.frame(Site1 = dimnames(fst.matrix)[[2]][ind[,2]],
Site2 = dimnames(fst.matrix)[[1]][ind[,1]],
Fst = fst.matrix[ ind ] %>% round(digits = 3))
# Convert minus values to zero
fst.df$Fst[fst.df$Fst < 0] = 0
# Print data.frame summary
fst.df %>% str
# Fst italic label
fst.label = expression(italic("F")[ST])
# Extract middle Fst value for gradient argument
mid = max(fst.df$Fst) / 2
# Plot heatmap
ggplot(data = fst.df, aes(x = Site1, y = Site2, fill = Fst))+
geom_tile(colour = "black")+
geom_text(aes(label = Fst), color="black", size = 10)+
scale_fill_gradient2(low = "dodgerblue", mid = "pink", high = "red", midpoint = mid, name = fst.label, limits = c(0, max(fst.df$Fst)), breaks = c(0, 0.05, 0.10, 0.15))+
scale_x_discrete(expand = c(0,0))+
scale_y_discrete(expand = c(0,0), position = "right")+
theme(axis.text = element_text(colour = "black", size = 20, face = "bold"),
axis.title = element_blank(),
panel.grid = element_blank(),
panel.background = element_blank(),
legend.position = "right",
legend.title = element_text(size = 20, face = "bold"),
legend.text = element_text(size = 20)
)
23. Conduct Discriminant Analysis of Principal Components (DAPC). This will tell you more about population structure and admixture proportions
# Perform cross validation to find the optimal number of PCs to retain in DAPC
set.seed(123)
x = tab(arb_gen_sub, NA.method = "mean")
crossval = xvalDapc(x, arb_gen_sub$pop, result = "groupMean", xval.plot = TRUE)
crossval$`Number of PCs Achieving Lowest MSE`
### This tells you that keeping 10 PCs is optimal. Think of it this way: how much of the total variation do you need to capture the optimal population structure without overfitting?
### Run the DAPC analysis
numPCs = as.numeric(crossval$`Number of PCs Achieving Lowest MSE`)
dapc1 = dapc(arb_gen_sub, arb_gen_sub$pop, n.pca = numPCs, n.da = 3)
### Here is where you plot the results
### You can change the colors to anything you want to (col = c())
### You can also change the point size (cex = )
scatter(dapc1, col=c("red","green","blue"), cex = 4)
### Voila!!!!
compoplot(dapc1, lab = rownames(x), col.pal = rainbow)
install.packages("remotes")
remotes::install_github("zkamvar/ggcompoplot")
library(ggcompoplot)
### The initial ancestry plot
compoplot(dapc1,col="rainbow")
### Organize by site
ggcompoplot(dapc1, arb_gen_sub,col=3) + theme(axis.text.x = element_blank())
# Voila!!
# install.packages(c("vcfR","poppr","adegenet","ape","mmod","magrittr","hierfstat","pegas","qvalue","ggplot2","plot3D","plotly","igraph","RColorBrewer","grid","R.devices","reshape2")) # Remove leading pound sign (#) to install, otherwise read as a comment
library(vcfR); library(poppr); library(adegenet); library(ape); library(mmod);
library(magrittr); library(hierfstat); library(pegas); library(qvalue);
library(ggplot2); library(plot3D); library(plotly); library(igraph);
library(RColorBrewer); library(grid); library(R.devices); library(reshape2)
filtered_vcf <- read.vcfR("lonicera_snps_tassel_2025.vcf")
pop_data <- read.table("lj_strata.txt", sep = "\t", header = FALSE)
my_genlight <- vcfR2genlight(filtered_vcf)
my_genind <- vcfR2genind(filtered_vcf)
ploidy(my_genlight) <- 2;
ploidy(my_genind) <- 2;
pop(my_genlight) <- pop_data$V2;
pop(my_genind) <- pop_data$V2
samplingstrata <- as.data.frame(pop_data$V3);
strata(my_genlight) <- samplingstrata;
strata(my_genind) <- samplingstrata;
splitStrata(my_genlight) <- ~individual/site;
splitStrata(my_genind) <- ~individual/site
my_snpclone <- poppr::as.snpclone(my_genlight)
my_genclone <- poppr::as.genclone(my_genind)
my_genclone
This is a genclone object
-------------------------
Genotype information:
364 original multilocus genotypes
364 diploid individuals
2185 codominant loci
Population information:
2 strata - individual, site
27 populations defined -
NC-Durham, MS-Okt, WV-Mon-Motel, ..., WV-Mon-GuthrieLoop, WV-Mon-UniversityAve, TN-Shelby
#Check out the different sites across all the triad data and choose yours
popNames(my_genclone)
# output:
[1] "NC-Durham" "MS-Okt" "WV-Mon-Motel" "WV-Mon-Granville-Is"
[5] "WV-Mon-Ackerman" "WV-Mon-Waterplant" "WV-Mon-CheatLake" "PA-Lawrence"
[9] "WV-Mon-MtValleyApts" "WV-Mon-LittleFalls" "WV-Mar-Glendale" "WV-Mon-WestRun"
[13] "PA-Wes-Greensburg" "WV-Mon-GrantSt" "WV-Mon-OutlookSt" "WV-Mon-RailTrail"
[17] "WV-Mon-KreppsPark" "WV-Mon-Sabraton" "WV-Mon-WestRunRd2" "WV-Mon-RailTrail2"
[21] "WV-Mon-Arboretum" "WV-Mon-RailTrail3" "WV-Mon-Fallingrun" "WV-Mon-SabratonRT"
[25] "WV-Mon-GuthrieLoop" "WV-Mon-UniversityAve" "TN-Shelby"
# The command to subset:
arb_gen_sub = popsub(my_genclone, sublist = c("NC-Durham", "MS-Okt", "WV-Mon-Motel"))
arb_gen_sub2 = popsub(my_snpclone, sublist = c("NC-Durham", "MS-Okt", "WV-Mon-Motel"))
# Literally just copy your population names, use quotes and commas!
# Now, check out your subsetted object:
arb_gen_sub
# Output:
This is a genclone object
-------------------------
Genotype information:
24 original multilocus genotypes
24 diploid individuals
2185 codominant loci
Population information:
2 strata - individual, site
3 populations defined - NC-Durham, MS-Okt, WV-Mon-Motel
# number of alleles per locus
table(arb_gen_sub$loc.fac)
# sample size for each site
summary(arb_gen_sub$pop)
# allelic richness per site (total class data, then go find your sites)
allelic.richness(genind2hierfstat(my_genind))$Ar %>%
apply(MARGIN = 2, FUN = mean) %>%
round(digits = 3)
# observed heterozygosity
basic_arb = basic.stats(arb_gen_sub, diploid = TRUE)
Ho_arb = apply(basic_arb$Ho, MARGIN = 2, FUN = mean, na.rm = TRUE) %>%
round(digits = 2)
Ho_arb
# expected heterozygosity
He_arb = apply(basic_arb$Hs, MARGIN = 2, FUN = mean, na.rm = TRUE) %>%
round(digits = 2)
He_arb
Het_arb_df = data.frame(Site = names(Ho_arb), Ho = Ho_arb, He = He_arb) %>%
melt(id.vars = "Site")
custom_theme = theme(
axis.text.x = element_text(size = 20, angle = 90, vjust = 0.5, face = "bold"),
axis.text.y = element_text(size = 20),
axis.title.y = element_text(size = 20),
axis.title.x = element_blank(),
axis.line.y = element_line(size = 1),
legend.title = element_blank(),
legend.text = element_text(size = 20),
panel.grid = element_blank(),
panel.background = element_blank(),
plot.title = element_text(hjust = 0.5, size = 20, face="bold")
)
# Italic label
hetlab.o = expression(italic("H")[o])
hetlab.e = expression(italic("H")[e])
ggplot(data = Het_arb_df, aes(x = Site, y = value, fill = variable))+
geom_bar(stat = "identity", position = position_dodge(width = 0.6), colour = "black")+
scale_y_continuous(expand = c(0,0), limits = c(0,0.50))+
scale_fill_manual(values = c("dodgerblue", "red"), labels = c(hetlab.o, hetlab.e))+
ylab("Heterozygosity")+
ggtitle("Observed and expected heterozygosity for Lonicera japonica")+
custom_theme
# positive Fis = excess of homozygotes
# negative Fis = excess of heterozygotes
apply(basic_arb$Fis, MARGIN = 2, FUN = mean, na.rm = TRUE) %>%
round(digits = 3)
# Calculate Weir & Cockeram (1984) metric of Fst, which measures population differentiation between all population pairs
arb_fst = genet.dist(arb_gen_sub, method = "WC84")
arb_fst
### What is the output? If you see negative numbers, that means Fst ~0
AMOVA: how is the variation partitioned across sites, among individuals within sites, and within diploid individuals?
arb_AMOVA <- poppr.amova(my_snpclone, ~ site) # For the whole class data
#### ****This step may take a long time, you could cut to 499 reps!
arbsignif <- randtest(arb_AMOVA, nrepet = 999)
### Get various outputs
arb_AMOVA$results
arb_AMOVA$componentsofcovariance
arb_AMOVA$statphi
arbsignif
#### **** Repeat this with arb_gen_sub2 instead of my_snpclone
arb_AMOVA2 <- poppr.amova(arb_gen_sub2 , ~ site) # For the your group data
arbsignif2 <- randtest(arb_AMOVA2, nrepet = 999)
### Get various outputs
arb_AMOVA2$results
arb_AMOVA2$componentsofcovariance
arb_AMOVA2$statphi
arbsignif2
### plot the null distributions vs. observed values for each level of variation and their significance
plot(arbsignif)
plot(arbsignif2)
# Variations within samples = variation within individuals
# Variations between samples = variation among individuals within sites
# Variations between sites = variation among sites
# Convert dist object to data.frame
fst.matrix = as.matrix(arb_fst)
ind = which( upper.tri(fst.matrix), arr.ind = TRUE)
fst.df = data.frame(Site1 = dimnames(fst.matrix)[[2]][ind[,2]],
Site2 = dimnames(fst.matrix)[[1]][ind[,1]],
Fst = fst.matrix[ ind ] %>% round(digits = 3))
# Convert minus values to zero
fst.df$Fst[fst.df$Fst < 0] = 0
# Print data.frame summary
fst.df %>% str
# Fst italic label
fst.label = expression(italic("F")[ST])
# Extract middle Fst value for gradient argument
mid = max(fst.df$Fst) / 2
# Plot heatmap
ggplot(data = fst.df, aes(x = Site1, y = Site2, fill = Fst))+
geom_tile(colour = "black")+
geom_text(aes(label = Fst), color="black", size = 10)+
scale_fill_gradient2(low = "dodgerblue", mid = "pink", high = "red", midpoint = mid, name = fst.label, limits = c(0, max(fst.df$Fst)), breaks = c(0, 0.05, 0.10, 0.15))+
scale_x_discrete(expand = c(0,0))+
scale_y_discrete(expand = c(0,0), position = "right")+
theme(axis.text = element_text(colour = "black", size = 20, face = "bold"),
axis.title = element_blank(),
panel.grid = element_blank(),
panel.background = element_blank(),
legend.position = "right",
legend.title = element_text(size = 20, face = "bold"),
legend.text = element_text(size = 20)
)
Conduct Discriminant Analysis of Principal Components (DAPC). This will tell you more about population structure and admixture proportions
# Perform cross validation to find the optimal number of PCs to retain in DAPC
set.seed(123)
x = tab(arb_gen_sub, NA.method = "mean")
crossval = xvalDapc(x, arb_gen_sub$pop, result = "groupMean", xval.plot = TRUE)
crossval$`Number of PCs Achieving Lowest MSE`
### This tells you that keeping 14 PCs is optimal (Your results will likely differ!). Think of it this way: how much of the total variation do you need to capture the optimal population structure without overfitting?
### Run the DAPC analysis
numPCs = as.numeric(crossval$`Number of PCs Achieving Lowest MSE`)
dapc1 = dapc(arb_gen_sub, arb_gen_sub$pop, n.pca = numPCs, n.da = 3)
### Here is where you plot the results
### You can change the colors to anything you want to (col = c())
### You can also change the point size (cex = )
scatter(dapc1, col=c("red","green","blue"), cex = 4)
### Voila!!!!
compoplot(dapc1, lab = rownames(x), col.pal = rainbow)
install.packages("remotes")
remotes::install_github("zkamvar/ggcompoplot")
library(ggcompoplot)
### The initial ancestry plot
compoplot(dapc1,col="rainbow")
### Organize by site
ggcompoplot(dapc1, arb_gen_sub,col=3) + theme(axis.text.x = element_blank())
# Voila!!
Really, we want the analysis to TELL us how many total ancestral genetic clusters there are, then plot these clusters, and plot ancestry proportions of the clusters for our individuals and populations.
# Find the optimal # of clusters
grp <- find.clusters(my_genclone, max.n.clust=8)
# Choose the optimal # of PCs to retain (press 200 and enter, where the cumulative variance hits a plateau)
# 200 then enter
# Now, enter the # of clusters that is optimal, where the BIC values is minimized (lowest). BIC = Bayesian Information Criterion.
# Press 5, or whatever the lowest values is in the plot
dapc1 <- dapc(my_genclone, grp$grp)
# Choose the optimal # of PCs to retain (again, press 200 and enter, where the cumulative variance hits a plateau)
# Then, choose to keep three Discriminant Functio axes (DFs) # press 3 then enter
# This next step plots the clusters along the first 2 DAPCs, but doesn't tell you anything about the population information!
scatter(dapc1)
# Now, let's plot ancestry coefficients just for you population
arb_inds <- indNames(arb_gen_sub) # names of individuals to include
# Extract the posterior (ancestry) probabilities for just those individuals
posterior_sub <- dapc1$posterior[arb_inds, ]
# Order individuals by population (optional)
ord <- order(arb_gen_sub$pop)
posterior_ord <- posterior_sub[ord, ]
labels_ord <- arb_gen_sub$pop[ord]
# Set up layout: 2 rows, first for legend (1/4 height), second for plot (3/4)
layout(matrix(c(1, 2), nrow = 2), heights = c(1, 4))
# Colors for clusters
cols <- viridis::viridis(ncol(posterior_ord))
# 1. Custom legend (top)
par(mar = c(0, 0, 1, 0)) # no margins
plot.new()
legend("center", legend = colnames(posterior_ord), fill = cols,
border = NA, bty = "n", horiz = TRUE, cex = 0.8)
# 2. compoplot (bottom)
par(mar = c(5, 4, 1, 2)) # reset margins
compoplot(posterior_ord,
col = cols,
show.lab = TRUE,
lab = labels_ord,
legend = FALSE) # turn off default legend
Voila!!!
###############################
###############################
# install.packages(c("ape","dplyr","ggplot2"))
# if (!require("BiocManager", quietly = TRUE))
# install.packages("BiocManager")
# BiocManager::install("ggtree")
# Load libraries
library(ape)
library(ggtree)
library(dplyr)
library(ggplot2)
# Step 1: Read in the tree
tree <- read.tree("Lonicera_FastTree.tre")
# Step 2: Read metadata (3 columns, no headers)
meta <- read.table("lj_strata.txt", header = FALSE, stringsAsFactors = FALSE)
colnames(meta) <- c("sample", "population", "group")
# Step 3: List unique populations
unique(meta$population)
# Step 4: define populations of interest (****change the names to your populations!****)
pops_of_interest <- c("NC-Durham", "MS-Okt", "TN-Shelby")
# Subset metadata to those
meta_sub <- meta %>% filter(population %in% pops_of_interest)
# Step 5: Join tip labels to population data
tree_data <- data.frame(label = tree$tip.label)
tree_data <- left_join(tree_data, meta_sub, by = c("label" = "sample"))
# Plot with colored tips for selected populations (****change the names to your populations and any colors you want!****)
ggtree(tree) %<+% tree_data + # pipe metadata to ggtree
geom_tiplab(aes(color = population), size = 2.5, show.legend = TRUE) +
scale_color_manual(values = c("NC-Durham" = "red", "MS-Okt" = "blue", "TN-Shelby" = "purple")) +
theme_tree2() +
ggtitle("Tree with Colored Tip Labels for Selected Populations")