2022 Biol 320 Population genomic analyses with R - barrettlab/2021-Genomics-bootcamp GitHub Wiki

Analyses in R: population genomics

A great resource for population genomics in R by Tom Jenkins

Data file for Lonicera japonica in vcf format

Strata file for delimiting individuals, sites, and regions in .txt format

12. Load libraries

# install.packages(c("vcfR","poppr","adegenet","ape","mmod","magrittr","hierfstat","pegas","qvalue","ggplot2","plot3D","plotly","igraph","RColorBrewer","grid","R.devices","reshape2"))

library(vcfR); library(poppr); library(adegenet); library(ape); library(mmod); 
library(magrittr); library(hierfstat); library(pegas); library(qvalue); 
library(ggplot2); library(plot3D); library(plotly); library(igraph);
library(RColorBrewer); library(grid); library(R.devices); library(reshape2)

13. Read in your variants file in vcf format

filtered_vcf <- read.vcfR("imputed.vcf")

### convert to genlight and genind formats so we can analyze in R/adegenet

my_genlight <- vcfR2genlight(filtered_vcf)
my_genind <- vcfR2genind(filtered_vcf)

14. Read in the population strata file

pop_data <- read.table("ljstrata.txt", sep = "\t", header = FALSE)

### have a look at the pop data: this tells R how your samples are divided up into sites and counties/stataes
head(pop_data)

### C01A-1-1-WV-Mon-Urban	urban	monogalia-wv	C01A-1-1-WV-Mon-Urban_urban_monogalia-wv
### C01A-1-2-WV-Mon-Urban	urban	monogalia-wv	C01A-1-2-WV-Mon-Urban_urban_monogalia-wv
### C01A-1-3-WV-Mon-Urban	urban	monogalia-wv	C01A-1-3-WV-Mon-Urban_urban_monogalia-wv
### C01A-1-4-WV-Mon-Urban	urban	monogalia-wv	C01A-1-4-WV-Mon-Urban_urban_monogalia-wv
### C01A-1-5-WV-Mon-Urban	urban	monogalia-wv	C01A-1-5-WV-Mon-Urban_urban_monogalia-wv
### C01A-1-6-WV-Mon-Urban	urban	monogalia-wv	C01A-1-6-WV-Mon-Urban_urban_monogalia-wv
### C01A-2-10-WV-Mon-Natural	natural	monogalia-wv	C01A-2-10-WV-Mon-Natural_natural_monogalia-wv
### C01A-2-11-WV-Mon-Natural	natural	monogalia-wv	C01A-2-11-WV-Mon-Natural_natural_monogalia-wv
### C01A-2-12-WV-Mon-Natural	natural	monogalia-wv	C01A-2-12-WV-Mon-Natural_natural_monogalia-wv
### C01A-2-7-WV-Mon-Natural	natural	monogalia-wv	C01A-2-7-WV-Mon-Natural_natural_monogalia-wv
### C01A-2-8-WV-Mon-Natural	natural	monogalia-wv	C01A-2-8-WV-Mon-Natural_natural_monogalia-wv
### C01A-2-9-WV-Mon-Natural	natural	monogalia-wv	C01A-2-9-WV-Mon-Natural_natural_monogalia-wv


### make sure they are in the same order as in your SNP data
[email protected]

###   [1] "C01A-1-1-WV-Mon-Urban_S1"                   "C01A-1-2-WV-Mon-Urban_S2"                  
###   [3] "C01A-1-3-WV-Mon-Urban_S3"                   "C01A-1-4-WV-Mon-Urban_S4"                  
###   [5] "C01A-1-5-WV-Mon-Urban_S5"                   "C01A-1-6-WV-Mon-Urban_S6"                  
###   [7] "C01A-2-10-WV-Mon-Natural_S10"               "C01A-2-11-WV-Mon-Natural_S11"              
###   [9] "C01A-2-12-WV-Mon-Natural_S12"               "C01A-2-7-WV-Mon-Natural_S7"                
###  [11] "C01A-2-8-WV-Mon-Natural_S8"                 "C01A-2-9-WV-Mon-Natural_S9"                
###  [13] "C01B-1-1-WV_S13"                            "C01B-1-2-WV_S14"           

15. Set ploidy to 2 (diploid and assign your groupings of interest for downstream analysis. Here, we are interested in 'sites'

ploidy(my_genlight) <- 2; 
ploidy(my_genind) <- 2;
pop(my_genlight) <- pop_data$V2;
pop(my_genind) <- pop_data$V2

16. Apply the sampling strata and name the different hierarchical categories

samplingstrata <- as.data.frame(pop_data$V4);
strata(my_genlight) <- samplingstrata;
strata(my_genind) <- samplingstrata;
splitStrata(my_genlight) <- ~individual/site/region;
splitStrata(my_genind) <- ~individual/site/region

# individual = the unique sample
# site = your sampling site (e.g. arboretumtop, arboretumbot, arboretummid)
# region = braod scale region (e.g. monongalia-wv, fayette-pa, shelby-tn) 

# below, this will be useful in analyzing clonal populations?
my_snpclone <- poppr::as.snpclone(my_genlight)
my_genclone <- poppr::as.genclone(my_genind)

# Check out the my_genclone object

my_genclone


###   This is a genclone object
###   -------------------------
###   Genotype information:
###   
###      216 original multilocus genotypes 
###      216 diploid individuals
###      220 codominant loci
###   
###   Population information:
###   
###        3 strata - Individual, Locality, Region
###       37 populations defined - urban, natural, midlat, ..., littlefalls, starcity, uffington

17. Subset your data so only your sites are included:


#Check out the different sites across all the triad data and choose yours

popNames(my_genclone)

# output:

###  [1] "urban"              "natural"            "midlat"             "highlat"           
###  [5] "lowlat"             "ripley"             "fairmont"           "coopersrock"       
###  [9] "smithfield"         "mosondixon"         "arboretumedge"      "oktibbeha"         
### [13] "arboretumtop"       "arboretummid"       "arboretumbot"       "arboretumhill"     
### [17] "arboretum"          "wvulsblot"          "cheatlake"          "rockforge"         
### [21] "edinburg"           "roadside"           "apts"               "stillwater"        
### [25] "ptmarion"           "railtrail"          "montgomery"         "arboretumundist"   
### [29] "railtraildisturbed" "overtonpark"        "durhamnc"           "alleghenypa"       
### [33] "chisholm"           "deckerscreek"       "littlefalls"        "starcity"          
### [37] "uffington"  


# The command to subset:

arb_gen_sub = popsub(my_genclone, sublist = c("arboretumbot","arboretummid","arboretumtop"))

arb_gen_sub

### This is a genclone object
### -------------------------
### Genotype information:
### 
###     18 original multilocus genotypes 
###     18 diploid individuals
###    220 codominant loci
### 
### Population information:
### 
###      3 strata - Individual, Locality, Region
###      3 populations defined - arboretumtop, arboretummid, arboretumbot

18. Get some basic stats on your subsampled data:


# number of alleles per locus
table(arb_gen_sub$loc.fac)

# sample size for each site
summary(arb_gen_sub$pop)

# allelic richness per site (total class data, then go find your sites)

allelic.richness(genind2hierfstat(my_genind))$Ar %>%
  apply(MARGIN = 2, FUN = mean) %>% 
  round(digits = 3)

# observed heterozygosity

basic_arb = basic.stats(arb_gen_sub, diploid = TRUE)
Ho_arb = apply(basic_arb$Ho, MARGIN = 2, FUN = mean, na.rm = TRUE) %>%
  round(digits = 2)
Ho_arb

# expected heterozygosity
He_arb = apply(basic_arb$Hs, MARGIN = 2, FUN = mean, na.rm = TRUE) %>%
  round(digits = 2)
He_arb

19. Make a nice figure comparing Ho and He across sites:

Het_arb_df = data.frame(Site = names(Ho_arb), Ho = Ho_arb, He = He_arb) %>%
  melt(id.vars = "Site")

custom_theme = theme(
  axis.text.x = element_text(size = 20, angle = 90, vjust = 0.5, face = "bold"),
  axis.text.y = element_text(size = 20),
  axis.title.y = element_text(size = 20),
  axis.title.x = element_blank(),
  axis.line.y = element_line(size = 1),
  legend.title = element_blank(),
  legend.text = element_text(size = 20),
  panel.grid = element_blank(),
  panel.background = element_blank(),
  plot.title = element_text(hjust = 0.5, size = 20, face="bold")
  )

# Italic label
hetlab.o = expression(italic("H")[o])
hetlab.e = expression(italic("H")[e])

ggplot(data = Het_arb_df, aes(x = Site, y = value, fill = variable))+
  geom_bar(stat = "identity", position = position_dodge(width = 0.6), colour = "black")+
  scale_y_continuous(expand = c(0,0), limits = c(0,0.50))+
  scale_fill_manual(values = c("dodgerblue", "red"), labels = c(hetlab.o, hetlab.e))+
  ylab("Heterozygosity")+
  ggtitle("Observed and expected heterozygosity for Lonicera japonica")+
  custom_theme

20. Calculate mean Fis across all sites


# positive Fis = excess of homozygotes
# negative Fis = excess of heterozygotes

apply(basic_arb$Fis, MARGIN = 2, FUN = mean, na.rm = TRUE) %>%
  round(digits = 3)

21. Run a pairwise Fst analysis and Analysis of Molecular Variance (AMOVA):


# 21A. Fst

arb_fst = genet.dist(arb_gen_sub, method = "WC84")

arb_fst

### What is the output? If you see negative numbers, that means Fst ~0

# 21B. AMOVA: how is the variation partitioned across sites, among individuals within sites, and within diploid individuals?

arb_AMOVA <- poppr.amova(my_genclone, ~region/site, within = TRUE)
arbsignif <- randtest(arb_AMOVA, nrepet = 999)

### Get various outputs
striata_AMOVA$results
striata_AMOVA$componentsofcovariance
striata_AMOVA$statphi
arbsignif

### plot the null distributions vs. observed values for each level of variation and their significance

plot(arbsignif)

# Variations within samples   = variation within individuals
# Variations between samples  = variation among individuals within sites
# Variations between sites    = variation among sites


22. If you have > 2 sites to compare, you can make a nice figure:

# Convert dist object to data.frame
fst.matrix = as.matrix(arb_fst)
ind = which( upper.tri(fst.matrix), arr.ind = TRUE)
fst.df = data.frame(Site1 = dimnames(fst.matrix)[[2]][ind[,2]],
                    Site2 = dimnames(fst.matrix)[[1]][ind[,1]],
                    Fst = fst.matrix[ ind ] %>% round(digits = 3))

# Convert minus values to zero
fst.df$Fst[fst.df$Fst < 0] = 0

# Print data.frame summary
fst.df %>% str

# Fst italic label
fst.label = expression(italic("F")[ST])

# Extract middle Fst value for gradient argument
mid = max(fst.df$Fst) / 2

# Plot heatmap
ggplot(data = fst.df, aes(x = Site1, y = Site2, fill = Fst))+
  geom_tile(colour = "black")+
  geom_text(aes(label = Fst), color="black", size = 10)+
  scale_fill_gradient2(low = "dodgerblue", mid = "pink", high = "red", midpoint = mid, name = fst.label, limits = c(0, max(fst.df$Fst)), breaks = c(0, 0.05, 0.10, 0.15))+
  scale_x_discrete(expand = c(0,0))+
  scale_y_discrete(expand = c(0,0), position = "right")+
  theme(axis.text = element_text(colour = "black", size = 20, face = "bold"),
        axis.title = element_blank(),
        panel.grid = element_blank(),
        panel.background = element_blank(),
        legend.position = "right",
        legend.title = element_text(size = 20, face = "bold"),
        legend.text = element_text(size = 20)
        )

23. Conduct Discriminant Analysis of Principal Components (DAPC). This will tell you more about population structure and admixture proportions


# Perform cross validation to find the optimal number of PCs to retain in DAPC
set.seed(123)
x = tab(arb_gen_sub, NA.method = "mean")
crossval = xvalDapc(x, arb_gen_sub$pop, result = "groupMean", xval.plot = TRUE)
crossval$`Number of PCs Achieving Lowest MSE`

### This tells you that keeping 10 PCs is optimal. Think of it this way: how much of the total variation do you need to capture the optimal population structure without overfitting?

24. Visualize the DAPC plot (scatter)


### Run the DAPC analysis

numPCs = as.numeric(crossval$`Number of PCs Achieving Lowest MSE`)
dapc1 = dapc(arb_gen_sub, arb_gen_sub$pop, n.pca = numPCs, n.da = 3)

### Here is where you plot the results
### You can change the colors to anything you want to (col = c())
### You can also change the point size (cex = )

scatter(dapc1, col=c("red","green","blue"), cex = 4)

### Voila!!!!

25. Now, let's plot the DAPC ancestry coefficients (structure-like plot)


compoplot(dapc1,  lab = rownames(x),  col.pal = rainbow)

26. A nicer way to present the plot with ggcompoplot


install.packages("remotes")
remotes::install_github("zkamvar/ggcompoplot")
library(ggcompoplot)

### The initial ancestry plot
compoplot(dapc1,col="rainbow")

### Organize by site
ggcompoplot(dapc1, arb_gen_sub,col=3) + theme(axis.text.x = element_blank())

# Voila!!

START 2025 Instructions HERE!!!!!

Load libraries

# install.packages(c("vcfR","poppr","adegenet","ape","mmod","magrittr","hierfstat","pegas","qvalue","ggplot2","plot3D","plotly","igraph","RColorBrewer","grid","R.devices","reshape2"))    # Remove leading pound sign (#) to install, otherwise read as a comment

library(vcfR); library(poppr); library(adegenet); library(ape); library(mmod); 
library(magrittr); library(hierfstat); library(pegas); library(qvalue); 
library(ggplot2); library(plot3D); library(plotly); library(igraph);
library(RColorBrewer); library(grid); library(R.devices); library(reshape2)

Read in data, metadata, set strata

filtered_vcf <- read.vcfR("lonicera_snps_tassel_2025.vcf")

pop_data <- read.table("lj_strata.txt", sep = "\t", header = FALSE)

my_genlight <- vcfR2genlight(filtered_vcf)
my_genind <- vcfR2genind(filtered_vcf)

ploidy(my_genlight) <- 2; 
ploidy(my_genind) <- 2;
pop(my_genlight) <- pop_data$V2;
pop(my_genind) <- pop_data$V2

samplingstrata <- as.data.frame(pop_data$V3);
strata(my_genlight) <- samplingstrata;
strata(my_genind) <- samplingstrata;
splitStrata(my_genlight) <- ~individual/site;
splitStrata(my_genind) <- ~individual/site


my_snpclone <- poppr::as.snpclone(my_genlight)
my_genclone <- poppr::as.genclone(my_genind)

Check out the my_genclone object

my_genclone

This is a genclone object
-------------------------
Genotype information:

    364 original multilocus genotypes 
    364 diploid individuals
   2185 codominant loci

Population information:

      2 strata - individual, site
     27 populations defined - 
NC-Durham, MS-Okt, WV-Mon-Motel, ..., WV-Mon-GuthrieLoop, WV-Mon-UniversityAve, TN-Shelby

Subset your data so only your sites are included:


#Check out the different sites across all the triad data and choose yours

popNames(my_genclone)

# output:

 [1] "NC-Durham"            "MS-Okt"               "WV-Mon-Motel"         "WV-Mon-Granville-Is" 
 [5] "WV-Mon-Ackerman"      "WV-Mon-Waterplant"    "WV-Mon-CheatLake"     "PA-Lawrence"         
 [9] "WV-Mon-MtValleyApts"  "WV-Mon-LittleFalls"   "WV-Mar-Glendale"      "WV-Mon-WestRun"      
[13] "PA-Wes-Greensburg"    "WV-Mon-GrantSt"       "WV-Mon-OutlookSt"     "WV-Mon-RailTrail"    
[17] "WV-Mon-KreppsPark"    "WV-Mon-Sabraton"      "WV-Mon-WestRunRd2"    "WV-Mon-RailTrail2"   
[21] "WV-Mon-Arboretum"     "WV-Mon-RailTrail3"    "WV-Mon-Fallingrun"    "WV-Mon-SabratonRT"   
[25] "WV-Mon-GuthrieLoop"   "WV-Mon-UniversityAve" "TN-Shelby" 


# The command to subset:

arb_gen_sub = popsub(my_genclone, sublist = c("NC-Durham", "MS-Okt", "WV-Mon-Motel"))
arb_gen_sub2 = popsub(my_snpclone, sublist = c("NC-Durham", "MS-Okt", "WV-Mon-Motel"))
# Literally just copy your population names, use quotes and commas!

# Now, check out your subsetted object:

arb_gen_sub

# Output:
This is a genclone object
-------------------------
Genotype information:

     24 original multilocus genotypes 
     24 diploid individuals
   2185 codominant loci

Population information:

      2 strata - individual, site
      3 populations defined - NC-Durham, MS-Okt, WV-Mon-Motel

Get some basic stats on your subsampled data:


# number of alleles per locus
table(arb_gen_sub$loc.fac)

# sample size for each site
summary(arb_gen_sub$pop)

# allelic richness per site (total class data, then go find your sites)

allelic.richness(genind2hierfstat(my_genind))$Ar %>%
  apply(MARGIN = 2, FUN = mean) %>% 
  round(digits = 3)

# observed heterozygosity

basic_arb = basic.stats(arb_gen_sub, diploid = TRUE)
Ho_arb = apply(basic_arb$Ho, MARGIN = 2, FUN = mean, na.rm = TRUE) %>%
  round(digits = 2)
Ho_arb

# expected heterozygosity
He_arb = apply(basic_arb$Hs, MARGIN = 2, FUN = mean, na.rm = TRUE) %>%
  round(digits = 2)
He_arb

Make a nice figure comparing Ho and He across sites:

Het_arb_df = data.frame(Site = names(Ho_arb), Ho = Ho_arb, He = He_arb) %>%
  melt(id.vars = "Site")

custom_theme = theme(
  axis.text.x = element_text(size = 20, angle = 90, vjust = 0.5, face = "bold"),
  axis.text.y = element_text(size = 20),
  axis.title.y = element_text(size = 20),
  axis.title.x = element_blank(),
  axis.line.y = element_line(size = 1),
  legend.title = element_blank(),
  legend.text = element_text(size = 20),
  panel.grid = element_blank(),
  panel.background = element_blank(),
  plot.title = element_text(hjust = 0.5, size = 20, face="bold")
  )

# Italic label
hetlab.o = expression(italic("H")[o])
hetlab.e = expression(italic("H")[e])

ggplot(data = Het_arb_df, aes(x = Site, y = value, fill = variable))+
  geom_bar(stat = "identity", position = position_dodge(width = 0.6), colour = "black")+
  scale_y_continuous(expand = c(0,0), limits = c(0,0.50))+
  scale_fill_manual(values = c("dodgerblue", "red"), labels = c(hetlab.o, hetlab.e))+
  ylab("Heterozygosity")+
  ggtitle("Observed and expected heterozygosity for Lonicera japonica")+
  custom_theme

Calculate mean Fis across all sites


# positive Fis = excess of homozygotes
# negative Fis = excess of heterozygotes

apply(basic_arb$Fis, MARGIN = 2, FUN = mean, na.rm = TRUE) %>%
  round(digits = 3)

Run a pairwise Fst analysis and Analysis of Molecular Variance (AMOVA):


# Calculate Weir & Cockeram (1984) metric of Fst, which measures population differentiation between all population pairs

arb_fst = genet.dist(arb_gen_sub, method = "WC84")

arb_fst

### What is the output? If you see negative numbers, that means Fst ~0

AMOVA: how is the variation partitioned across sites, among individuals within sites, and within diploid individuals?

arb_AMOVA <- poppr.amova(my_snpclone, ~ site)  # For the whole class data

#### ****This step may take a long time, you could cut to 499 reps!
arbsignif <- randtest(arb_AMOVA, nrepet = 999)

### Get various outputs
arb_AMOVA$results
arb_AMOVA$componentsofcovariance
arb_AMOVA$statphi
arbsignif

#### **** Repeat this with arb_gen_sub2 instead of my_snpclone

arb_AMOVA2 <- poppr.amova(arb_gen_sub2 , ~ site)  # For the your group data
arbsignif2 <- randtest(arb_AMOVA2, nrepet = 999)

### Get various outputs
arb_AMOVA2$results
arb_AMOVA2$componentsofcovariance
arb_AMOVA2$statphi
arbsignif2

### plot the null distributions vs. observed values for each level of variation and their significance

plot(arbsignif)
plot(arbsignif2)
# Variations within samples   = variation within individuals
# Variations between samples  = variation among individuals within sites
# Variations between sites    = variation among sites


If you have > 2 sites to compare, you can make a nice figure:

# Convert dist object to data.frame
fst.matrix = as.matrix(arb_fst)
ind = which( upper.tri(fst.matrix), arr.ind = TRUE)
fst.df = data.frame(Site1 = dimnames(fst.matrix)[[2]][ind[,2]],
                    Site2 = dimnames(fst.matrix)[[1]][ind[,1]],
                    Fst = fst.matrix[ ind ] %>% round(digits = 3))

# Convert minus values to zero
fst.df$Fst[fst.df$Fst < 0] = 0

# Print data.frame summary
fst.df %>% str

# Fst italic label
fst.label = expression(italic("F")[ST])

# Extract middle Fst value for gradient argument
mid = max(fst.df$Fst) / 2

# Plot heatmap
ggplot(data = fst.df, aes(x = Site1, y = Site2, fill = Fst))+
  geom_tile(colour = "black")+
  geom_text(aes(label = Fst), color="black", size = 10)+
  scale_fill_gradient2(low = "dodgerblue", mid = "pink", high = "red", midpoint = mid, name = fst.label, limits = c(0, max(fst.df$Fst)), breaks = c(0, 0.05, 0.10, 0.15))+
  scale_x_discrete(expand = c(0,0))+
  scale_y_discrete(expand = c(0,0), position = "right")+
  theme(axis.text = element_text(colour = "black", size = 20, face = "bold"),
        axis.title = element_blank(),
        panel.grid = element_blank(),
        panel.background = element_blank(),
        legend.position = "right",
        legend.title = element_text(size = 20, face = "bold"),
        legend.text = element_text(size = 20)
        )

Conduct Discriminant Analysis of Principal Components (DAPC). This will tell you more about population structure and admixture proportions


# Perform cross validation to find the optimal number of PCs to retain in DAPC
set.seed(123)
x = tab(arb_gen_sub, NA.method = "mean")
crossval = xvalDapc(x, arb_gen_sub$pop, result = "groupMean", xval.plot = TRUE)
crossval$`Number of PCs Achieving Lowest MSE`

### This tells you that keeping 14 PCs is optimal (Your results will likely differ!). Think of it this way: how much of the total variation do you need to capture the optimal population structure without overfitting?

24. Visualize the DAPC plot (scatter)


### Run the DAPC analysis

numPCs = as.numeric(crossval$`Number of PCs Achieving Lowest MSE`)
dapc1 = dapc(arb_gen_sub, arb_gen_sub$pop, n.pca = numPCs, n.da = 3)

### Here is where you plot the results
### You can change the colors to anything you want to (col = c())
### You can also change the point size (cex = )

scatter(dapc1, col=c("red","green","blue"), cex = 4)

### Voila!!!!

25. Now, let's plot the DAPC ancestry coefficients (structure-like plot)


compoplot(dapc1,  lab = rownames(x),  col.pal = rainbow)

26. A nicer way to present the plot with ggcompoplot


install.packages("remotes")
remotes::install_github("zkamvar/ggcompoplot")
library(ggcompoplot)

### The initial ancestry plot
compoplot(dapc1,col="rainbow")

### Organize by site
ggcompoplot(dapc1, arb_gen_sub,col=3) + theme(axis.text.x = element_blank())

# Voila!!

OK, that's nice, but it does not take full advantage of what DAPC does.

Really, we want the analysis to TELL us how many total ancestral genetic clusters there are, then plot these clusters, and plot ancestry proportions of the clusters for our individuals and populations.

# Find the optimal # of clusters
grp <- find.clusters(my_genclone, max.n.clust=8)

# Choose the optimal # of PCs to retain (press 200 and enter, where the cumulative variance hits a plateau)
# 200 then enter
# Now, enter the # of clusters that is optimal, where the BIC values is minimized (lowest). BIC = Bayesian Information Criterion.
# Press 5, or whatever the lowest values is in the plot

dapc1 <- dapc(my_genclone, grp$grp)
# Choose the optimal # of PCs to retain (again, press 200 and enter, where the cumulative variance hits a plateau)
# Then, choose to keep three Discriminant Functio axes (DFs) # press 3 then enter

# This next step plots the clusters along the first 2 DAPCs, but doesn't tell you anything about the population information!
scatter(dapc1)

# Now, let's plot ancestry coefficients just for you population
arb_inds <- indNames(arb_gen_sub)  # names of individuals to include

# Extract the posterior (ancestry) probabilities for just those individuals
posterior_sub <- dapc1$posterior[arb_inds, ]

# Order individuals by population (optional)
ord <- order(arb_gen_sub$pop)
posterior_ord <- posterior_sub[ord, ]
labels_ord <- arb_gen_sub$pop[ord]

# Set up layout: 2 rows, first for legend (1/4 height), second for plot (3/4)
layout(matrix(c(1, 2), nrow = 2), heights = c(1, 4))

# Colors for clusters
cols <- viridis::viridis(ncol(posterior_ord))

# 1. Custom legend (top)
par(mar = c(0, 0, 1, 0))  # no margins
plot.new()
legend("center", legend = colnames(posterior_ord), fill = cols,
       border = NA, bty = "n", horiz = TRUE, cex = 0.8)

# 2. compoplot (bottom)
par(mar = c(5, 4, 1, 2))  # reset margins
compoplot(posterior_ord,
          col = cols,
          show.lab = TRUE,
          lab = labels_ord,
          legend = FALSE)  # turn off default legend

Voila!!!

###############################

Plot tree with selected populations colored

###############################

# install.packages(c("ape","dplyr","ggplot2"))
# if (!require("BiocManager", quietly = TRUE))
#    install.packages("BiocManager")
# BiocManager::install("ggtree")


# Load libraries
library(ape)
library(ggtree)
library(dplyr)
library(ggplot2)

# Step 1: Read in the tree
tree <- read.tree("Lonicera_FastTree.tre")

# Step 2: Read metadata (3 columns, no headers)
meta <- read.table("lj_strata.txt", header = FALSE, stringsAsFactors = FALSE)
colnames(meta) <- c("sample", "population", "group")

# Step 3: List unique populations
unique(meta$population)

# Step 4: define populations of interest (****change the names to your populations!****)
pops_of_interest <- c("NC-Durham", "MS-Okt", "TN-Shelby")

# Subset metadata to those
meta_sub <- meta %>% filter(population %in% pops_of_interest)

# Step 5: Join tip labels to population data
tree_data <- data.frame(label = tree$tip.label)
tree_data <- left_join(tree_data, meta_sub, by = c("label" = "sample"))

# Plot with colored tips for selected populations (****change the names to your populations and any colors you want!****)
ggtree(tree) %<+% tree_data +  # pipe metadata to ggtree
    geom_tiplab(aes(color = population), size = 2.5, show.legend = TRUE) +
    scale_color_manual(values = c("NC-Durham" = "red", "MS-Okt" = "blue", "TN-Shelby" = "purple")) +
    theme_tree2() +
    ggtitle("Tree with Colored Tip Labels for Selected Populations")
⚠️ **GitHub.com Fallback** ⚠️