2022 Biol 320 Population genomic analyses with R - barrettlab/2021-Genomics-bootcamp GitHub Wiki

Analyses in R: population genomics

A great resource for population genomics in R by Tom Jenkins

Data file for Lonicera japonica in vcf format

Strata file for delimiting individuals, sites, and regions in .txt format

12. Load libraries

# install.packages(c("vcfR","poppr","adegenet","ape","mmod","magrittr","hierfstat","pegas","qvalue","ggplot2","plot3D","plotly","igraph","RColorBrewer","grid","R.devices","reshape2"))

library(vcfR); library(poppr); library(adegenet); library(ape); library(mmod); 
library(magrittr); library(hierfstat); library(pegas); library(qvalue); 
library(ggplot2); library(plot3D); library(plotly); library(igraph);
library(RColorBrewer); library(grid); library(R.devices); library(reshape2)

13. Read in your variants file in vcf format

filtered_vcf <- read.vcfR("imputed.vcf")

### convert to genlight and genind formats so we can analyze in R/adegenet

my_genlight <- vcfR2genlight(filtered_vcf)
my_genind <- vcfR2genind(filtered_vcf)

14. Read in the population strata file

pop_data <- read.table("ljstrata.txt", sep = "\t", header = FALSE)

### have a look at the pop data: this tells R how your samples are divided up into sites and counties/stataes
head(pop_data)

### C01A-1-1-WV-Mon-Urban	urban	monogalia-wv	C01A-1-1-WV-Mon-Urban_urban_monogalia-wv
### C01A-1-2-WV-Mon-Urban	urban	monogalia-wv	C01A-1-2-WV-Mon-Urban_urban_monogalia-wv
### C01A-1-3-WV-Mon-Urban	urban	monogalia-wv	C01A-1-3-WV-Mon-Urban_urban_monogalia-wv
### C01A-1-4-WV-Mon-Urban	urban	monogalia-wv	C01A-1-4-WV-Mon-Urban_urban_monogalia-wv
### C01A-1-5-WV-Mon-Urban	urban	monogalia-wv	C01A-1-5-WV-Mon-Urban_urban_monogalia-wv
### C01A-1-6-WV-Mon-Urban	urban	monogalia-wv	C01A-1-6-WV-Mon-Urban_urban_monogalia-wv
### C01A-2-10-WV-Mon-Natural	natural	monogalia-wv	C01A-2-10-WV-Mon-Natural_natural_monogalia-wv
### C01A-2-11-WV-Mon-Natural	natural	monogalia-wv	C01A-2-11-WV-Mon-Natural_natural_monogalia-wv
### C01A-2-12-WV-Mon-Natural	natural	monogalia-wv	C01A-2-12-WV-Mon-Natural_natural_monogalia-wv
### C01A-2-7-WV-Mon-Natural	natural	monogalia-wv	C01A-2-7-WV-Mon-Natural_natural_monogalia-wv
### C01A-2-8-WV-Mon-Natural	natural	monogalia-wv	C01A-2-8-WV-Mon-Natural_natural_monogalia-wv
### C01A-2-9-WV-Mon-Natural	natural	monogalia-wv	C01A-2-9-WV-Mon-Natural_natural_monogalia-wv


### make sure they are in the same order as in your SNP data
[email protected]

###   [1] "C01A-1-1-WV-Mon-Urban_S1"                   "C01A-1-2-WV-Mon-Urban_S2"                  
###   [3] "C01A-1-3-WV-Mon-Urban_S3"                   "C01A-1-4-WV-Mon-Urban_S4"                  
###   [5] "C01A-1-5-WV-Mon-Urban_S5"                   "C01A-1-6-WV-Mon-Urban_S6"                  
###   [7] "C01A-2-10-WV-Mon-Natural_S10"               "C01A-2-11-WV-Mon-Natural_S11"              
###   [9] "C01A-2-12-WV-Mon-Natural_S12"               "C01A-2-7-WV-Mon-Natural_S7"                
###  [11] "C01A-2-8-WV-Mon-Natural_S8"                 "C01A-2-9-WV-Mon-Natural_S9"                
###  [13] "C01B-1-1-WV_S13"                            "C01B-1-2-WV_S14"           

15. Set ploidy to 2 (diploid and assign your groupings of interest for downstream analysis. Here, we are interested in 'sites'

ploidy(my_genlight) <- 2; 
ploidy(my_genind) <- 2;
pop(my_genlight) <- pop_data$V2;
pop(my_genind) <- pop_data$V2

16. Apply the sampling strata and name the different hierarchical categories

samplingstrata <- as.data.frame(pop_data$V4);
strata(my_genlight) <- samplingstrata;
strata(my_genind) <- samplingstrata;
splitStrata(my_genlight) <- ~individual/site/region;
splitStrata(my_genind) <- ~individual/site/region

# individual = the unique sample
# site = your sampling site (e.g. arboretumtop, arboretumbot, arboretummid)
# region = braod scale region (e.g. monongalia-wv, fayette-pa, shelby-tn) 

# below, this will be useful in analyzing clonal populations?
my_snpclone <- poppr::as.snpclone(my_genlight)
my_genclone <- poppr::as.genclone(my_genind)

# Check out the my_genclone object

my_genclone


###   This is a genclone object
###   -------------------------
###   Genotype information:
###   
###      216 original multilocus genotypes 
###      216 diploid individuals
###      220 codominant loci
###   
###   Population information:
###   
###        3 strata - Individual, Locality, Region
###       37 populations defined - urban, natural, midlat, ..., littlefalls, starcity, uffington

17. Subset your data so only your sites are included:


#Check out the different sites across all the triad data and choose yours

popNames(my_genclone)

# output:

###  [1] "urban"              "natural"            "midlat"             "highlat"           
###  [5] "lowlat"             "ripley"             "fairmont"           "coopersrock"       
###  [9] "smithfield"         "mosondixon"         "arboretumedge"      "oktibbeha"         
### [13] "arboretumtop"       "arboretummid"       "arboretumbot"       "arboretumhill"     
### [17] "arboretum"          "wvulsblot"          "cheatlake"          "rockforge"         
### [21] "edinburg"           "roadside"           "apts"               "stillwater"        
### [25] "ptmarion"           "railtrail"          "montgomery"         "arboretumundist"   
### [29] "railtraildisturbed" "overtonpark"        "durhamnc"           "alleghenypa"       
### [33] "chisholm"           "deckerscreek"       "littlefalls"        "starcity"          
### [37] "uffington"  


# The command to subset:

arb_gen_sub = popsub(my_genclone, sublist = c("arboretumbot","arboretummid","arboretumtop"))

arb_gen_sub

### This is a genclone object
### -------------------------
### Genotype information:
### 
###     18 original multilocus genotypes 
###     18 diploid individuals
###    220 codominant loci
### 
### Population information:
### 
###      3 strata - Individual, Locality, Region
###      3 populations defined - arboretumtop, arboretummid, arboretumbot

18. Get some basic stats on your subsampled data:


# number of alleles per locus
table(arb_gen_sub$loc.fac)

# sample size for each site
summary(arb_gen_sub$pop)

# allelic richness per site (total class data, then go find your sites)

allelic.richness(genind2hierfstat(my_genind))$Ar %>%
  apply(MARGIN = 2, FUN = mean) %>% 
  round(digits = 3)

# observed heterozygosity

basic_arb = basic.stats(arb_gen_sub, diploid = TRUE)
Ho_arb = apply(basic_arb$Ho, MARGIN = 2, FUN = mean, na.rm = TRUE) %>%
  round(digits = 2)
Ho_arb

# expected heterozygosity
He_arb = apply(basic_arb$Hs, MARGIN = 2, FUN = mean, na.rm = TRUE) %>%
  round(digits = 2)
He_arb

19. Make a nice figure comparing Ho and He across sites:

Het_arb_df = data.frame(Site = names(Ho_arb), Ho = Ho_arb, He = He_arb) %>%
  melt(id.vars = "Site")

custom_theme = theme(
  axis.text.x = element_text(size = 20, angle = 90, vjust = 0.5, face = "bold"),
  axis.text.y = element_text(size = 20),
  axis.title.y = element_text(size = 20),
  axis.title.x = element_blank(),
  axis.line.y = element_line(size = 1),
  legend.title = element_blank(),
  legend.text = element_text(size = 20),
  panel.grid = element_blank(),
  panel.background = element_blank(),
  plot.title = element_text(hjust = 0.5, size = 20, face="bold")
  )

# Italic label
hetlab.o = expression(italic("H")[o])
hetlab.e = expression(italic("H")[e])

ggplot(data = Het_arb_df, aes(x = Site, y = value, fill = variable))+
  geom_bar(stat = "identity", position = position_dodge(width = 0.6), colour = "black")+
  scale_y_continuous(expand = c(0,0), limits = c(0,0.50))+
  scale_fill_manual(values = c("dodgerblue", "red"), labels = c(hetlab.o, hetlab.e))+
  ylab("Heterozygosity")+
  ggtitle("Observed and expected heterozygosity for Lonicera japonica")+
  custom_theme

20. Calculate mean Fis across all sites


# positive Fis = excess of homozygotes
# negative Fis = excess of heterozygotes

apply(basic_arb$Fis, MARGIN = 2, FUN = mean, na.rm = TRUE) %>%
  round(digits = 3)

21. Run a pairwise Fst analysis and Analysis of Molecular Variance (AMOVA):


# 21A. Fst

arb_fst = genet.dist(arb_gen_sub, method = "WC84")

arb_fst

### What is the output? If you see negative numbers, that means Fst ~0

# 21B. AMOVA: how is the variation partitioned across sites, among individuals within sites, and within diploid individuals?

arb_AMOVA <- poppr.amova(my_genclone, ~region/site, within = TRUE)
arbsignif <- randtest(arb_AMOVA, nrepet = 999)

### Get various outputs
striata_AMOVA$results
striata_AMOVA$componentsofcovariance
striata_AMOVA$statphi
arbsignif

### plot the null distributions vs. observed values for each level of variation and their significance

plot(arbsignif)

# Variations within samples   = variation within individuals
# Variations between samples  = variation among individuals within sites
# Variations between sites    = variation among sites


22. If you have > 2 sites to compare, you can make a nice figure:

# Convert dist object to data.frame
fst.matrix = as.matrix(arb_fst)
ind = which( upper.tri(fst.matrix), arr.ind = TRUE)
fst.df = data.frame(Site1 = dimnames(fst.matrix)[2](/barrettlab/2021-Genomics-bootcamp/wiki/2)[ind[,2]],
                    Site2 = dimnames(fst.matrix)[1](/barrettlab/2021-Genomics-bootcamp/wiki/1)[ind[,1]],
                    Fst = fst.matrix[ ind ] %>% round(digits = 3))

# Convert minus values to zero
fst.df$Fst[fst.df$Fst < 0] = 0

# Print data.frame summary
fst.df %>% str

# Fst italic label
fst.label = expression(italic("F")[ST])

# Extract middle Fst value for gradient argument
mid = max(fst.df$Fst) / 2

# Plot heatmap
ggplot(data = fst.df, aes(x = Site1, y = Site2, fill = Fst))+
  geom_tile(colour = "black")+
  geom_text(aes(label = Fst), color="black", size = 10)+
  scale_fill_gradient2(low = "dodgerblue", mid = "pink", high = "red", midpoint = mid, name = fst.label, limits = c(0, max(fst.df$Fst)), breaks = c(0, 0.05, 0.10, 0.15))+
  scale_x_discrete(expand = c(0,0))+
  scale_y_discrete(expand = c(0,0), position = "right")+
  theme(axis.text = element_text(colour = "black", size = 20, face = "bold"),
        axis.title = element_blank(),
        panel.grid = element_blank(),
        panel.background = element_blank(),
        legend.position = "right",
        legend.title = element_text(size = 20, face = "bold"),
        legend.text = element_text(size = 20)
        )

23. Conduct Discriminant Analysis of Principal Components (DAPC). This will tell you more about population structure and admixture proportions


# Perform cross validation to find the optimal number of PCs to retain in DAPC
set.seed(123)
x = tab(arb_gen_sub, NA.method = "mean")
crossval = xvalDapc(x, arb_gen_sub$pop, result = "groupMean", xval.plot = TRUE)
crossval$`Number of PCs Achieving Lowest MSE`

### This tells you that keeping 10 PCs is optimal. Think of it this way: how much of the total variation do you need to capture the optimal population structure without overfitting?

24. Visualize the DAPC plot (scatter)


### Run the DAPC analysis

numPCs = as.numeric(crossval$`Number of PCs Achieving Lowest MSE`)
dapc1 = dapc(arb_gen_sub, arb_gen_sub$pop, n.pca = numPCs, n.da = 3)

### Here is where you plot the results
### You can change the colors to anything you want to (col = c())
### You can also change the point size (cex = )

scatter(dapc1, col=c("red","green","blue"), cex = 4)

### Voila!!!!

25. Now, let's plot the DAPC ancestry coefficients (structure-like plot)


compoplot(dapc1,  lab = rownames(x),  col.pal = rainbow)

26. A nicer way to present the plot with ggcompoplot


install.packages("remotes")
remotes::install_github("zkamvar/ggcompoplot")
library(ggcompoplot)

### The initial ancestry plot
compoplot(dapc1,col="rainbow")

### Organize by site
ggcompoplot(dapc1, arb_gen_sub,col=3) + theme(axis.text.x = element_blank())

# Voila!!