4 | More use cases - MoritzBlumer/winpca GitHub Wiki
Below is the source code used to generate Fig. 1 from our recent preprint. All can be downloaded from public databases as described below. Even though all analyses should be reproducible by following the below instructions, we recommend completing the tutorial first. Some of the below analyses require increased computational resources and will not be feasible to run on a personal computer. All commands listed below are relative to your current working directory and could theoretically be run like a BASH script (except temporary changes to the WinPCA config in section 4.1).
Some auxiliary files (e.g. metadata, chromosome sizes) are available for download and are derived from the linked repositories and publications. First, download these prepared files:
# set up and enter working directory
mkdir -p wicpca_use_cases && cd wicpca_use_cases
# fetch and decompress auxiliary files
wget https://github.com/MoritzBlumer/misc/raw/refs/heads/main/aux.zip
unzip aux.zip && rm -f aux.zip
Data from the 1000 Genomes project (Byrska-Bishop et al. (2022))
Requirement: bcftools
# create a directory
mkdir -p 2504_humans
# fetch VCFs
FTP_PREFIX=http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000G_2504_high_coverage/working/20220422_3202_phased_SNV_INDEL_SV/
while IFS= read -r LINE ; do
CHROM=$(echo "$LINE" | cut -f1)
wget -O 2504_humans/${CHROM}.raw.vcf.gz ${FTP_PREFIX}/1kGP_high_coverage_Illumina.${CHROM}.filtered.SNV_INDEL_SV_phased_panel.vcf.gz)
done < <(cat aux/humans.chrom_sizes.tsv)
# Subsample to 2504 core (high coverage WGS) samples and keep only biallellic SNPs and remove "raw" VCFs
while IFS= read -r LINE ; do
CHROM=$(echo "$LINE" | cut -f1)
bcftools view -O z -m 2 -M 2 -S aux/humans.samples.lst 2504_humans/chr${CHROM}.raw.vcf.gz > 2504_humans/chr${CHROM}.vcf.gz
rm -f 2504_humans/chr${CHROM}.raw.vcf.gz
done < <(cat aux/humans.chrom_sizes.tsv)
Since in this dataset PASS sites are not annotated as such in the filter column, set the --np
flag which disables the VCF_PASS_FILTER
setting in the WinPCA config file (winpca/modules/config.py
). Now process all chromosomes in a loop:
while IFS= read -r LINE ; do
CHROM=$(echo "$LINE" | cut -f1)
SIZE=$(echo "$LINE" | cut -f2)
winpca pca d2504_humans/${CHROM} 2504_humans/${CHROM}.vcf.gz ${CHROM}:1-${SIZE} -i 100000 --np
done < <(cat aux/humans.chrom_sizes.tsv)
GUIDE_SAMPLE= HG03078
while IFS= read -r LINE ; do
CHROM=$(echo "$LINE" | cut -f1)
winpca polarize 2504_humans/${CHROM} -c 1 -p guide_samples -g $GUIDE_SAMPLE
done < <(cat aux/humans.chrom_sizes.tsv)
winpca flip 2504_humans/chr1 --r -w 118500000,176500000,193500000,195500000,227500000
winpca flip 2504_humans/chr2 -w 61500000,91500000-241500000,175500000,192500000
winpca flip 2504_humans/chr3 -w 49500000-50500000,91500000,95500000,125500000,162500000-163500000
winpca flip 2504_humans/chr4 -w 57500000,103500000
winpca flip 2504_humans/chr5 -w 134700000
winpca flip 2504_humans/chr6 -w 63500000
winpca flip 2504_humans/chr7 -w 58500000-62500000,69500000,76500000
winpca flip 2504_humans/chr8 -w 10000000,23500000,34500000,36500000
winpca flip 2504_humans/chr10 --r
winpca flip 2504_humans/chr11 -w 89500000,37500000,86500000
winpca flip 2504_humans/chr12 -w 110500000
winpca flip 2504_humans/chr14 --r -w 67400000
winpca flip 2504_humans/chr15 -w 18600000
winpca flip 2504_humans/chr17 -w 26500000
winpca flip 2504_humans/chr18 -w 16500000-17500000,33500000
winpca flip 2504_humans/chr20 -w 20500000
winpca flip 2504_humans/chr11 -w 110500000
# set plot colors
COLORS='LWK:b7984d,GWD:e2b64c,MSL:d0b64d,ACB:d49a43,ASW:c16b36,YRI:e1b65a,ESN:ebcc51,PEL:b5302c,MXL:b0253a,CLM:b0253a,PUR:a34128,BEB:672a7b,STU:8c5793,ITU:8c3a5d,PJL:ad2f7f,GIH:5d4489,CHB:b8c750,KHV:7faa53,CHS:8daf4f,JPT:5e8a48,CDX:6e974b,FIN:81b6c2,GBR:96c3db,IBS:7a8ec0,CEU:3a4c92,TSI:2e3468'
# plot
while IFS= read -r LINE ; do
CHROM=$(echo "$LINE" | cut -f1)
SIZE=$(echo "$LINE" | cut -f2)
winpca chromplot 2504_humans/${chrom} ${CHROM}:1-${SIZE} -g population -m 2504_humans/humans.metadata.tsv -f PDF,HTML -i 5 -c $COLORS
done < <(cat aux/humans.chrom_sizes.tsv)
# compile comma-separated list of chromosomes
CHROMS=$(cut -f1 aux/humans.chrom_sizes.tsv | tr '\n' ',' | sed 's/.$//')
# plot
winpca genomeplot 2504_humans/ $CHROMS -g clade -m aux/humans.metadata.tsv -i 50 -f PDF,HTML -c $COLORS
# set PREFIX,CHROM, REGION
PREFIX=8p23_inv
CHROM=chr8
REGION=${CHROM}:6000000-14000000
# index VCF
bcftools index --threads 10 2504_humans/${CHROM}.vcf.gz
# subset to focal region
bcftools view -O z 2504_humans/${CHROM}.vcf.gz $REGION > 2504_humans/${PREFIX}.vcf.gz
# run windowed PCA
winpca pca 2504_humans/${PREFIX} 2504_humans/${PREFIX}.vcf.gz $REGION -w 100000 -i 10000 --np
# plot
winpca chromplot 2504_humans/${PREFIX} chr17:45150000-46600000 -g population -m aux/humans.metadata.tsv -c $COLORS -f HTML,PDF
Data from Ren et al. (2021)
# create a directory
mkdir -p 110_cannabis
# fetch VCF
wget -O 110_cannabis/cannabis.vcf.gz https://datadryad.org/stash/downloads/file_stream/844426
while IFS= read -r LINE ; do
CHROM=$(echo "$LINE" | cut -f1)
SIZE=$(echo "$LINE" | cut -f2)
winpca pca 110_cannabis/${CHROM} 110_cannabis/cannabis.vcf.gz ${CHROM}:1-${SIZE}
done < <(cat aux/cannabis.chrom_sizes.tsv)
GUIDE_SAMPLE=COA
while IFS= read -r LINE ; do
CHROM=$(echo "$LINE" | cut -f1)
winpca polarize 110_cannabis/${CHROM} -c 1 -p guide_samples -g $GUIDE_SAMPLE
done < <(cat aux/cannabis.chrom_sizes.tsv)
winpca flip 110_cannabis/01 -w aux/cannabis.01.flip.lst
winpca flip 110_cannabis/02 -w aux/cannabis.02.flip.lst
winpca flip 110_cannabis/03 -w aux/cannabis.03.flip.lst
winpca flip 110_cannabis/04 -w aux/cannabis.04.flip.lst
winpca flip 110_cannabis/05 -w aux/cannabis.05.flip.lst
winpca flip 110_cannabis/06 -w aux/cannabis.06.flip.lst
winpca flip 110_cannabis/07 -w aux/cannabis.07.flip.lst
winpca flip 110_cannabis/08 -w aux/cannabis.08.flip.lst
winpca flip 110_cannabis/09 -w aux/cannabis.09.flip.lst
winpca flip 110_cannabis/10 -w aux/cannabis.10.flip.lst
# set plot colors
COLORS='basal_cannabis:3884e0,hemp_type:bd4840,drug_type:178f2d,drug_type_feral:9ad134'
# plot
while IFS= read -r LINE ; do
CHROM=$(echo "$LINE" | cut -f1)
SIZE=$(echo "$LINE" | cut -f2)
winpca chromplot 110_cannabis/${CHROM} ${CHROM}:1-${SIZE} -g clade -m aux/cannabis.metadata.tsv -f PDF,HTML -i 5 -c $COLORS
done < <(cat aux/cannabis.chrom_sizes.tsv)
# compile comma-separated list of chromosomes
CHROMS=$(cut -f1 aux/cannabis.chrom_sizes.tsv | tr '\n' ',' | sed 's/.$//')
# plot
winpca genomeplot 110_cannabis/ $CHROMS -g clade -m aux/cannabis.metadata.tsv -i 50 -f PDF,HTML -c $COLORS
# set PREFIX,CHROM, REGION
PREFIX=chr9_inv
CHROM=09
REGION=${CHROM}:1-10000000
# run windowed PCA
winpca pca 110_cannabis/${PREFIX} 110_cannabis/cannabis.vcf.gz $REGION -w 500000 -i 50000
# reflect results to be consistent with the genome-wide plot
winpca flip $REGION 110_cannabis/${PREFIX} --r
# plot
winpca chromplot 110_cannabis/${NAME} 09:1-9000000 -g clade -m aux/cannabis.metadata.tsv -c $COLORS -f HTML,PDF
Data from Blumer et al. (2024)
# create a directory
mkdir -p 291_cichlids
# fetch VCF
wget -O 291_cichlids/cichlids.vcf.gz [**LINK**]
while IFS= read -r LINE ; do
CHROM=$(echo "$LINE" | cut -f1)
SIZE=$(echo "$LINE" | cut -f2)
winpca pca 291_cichlids/${CHROM} 291_cichlids/cichlids.vcf.gz ${CHROM}:1-${SIZE}
done < <(cat aux/cichlids.chrom_sizes.tsv)
GUIDE_SAMPLE=2021cicX11218981
while IFS= read -r LINE ; do
CHROM=$(echo "$LINE" | cut -f1)
winpca polarize 291_cichlids/${CHROM} -c 1 -p guide_samples -g $GUIDE_SAMPLE
done < <(cat aux/cichlids.chrom_sizes.tsv)
winpca flip 291_cichlids/chr1 --r -w 39500000-40650000
winpca flip 291_cichlids/chr3 --r -w 500000-1700000,2750000-6950000,31800000-51800000
winpca flip 291_cichlids/chr5 --r
winpca flip 291_cichlids/chr7 --r
winpca flip 291_cichlids/chr9 --r
winpca flip 291_cichlids/chr12 -w 37650000-38100000
winpca flip 291_cichlids/chr10 --r
winpca flip d291_cichlids/chr14 -w 34150000-34850000
winpca flip 291_cichlids/chr19 --r -w 27650000-30350000
winpca flip 291_cichlids/chr20 --r
# set plot colors
COLORS='F3:7a7a7a,F2:ababab,F1:5a7ce8,G0_F:f08018,G0_M:109410'
# plot
while IFS= read -r LINE ; do
CHROM=$(echo "$LINE" | cut -f1)
SIZE=$(echo "$LINE" | cut -f2)
winpca chromplot 291_cichlids/chr${CHROM} chr${CHROM}:1-${SIZE} -g generation -m aux/cichlids.metadata.tsv -f PDF,HTML -i 5 -c $COLORS
done < <(cat aux/cichlids.chrom_sizes.tsv)
# compile comma-separated list of chromosomes
CHROMS=$(cut -f1 aux/cichlids.chrom_sizes.tsv | tr '\n' ',' | sed 's/.$//')
# plot
winpca genomeplot 291_cichlids/ $CHROMS -g generation -m aux/cichlids.metadata.tsv -i 50 -f PDF,HTML -c $COLORS
Data from Quiroga-Carmona et al. (2024)
# create a directory
mkdir -p 71_phyllotis
# fetch VCF
wget -O 71_phyllotis/chr1.beagle.gz [**LINK**]
wget -O 71_phyllotis/chr2.beagle.gz [**LINK**]
wget -O 71_phyllotis/chr3.beagle.gz [**LINK**]
wget -O 71_phyllotis/chr4.beagle.gz [**LINK**]
wget -O 71_phyllotis/chr5.beagle.gz [**LINK**]
wget -O 71_phyllotis/chr6.beagle.gz [**LINK**]
wget -O 71_phyllotis/chr7.beagle.gz [**LINK**]
wget -O 71_phyllotis/chr8.beagle.gz [**LINK**]
wget -O 71_phyllotis/chr9.beagle.gz [**LINK**]
wget -O 71_phyllotis/chr10.beagle.gz [**LINK**]
wget -O 71_phyllotis/chr11.beagle.gz [**LINK**]
wget -O 71_phyllotis/chr12.beagle.gz [**LINK**]
wget -O 71_phyllotis/chr14.beagle.gz [**LINK**]
wget -O 71_phyllotis/chr15.beagle.gz [**LINK**]
wget -O 71_phyllotis/chr16.beagle.gz [**LINK**]
wget -O 71_phyllotis/chr17.beagle.gz [**LINK**]
wget -O 71_phyllotis/chr18.beagle.gz [**LINK**]
while IFS= read -r LINE ; do
CHROM=$(echo "$LINE" | cut -f1)
SIZE=$(echo "$LINE" | cut -f2)
winpca pca 71_phyllotis/${CHROM} 71_phyllotis/${CHROM}.beagle.gz ${CHROM}:1-${SIZE} -i 25000 -v GL
done < <(aux/phyllotis.chrom_sizes.tsv)
winpca flip 71_phyllotis/chr2 --r
winpca flip 71_phyllotis/chr3 -w 103125000-157750000
winpca flip 71_phyllotis/chr4 -w 86175000-145300000
winpca flip 71_phyllotis/chr5 -w 83100000-147975000
winpca flip 71_phyllotis/chr6 --r
winpca flip 71_phyllotis/chr7 -w 75675000-115050000
winpca flip 71_phyllotis/chr8 -w 73900000-116275000
winpca flip 71_phyllotis/chr10 -w 69725000-106225000
winpca flip 71_phyllotis/chr11 --r
winpca flip 71_phyllotis/chr12 -w 67250000-119875000
winpca flip 71_phyllotis/chr15 -w 54125000-95500000
winpca flip 71_phyllotis/chr16 -w 49125000-87125000
winpca flip 71_phyllotis/chr19 --r
# set plot colors
COLORS='vaccarum:346beb,limatus:109410,hybrid:eb3434'
# plot
while IFS= read -r line ; do
CHROM=$(echo "$line" | cut -f1)
SIZE=$(echo "$line" | cut -f2)
winpca chromplot 71_phyllotis/${CHROM} ${CHROM}:1-${SIZE} -g species -m aux/phyllotis.metadata.tsv -f PDF,HTML -i 5 -c $COLORS
done < <(cat aux/phyllotis.chrom_sizes.tsv)
# compile comma-separated list of chromosomes
CHROMS=$(cut -f1 aux/phyllotis.chrom_sizes.tsv | tr '\n' ',' | sed 's/.$//')
# plot
winpca genomeplot 71_phyllotis/ $CHROMS -g species -m aux/phyllotis.metadata.tsv -i 10 -f PDF,HTML -c $COLORS