LDpred2 - declan93/PGS-LMM GitHub Wiki
LDpred2
We can replace the P+T step with LDpred2 https://privefl.github.io/bigsnpr/articles/LDpred2.html
https://choishingwan.github.io/PRS-Tutorial/ldpred/ provides another example of how to run LDpred2
As we are generating a number of association results from the same SNP set and in the ease of computational restraints we will create LOCO genotype objects, create LOCO LD correlations by first calculating per chromosome LD.
#generate LOCO Genotype objects. This needs to be the full GWAS sample set
library("bigsnpr")
args = commandArgs(trailingOnly=TRUE)
# args 1 Genotype file handle
# args 2 LOCO summary stats
# args 3 The left out chromosome number
fileIN <- paste(args[1],".bed",sep="")
fileRDS <- paste(args[1],".rds",sep="")
#snp_readBed(fileIN)
obj.bigSNP <- snp_attach(fileRDS)
G <- obj.bigSNP$genotypes
MAP <- obj.bigSNP$map
FAM <- obj.bigSNP$fam
CHR <- obj.bigSNP$map$chromosome
POS <- obj.bigSNP$map$physical.pos
# On a slurm system
NCORES <- as.numeric(system2("echo", "$SLURM_JOB_CPUS_PER_NODE",stdout=TRUE))
#NCORES <- nb_cores()
# Read in the LOCO summary stats
sumstats <- bigreadr::fread2(args[2])
names(sumstats) <- c("chr", "rsid", "pos", "a0", "a1","n_eff", "af", "beta", "beta_se", "p")
map <- obj.bigSNP$map[-(2:3)]
names(map) <- c("chr", "pos", "a0", "a1")
info_snp <- snp_match(sumstats, map)
MAP2 <- MAP[match(info_snp$rsid, obj.bigSNP$map$marker.ID),]
# Create a LOCO subset of the Genotypes.
G2 <- big_copy(G,ind.col=match(info_snp$rsid, obj.bigSNP$map$marker.ID), backingfile = paste("data/LOCO_",args[3], sep=""))
snp.list <- structure(list(genotypes = G2, fam = FAM, map = MAP2), class = "bigSNP")
saveRDS(snp.list,paste("data/LOCO_",args[3],".rds", sep=""))
Generate LOCO LD correlation matrices
We can speed up this computation by reducing the sample size to ~ 20,000.
# First write chromosome level genotypes to a format bigsnpR can read quickly
for (chr in 1:22) {
# preprocess the bed file (only need to do once for each data set)
# Assuming the file naming is HAPMAP_LDpred2_#.bed
snp_readBed(paste0("HAPMAP_LDPred2_",chr,".bed"))
}
Calculate LD correlations for each chromosome
library(magrittr)
library("bigsnpr")
library("data.table")
args = commandArgs(trailingOnly=TRUE)
# args 1 Summary statistics- full
# args 2 chromosome
# Get maximum amount of cores
NCORES <- as.numeric(system2("echo", "$SLURM_JOB_CPUS_PER_NODE",stdout=TRUE))
# Open a temporary file
tmp <- tempfile(tmpdir = "tmp-data")
on.exit(file.remove(paste0(tmp, ".sbk")), add = TRUE)
sumstats <- bigreadr::fread2(args[1])
chr=args[2]
names(sumstats) <- c("chr", "rsid", "pos", "a0", "a1","n_eff", "af", "beta", "beta_se", "p")
# Initialize variables for storing the LD score and LD matrix
corr <- NULL
ld <- NULL
# We want to know the ordering of samples in the bed file
info_snp <- NULL
fam.order <- NULL
# now attach the genotype object that we created above
obj.bigSNP <- snp_attach(paste0("HAPMAP_LDPred2_",chr,".rds"))
# extract the SNP information from the genotype
map <- obj.bigSNP$map[-3]
names(map) <- c("chr", "rsid", "pos", "a1", "a0")
# perform SNP matching
tmp_snp <- snp_match(sumstats[sumstats$chr==chr,], map)
info_snp <- rbind(info_snp, tmp_snp)
# Assign the genotype to a variable for easier downstream analysis
genotype <- obj.bigSNP$genotypes
# Rename the data structures
CHR <- map$chr
POS <- map$pos
# get the CM information from 1000 Genome
# will download the 1000G file to the current directory (".")
POS2 <- snp_asGeneticPos(CHR, POS, dir = ".")
# calculate LD
# Extract SNPs that are included in the chromosome
ind.chr <- which(tmp_snp$chr == chr)
ind.chr2 <- tmp_snp$`_NUM_ID_`[ind.chr]
# Calculate the LD
corr0 <- snp_cor(
genotype,
ind.col = ind.chr2,
ncores = NCORES,
infos.pos = POS2[ind.chr2],
size = 3 / 1000)
saveRDS(corr0, file = paste("data/corr/corr_",chr,".rds",sep=""))
saveRDS(ld, file = paste("data/corr/ld_",chr,".rds",sep=""))
Combine LOCO LD matrices
library("bigsnpr")
args = commandArgs(trailingOnly=TRUE)
tmp <- tempfile(tmpdir = "tmp-data")
on.exit(file.remove(paste0(tmp, ".sbk")), add = TRUE)
NCORES <- as.numeric(system2("echo", "$SLURM_JOB_CPUS_PER_NODE",stdout=TRUE))
# concatenation will need to start at chr 2 for chr 1
x <- ifelse(args[1] == 1, 2, 1)
for (chr in 1:22) {
if (chr == args[1]){
next
}
## indices in 'sumstats'
corr0 <- readRDS(paste0("data/corr/corr_", chr, ".rds"))
print(str(corr0))
if (chr == x) {
# Declan removed '_NUM_UID_'
ld <- Matrix::colSums(corr0^2)
corr <- as_SFBM(corr0, tmp)
} else {
ld <- c(ld, Matrix::colSums(corr0^2))
corr$add_columns(corr0, nrow(corr))
}
}
saveRDS(corr, file = paste("data/corr/corr_LOCO",args[1],".rds",sep=""))
saveRDS(ld, file = paste("data/corr/ld_LOCO",args[1],".rds",sep=""))
Calculate LOCO PGS
library(bigsnpr)
library(ggpubr)
args = commandArgs(trailingOnly=TRUE)
# args 1 Pheno
# args 2 summary stats
# args 3 LOCO chromosome
fileRDS <- paste("data/LOCO_",args[3],".rds",sep="")
obj.bigSNP <- snp_attach(fileRDS)
G <- obj.bigSNP$genotypes
CHR <- obj.bigSNP$map$chromosome
POS <- obj.bigSNP$map$physical.pos
NCORES <- as.numeric(system2("echo", "$SLURM_JOB_CPUS_PER_NODE",stdout=TRUE))
sumstats <- bigreadr::fread2(args[2])
names(sumstats) <- c("chr", "rsid", "pos", "a0", "a1","n_eff", "af", "beta", "beta_se", "p")
set.seed(1)
map <- obj.bigSNP$map[-(2:3)]
names(map) <- c("chr", "pos", "a0", "a1")
info_snp <- snp_match(sumstats, map)
df_beta <- info_snp[, c("beta", "beta_se", "n_eff")]
ld <- readRDS(paste("data/corr/ld_LOCO",args[3],".rds",sep=""))
corr <- readRDS(paste("data/corr/corr_LOCO",args[3],".rds",sep=""))
(ldsc <- with(df_beta, snp_ldsc(ld, length(ld), chi2 = (beta / beta_se)^2, sample_size = n_eff, blocks = NULL)))
h2_est <- ldsc["h2"](/declan93/PGS-LMM/wiki/"h2")
# auto
multi_auto <- snp_ldpred2_auto(corr, df_beta, h2_init = h2_est, vec_p_init = seq_log(1e-4, 0.9, length.out = NCORES), ncores = NCORES)
beta_auto <- sapply(multi_auto, function(auto) auto$beta_est)
G2 <- snp_fastImputeSimple(G, ncores=NCORES)
pred_auto <- big_prodMat(G2, beta_auto)
sc <- apply(pred_auto, 2, sd)
keep <- abs(sc - median(sc)) < 3 * mad(sc)
final_beta_auto <- rowMeans(beta_auto[, keep])
final_pred_auto <- rowMeans(pred_auto[, keep])
PRS <- cbind(obj.bigSNP$fam$family.ID,final_pred_auto)
colnames(PRS) <- c("ID","PGS")
beta_out <- cbind(info_snp, final_beta_auto)
write.table(beta_out, paste("Pred/",args[1],"_pred_auto_LOCO_",args[3],".out",sep=""), quote=F, row.names=F)
write.table(PRS, paste("Pred/",args[1],"_pred_auto_LOCO_",args[3],".PRS.out",sep=""), quote=F, row.names=F)
auto <- multi_auto[1](/declan93/PGS-LMM/wiki/1)
p <- plot_grid(
qplot(y = auto$path_p_est) +
theme_bigstatsr() +
geom_hline(yintercept = auto$p_est, col = "blue") +
scale_y_log10() +
labs(y = "p"),
qplot(y = auto$path_h2_est) +
theme_bigstatsr() +
geom_hline(yintercept = auto$h2_est, col = "blue") +
labs(y = "h2"),
ncol = 1, align = "hv"
)
ggsave(plot=p,filename=paste("Pred/",args[1],"_LOCO_",args[3],".png",sep=""), device="png")