LDpred2 - declan93/PGS-LMM GitHub Wiki

LDpred2

We can replace the P+T step with LDpred2 https://privefl.github.io/bigsnpr/articles/LDpred2.html

https://choishingwan.github.io/PRS-Tutorial/ldpred/ provides another example of how to run LDpred2

As we are generating a number of association results from the same SNP set and in the ease of computational restraints we will create LOCO genotype objects, create LOCO LD correlations by first calculating per chromosome LD.

#generate LOCO Genotype objects. This needs to be the full GWAS sample set 
library("bigsnpr")

args = commandArgs(trailingOnly=TRUE)
# args 1 Genotype file handle
# args 2 LOCO summary stats
# args 3 The left out chromosome number 

fileIN <- paste(args[1],".bed",sep="")
fileRDS <- paste(args[1],".rds",sep="")
#snp_readBed(fileIN)

obj.bigSNP <- snp_attach(fileRDS)

G   <- obj.bigSNP$genotypes
MAP  <- obj.bigSNP$map
FAM  <- obj.bigSNP$fam
CHR <- obj.bigSNP$map$chromosome
POS <- obj.bigSNP$map$physical.pos

# On a slurm system 
NCORES <- as.numeric(system2("echo", "$SLURM_JOB_CPUS_PER_NODE",stdout=TRUE))
#NCORES <- nb_cores()

# Read in the LOCO summary stats
sumstats <- bigreadr::fread2(args[2]) 
names(sumstats) <- c("chr", "rsid", "pos", "a0", "a1","n_eff", "af", "beta", "beta_se", "p")

map <- obj.bigSNP$map[-(2:3)]
names(map) <- c("chr", "pos", "a0", "a1")
info_snp <- snp_match(sumstats, map)

MAP2 <- MAP[match(info_snp$rsid, obj.bigSNP$map$marker.ID),]

# Create a LOCO subset of the Genotypes. 
G2 <- big_copy(G,ind.col=match(info_snp$rsid, obj.bigSNP$map$marker.ID), backingfile = paste("data/LOCO_",args[3], sep=""))

snp.list <- structure(list(genotypes = G2, fam = FAM, map = MAP2), class = "bigSNP")
saveRDS(snp.list,paste("data/LOCO_",args[3],".rds", sep=""))

Generate LOCO LD correlation matrices

We can speed up this computation by reducing the sample size to ~ 20,000.

# First write chromosome level genotypes to a format bigsnpR can read quickly 
for (chr in 1:22) {
    # preprocess the bed file (only need to do once for each data set)
    # Assuming the file naming is HAPMAP_LDpred2_#.bed
    snp_readBed(paste0("HAPMAP_LDPred2_",chr,".bed"))

}

Calculate LD correlations for each chromosome

library(magrittr)
library("bigsnpr")
library("data.table")
args = commandArgs(trailingOnly=TRUE)
# args 1 Summary statistics- full
# args 2 chromosome
# Get maximum amount of cores
NCORES <- as.numeric(system2("echo", "$SLURM_JOB_CPUS_PER_NODE",stdout=TRUE))

# Open a temporary file
tmp <- tempfile(tmpdir = "tmp-data")
on.exit(file.remove(paste0(tmp, ".sbk")), add = TRUE)

sumstats <- bigreadr::fread2(args[1])
chr=args[2]
names(sumstats) <- c("chr", "rsid", "pos", "a0", "a1","n_eff", "af", "beta", "beta_se", "p")

# Initialize variables for storing the LD score and LD matrix
corr <- NULL
ld <- NULL

# We want to know the ordering of samples in the bed file 
info_snp <- NULL
fam.order <- NULL


# now attach the genotype object that we created above
obj.bigSNP <- snp_attach(paste0("HAPMAP_LDPred2_",chr,".rds"))
# extract the SNP information from the genotype
map <- obj.bigSNP$map[-3]
names(map) <- c("chr", "rsid", "pos", "a1", "a0")
# perform SNP matching
tmp_snp <- snp_match(sumstats[sumstats$chr==chr,], map)
info_snp <- rbind(info_snp, tmp_snp)
# Assign the genotype to a variable for easier downstream analysis
genotype <- obj.bigSNP$genotypes
# Rename the data structures
CHR <- map$chr
POS <- map$pos
# get the CM information from 1000 Genome
# will download the 1000G file to the current directory (".")
POS2 <- snp_asGeneticPos(CHR, POS, dir = ".")
# calculate LD
# Extract SNPs that are included in the chromosome
ind.chr <- which(tmp_snp$chr == chr)
ind.chr2 <- tmp_snp$`_NUM_ID_`[ind.chr]
# Calculate the LD
corr0 <- snp_cor(
         genotype,
        ind.col = ind.chr2,
         ncores = NCORES,
         infos.pos = POS2[ind.chr2],
         size = 3 / 1000)
saveRDS(corr0, file = paste("data/corr/corr_",chr,".rds",sep=""))
saveRDS(ld, file = paste("data/corr/ld_",chr,".rds",sep=""))

Combine LOCO LD matrices

library("bigsnpr")

args = commandArgs(trailingOnly=TRUE)


tmp <- tempfile(tmpdir = "tmp-data")
on.exit(file.remove(paste0(tmp, ".sbk")), add = TRUE)
NCORES <- as.numeric(system2("echo", "$SLURM_JOB_CPUS_PER_NODE",stdout=TRUE))

# concatenation will need to start at chr 2 for chr 1
x <- ifelse(args[1] == 1, 2, 1)
for (chr in 1:22) {
        if (chr == args[1]){
                next
        }


    ## indices in 'sumstats'
    corr0 <- readRDS(paste0("data/corr/corr_", chr, ".rds"))
    print(str(corr0))
    if (chr == x) {
        # Declan removed '_NUM_UID_'
      ld <- Matrix::colSums(corr0^2)
      corr <- as_SFBM(corr0, tmp)
    } else {
      ld <- c(ld, Matrix::colSums(corr0^2))
      corr$add_columns(corr0, nrow(corr))
    }
}
saveRDS(corr, file = paste("data/corr/corr_LOCO",args[1],".rds",sep=""))
saveRDS(ld, file = paste("data/corr/ld_LOCO",args[1],".rds",sep=""))

Calculate LOCO PGS

library(bigsnpr)
library(ggpubr)
args = commandArgs(trailingOnly=TRUE)
# args 1 Pheno
# args 2 summary stats
# args 3 LOCO chromosome

fileRDS <- paste("data/LOCO_",args[3],".rds",sep="")

obj.bigSNP <- snp_attach(fileRDS)

G   <- obj.bigSNP$genotypes
CHR <- obj.bigSNP$map$chromosome
POS <- obj.bigSNP$map$physical.pos

NCORES <- as.numeric(system2("echo", "$SLURM_JOB_CPUS_PER_NODE",stdout=TRUE))

sumstats <- bigreadr::fread2(args[2])
names(sumstats) <- c("chr", "rsid", "pos", "a0", "a1","n_eff", "af", "beta", "beta_se", "p")

set.seed(1)

map <- obj.bigSNP$map[-(2:3)]
names(map) <- c("chr", "pos", "a0", "a1")
info_snp <- snp_match(sumstats, map)

df_beta <- info_snp[, c("beta", "beta_se", "n_eff")]

ld <- readRDS(paste("data/corr/ld_LOCO",args[3],".rds",sep=""))
corr <- readRDS(paste("data/corr/corr_LOCO",args[3],".rds",sep=""))

(ldsc <- with(df_beta, snp_ldsc(ld, length(ld), chi2 = (beta / beta_se)^2, sample_size = n_eff, blocks = NULL)))
h2_est <- ldsc["h2"](/declan93/PGS-LMM/wiki/"h2")

# auto
multi_auto <- snp_ldpred2_auto(corr, df_beta, h2_init = h2_est, vec_p_init = seq_log(1e-4, 0.9, length.out = NCORES), ncores = NCORES)
beta_auto <- sapply(multi_auto, function(auto) auto$beta_est)

G2 <- snp_fastImputeSimple(G, ncores=NCORES)

pred_auto <- big_prodMat(G2, beta_auto)

sc <- apply(pred_auto, 2, sd)
keep <- abs(sc - median(sc)) < 3 * mad(sc)
final_beta_auto <- rowMeans(beta_auto[, keep])
final_pred_auto <- rowMeans(pred_auto[, keep])

PRS <- cbind(obj.bigSNP$fam$family.ID,final_pred_auto)
colnames(PRS) <- c("ID","PGS")

beta_out <- cbind(info_snp, final_beta_auto)
write.table(beta_out, paste("Pred/",args[1],"_pred_auto_LOCO_",args[3],".out",sep=""), quote=F, row.names=F)
write.table(PRS, paste("Pred/",args[1],"_pred_auto_LOCO_",args[3],".PRS.out",sep=""), quote=F, row.names=F)

auto <- multi_auto[1](/declan93/PGS-LMM/wiki/1)
p <- plot_grid(
  qplot(y = auto$path_p_est) + 
    theme_bigstatsr() + 
    geom_hline(yintercept = auto$p_est, col = "blue") +
    scale_y_log10() +
    labs(y = "p"),
  qplot(y = auto$path_h2_est) + 
    theme_bigstatsr() + 
    geom_hline(yintercept = auto$h2_est, col = "blue") +
    labs(y = "h2"),
  ncol = 1, align = "hv"
)
ggsave(plot=p,filename=paste("Pred/",args[1],"_LOCO_",args[3],".png",sep=""), device="png")