data encryption - reworkhow/JWAS.jl GitHub Wiki

JWAS (branch: encryption)

Raw data

data simulation

using JWAS,DataFrames,CSV,Statistics,Random

# Step 2: Read data 
n=50
p=100

genotypes_raw  = rand(MersenneTwister(1),[0.0,1.0,2.0],n,p)
genotypes_raw  = genotypes_raw.-mean(genotypes_raw,dims=1) #center
phenotypes_raw = DataFrame(ID=1:n,cg=[repeat(["cg1"],25);repeat(["cg2"],25)],batch=[repeat(["batch1"],20);repeat(["batch2"],30)],y_raw=rand(MersenneTwister(2),n))
phenotypes_raw[:,:y_raw]=phenotypes_raw[:,:y_raw].-mean(phenotypes_raw[:,:y_raw])  #center

run JWAS

G=0.001
genotypes_raw  = get_genotypes(genotypes_raw,G;G_is_marker_variance = true,
                              method="RR-BLUP",center=false,quality_control=false);

# Step 3: Build Model Equations
model_equation_raw  ="y_raw = cg + batch + genotypes_raw"
R=1.0
model_raw = build_model(model_equation_raw,R);

set_random(model_raw,"cg");
set_random(model_raw,"batch");

# Step 6: Run Analysis
out_raw=runMCMC(model_raw,phenotypes_raw,double_precision=true,chain_length=50000);

Encrypted data

data simulation

  • The names of the encrypted incidence matrix must be "name_i". For example, for "batch" with 2 levels, the names must be "batch_1","batch_2".
n=50
p=100

genotypes_raw  = rand(MersenneTwister(1),[0.0,1.0,2.0],n,p)
genotypes_raw  = genotypes_raw.-mean(genotypes_raw,dims=1) #center
phenotypes_raw = DataFrame(ID=1:n,cg=[repeat(["cg1"],25);repeat(["cg2"],25)],y_raw=rand(MersenneTwister(2),n))
phenotypes_raw[:,:y_raw]=phenotypes_raw[:,:y_raw].-mean(phenotypes_raw[:,:y_raw])  #center
phenotypes_raw[:,:y_raw]'phenotypes_raw[:,:y_raw]

#enctyption
P=lq(rand(MersenneTwister(3),n,n)).Q
P=Matrix(P)  #get orthogonal matrix from QR decomposition

genotypes_encrypted=P*genotypes_raw
incidence_matrix1=[ones(25)  zeros(25) 
                  zeros(25) ones(25)] 
incidence_matrix2=[ones(20) zeros(20)
                   zeros(30) ones(30)]
incidence_matrix=[incidence_matrix1 incidence_matrix2]

phenotypes_encrypted=DataFrame(ID=1:n,cg_1=P*incidence_matrix[:,1],cg_2=P*incidence_matrix[:,2],
                              batch_1=P*incidence_matrix[:,3],batch_2=P*incidence_matrix[:,4],
                              y_encrypted=P*phenotypes_raw[:,:y_raw])

Run JWAS

  • set encryption=true in runMCMC()
G=0.001
genotypes_encrypted  = get_genotypes(genotypes_encrypted,G;G_is_marker_variance = true,
                                     method="RR-BLUP",center=false,quality_control=false);

# Step 3: Build Model Equations
model_equation_encrypted  ="y_encrypted = cg + batch + genotypes_encrypted"
R=1.0
model_encrypted = build_model(model_equation_encrypted,R);

set_random(model_encrypted,"cg");
set_random(model_encrypted,"batch");

# Step 6: Run Analysis
out_encrypted=runMCMC(model_encrypted,phenotypes_encrypted,encryption=true,double_precision=true,chain_length=50000);

Compare results from raw v.s. encrypted

ebv_raw=out_raw["EBV_y_raw"][:,:EBV]
ebv_encrypted=out_encrypted["EBV_y_encrypted"][:,:EBV]
ebv_encrypted=P'*ebv_encrypted

cor(marker_effect_raw,marker_effect_encrypted)  #0.9775076
cor(ebv_raw,ebv_encrypted)                      #0.9924604
#also similar residual variance, genetic varance

using Plots
scatter(marker_effect_raw,marker_effect_encrypted)