FW: An R package for Genomic Pedigree and Spatial analysis using the Finlay Wilkinson Regression - lian0090/FW GitHub Wiki

The Finlay-Wilkinson Regression is a popular method among plant breeders to describe genotype by environment interaction. The standard implementation is a two-step procedure that uses environment (sample) means as known covariates in a within-line ordinary least squares (OLS) regression. This procedure can be suboptimal for at least four reasons: (i) in the first step environmental means are computed without considering genotype effects, (ii) in the second step, uncertainty about the environmental means is ignored, (ii) estimation is performed regarding lines and environment as fixed effects and (iv) the procedure does not incorporate genetic (either pedigree-derived or marker derived) relationships. Su et al. propose to address these problems using a Bayesian method that allows simultaneous estimation of environmental and genotype parameters, and allows incorporation of pedigree information. We extended the model presented by Su et al. to allow integration of genomic (e.g., SNP) and spatial information, and implemented both the OLS method and the Bayesian method in this package. For computational efficiency, the Gibbs method was implemented in compiled C code.

Installation

In R type the following command:

library(devtools)
install_github("lian0090/FW")

Loading the package and data

library(FW)
data(wheat)

Example analysis of wheat data set

attach(wheat.Y)
OLS=FW(y=y,VAR=VAR,ENV=ENV, method="OLS")
GibbsI=FW( y=y,VAR=VAR,ENV=ENV,   method="Gibbs",seed=12345,saveAt="GibbsI")
GibbsA=FW(y=y,VAR=VAR,ENV=ENV, method="Gibbs",A=wheat.G,seed=12345, saveAt="GibbsA")



#correlation between cell means of y and yhat for training data sets and validation data sets
whNA=which(is.na(yNA))
#cell means for training data and validation data
ymean_train=aggregate(y[-whNA],by=list(VAR[-whNA],ENV[-whNA]),mean)[,3]
ymean_vali=aggregate(y[whNA],by=list(VAR[whNA],ENV[whNA]),mean)[,3]
#yhat and cor(yhat, ymean) for lm1  
yhat_train=aggregate(lm1$yhat[-whNA],by=list(VAR[-whNA],ENV[-whNA]),mean)[,3]
yhat_vali=aggregate(lm1$yhat[whNA],by=list(VAR[whNA],ENV[whNA]),mean)[,3]
cor(ymean_train,yhat_train)
[1] 0.9397626
cor(yhat_vali,ymean_vali)
[1] 0.5454864
#yhat and cor(yhat, ymean) for lm2 
yhat_train=aggregate(lm2$yhat[-whNA],by=list(VAR[-whNA],ENV[-whNA]),mean)[,3]
yhat_vali=aggregate(lm2$yhat[whNA],by=list(VAR[whNA],ENV[whNA]),mean)[,3]
cor(yhat_train,ymean_train)
[1] 0.9332163
cor(ymean_vali,yhat_vali)
[1] 0.6714431

#yhat and cor(yhat, ymean) for lm3 
yhat_train=aggregate(lm3$yhat[-whNA],by=list(VAR[-whNA],ENV[-whNA]),mean)[,3]
yhat_vali=aggregate(lm3$yhat[whNA],by=list(VAR[whNA],ENV[whNA]),mean)[,3]
cor(ymean_train,yhat_train)
[1] 0.9225863
cor(yhat_vali,ymean_vali)
[1] 0.7001197

#To plot the fitted and observed variety performance across different environments
plot(lm1)

plot lm1

#To plot only a few lines
plot(lm1,plotVAR=c("V1","V4","V5","V9"))

plot of lm1 few line

Example 3 run Finlay-Wilkinson Regression by Gibbs Sampler with different hyper-parameters

vary=var(yNA,na.rm=T)
lm4=FW(y=yNA,VAR=VAR,ENV=ENV,df=1,dfg=1,dfb=1,dfh=1,seed=1,saveAt="example2/lm4")

lmer1=lmer(yNA~(1|VAR)+(1|ENV))
summary(lmer1)
lm5=FW(y=yNA,VAR=VAR,ENV=ENV,df=1,dfg=1,dfb=1,dfh=1,priorVar_e=0.36,priorVar_g=0.18,priorVar_h=0.74, priorVar_b=0.18,seed=1,saveAt="example2/lm5")


#yhat and cor(yhat, ymean) for lm4  
yhat_train=aggregate(lm4$yhat[-whNA],by=list(VAR[-whNA],ENV[-whNA]),mean)[,3]
yhat_vali=aggregate(lm4$yhat[whNA],by=list(VAR[whNA],ENV[whNA]),mean)[,3]
> cor(ymean_train,yhat_train)
[1]  0.932468
> cor(yhat_vali,ymean_vali)
[1] 0.6776913
#yhat and cor(yhat, ymean) for lm5 
yhat_train=aggregate(lm5$yhat[-whNA],by=list(VAR[-whNA],ENV[-whNA]),mean)[,3]
yhat_vali=aggregate(lm5$yhat[whNA],by=list(VAR[whNA],ENV[whNA]),mean)[,3]
> cor(ymean_train,yhat_train)
[1] 0.9296543
> cor(yhat_vali,ymean_vali)
[1] 0.690864
##fit more than one chain
lm6=FW(y=yNA,VAR=VAR,ENV=ENV,nchain=2,seed=c(1:2),saveAt="lm6")
lm7=FW(y=y,VAR=VAR,ENV=ENV,nchain=2,seed=c(1:2),saveAt="lm7")
lm8=FW(y=y,VAR=VAR,ENV=ENV,nchain=2,seed=c(1:2),saveAt="lm8",A=wheat.G50)

#yhat and cor(yhat, ymean) for lm6 
yhat_train=aggregate(lm6$yhat[-whNA,],by=list(VAR[-whNA],ENV[-whNA]),mean)[,-c(1:2)]
yhat_vali=aggregate(lm6$yhat[whNA,],by=list(VAR[whNA],ENV[whNA]),mean)[,-c(1:2)]
cor(ymean_train,yhat_train)
cor(yhat_vali,ymean_vali)
load('lm6samps.rda')
plot(samps)

plot of Bayesian samples