#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
#Genes & SEM workshop - Day 3
#GRM-SEM #R practical
#Beate St Pourcain
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
#Software installation (do not run):
#devtools::install_git('https://gitlab.gwdg.de/beate.stpourcain/grmsem')
#Note that this installation is not fully identical with the current one as we use R.4.0.3
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
#Download data and vignette for workshop:
#https://gitlab.gwdg.de/beate.stpourcain/genesandsem
#Linux code
git clone https://gitlab.gwdg.de/beate.stpourcain/genesandsem
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
#Start vanilla R
cd genesandsem
R
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
#R
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
rm(list = ls())
require(ucminf)
require(numDeriv)
require(msm)
require(optimParallel)
require(stats)
require(grmsem)
require(lavaan)
require(Matrix)
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
#Step 0: Load and inspect data
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
#Load small dataset:
# 6 phenotypes, 2 uncorrelated factors, 5000 causal SNPs, 60 individuals
load("DATA_Y6_2f_j5000_N60_a+e_muzero.5.RData") #2 factors, 6 min, uncorrelated factorsv 0.09, best choice
# Inspect similated genetic and resodual covariance
ls()
#Dimensions of the GRM matrix G
dim(G)
#Dimension of the six-variate phenotype ph (Y1 to Y6)
dim(ph)
summary(ph)
#Simulated genetic variance A
Sigma.A
#Simulated residual variance E
Sigma.E
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
#Step 1: Fit Cholesky model to the data (saturated model)
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
set.seed(123456)
cores<-1
#Time the fit
TIME1<-proc.time()
#Fitting the model
out <- grmsem.fit(ph, G, model="Cholesky", LogL = TRUE, estSE = TRUE, estmu = FALSE)
out
TIME2<-proc.time() - TIME1
TIME2
#Estimate the variance based on unstandardised factor loadings
var.out<-grmsem.var(out)
print(var.out)
#Estimate absolute measure of fit
out.chol.srmr <- grmsem.srmr(ph,var.out)
out.chol.srmr
#Standardise factor loadings so that phenotype sigma^2=1
stout<-grmsem.stpar(out)
print(stout)
#Estimate the variance based on standardised factor loadings
stvar.out <- grmsem.var(stout)
print(stvar.out)
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
#Step 2: Estimate principal components and predict factor structure with EFA
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
#Genetic part
## Dissect predicted correlation matrix from the Cholesky model
corrMat <- var.out$RG
colnames(corrMat) <- rownames(corrMat)
## Spectral decomposition of the correlation matrix to calculate eigenvalues 
pca<-eigen(corrMat)
IPC.n.AC <- ifelse(length(which(pca$values >= 1))>1,length(which(pca$values >= 1)),1)
IPC.n.AC
#Q1: How many genetic eigenvalues are in the data?
## Covariance matrix
covMat <- var.out$VA
colnames(covMat) <- rownames(covMat)
#Nearest positive definite matrix
covMat.smooth <- as.matrix((nearPD(covMat))$mat)
# Weight matrix for DWLS (diagonally weighted least square)
wMat.tmp <- 1/((var.out$VA.se)^2) 
wMat <- diag(diag(wMat.tmp))
colnames(wMat) <- rownames(wMat) <- rownames(covMat)
wMat
# use lavaan to fit EFA with DWLS weight matrix: based on number eigenvalues IPC.n.AC
twofactormodel <- '
efa("efa")*f1 + efa("efa")*f2 =~ Y1 + Y2 + Y3 + Y4 + Y5 + Y6
     f1~~0*f2
     f1~~1*f1
     f2~~1*f2
 '
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#Full weight matrix - DWLS - orthogonal rotation
efa.fit <- cfa(twofactormodel, sample.cov = covMat.smooth, rotation = "varimax", estimator='DWLS', WLS.V = wMat, information = "expected", sample.nobs = out$n.ind, rotation.args = list(orthogonal = TRUE))
sum.efa.fit<-summary(efa.fit)
#Observed sample covariance
efa.fit.obs <- lavInspect(efa.fit, "observed")
efa.fit.obs
#Fitted sample covariance
efa.fit.fitted <- lavInspect(efa.fit, "fitted")
efa.fit.fitted
#Note that the assessment of the EFA model fit may not be meaningful as estimated genetic covariance may have negative 
#residual covariance  #due to estimation error
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#Full weight matrix - DWLS - oblique rotation
efa.fit.oblimin <- cfa(twofactormodel, sample.cov = covMat.smooth, rotation = "oblimin", estimator='DWLS', WLS.V = wMat, information = "expected", sample.nobs = out$n.ind, rotation.args = list(orthogonal = FALSE))
summary(efa.fit.oblimin)
#Observed sample covariance
efa.fit.oblimin.obs <- lavInspect(efa.fit.oblimin, "observed")
efa.fit.oblimin.obs
#Fitted sample covariance
efa.fit.oblimin.fitted <- lavInspect(efa.fit.oblimin, "fitted")
efa.fit.oblimin.fitted
#Q2) What are the correlations between the genetic factors predicted by efa.fit.oblimin? Should you fit the correlation between genetic factors with GRM-SEM?
#Q3) What model could you fit to prove that you don't need to fit the correlations?
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
#Step 3: Fit IPC model 
#Define starting values and constraints
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
#Define starting values
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#Specific genetic variance
IPC.AS <- diag(efa.fit@Model@GLIST$theta)
#Set Heywood cases to zero
IPC.AS <- ifelse(IPC.AS<0,0,IPC.AS)
#Define starting values for specific genetic factor loadings based on EFA (lambda) and sqrt specific genetic variance
IPC.A.v.startval <- c(c(efa.fit@Model@GLIST$lambda),c(sqrt(IPC.AS)))
IPC.A.v.startval
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#Define constraints
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#Set GRM-SEM factor loadings for EFA common genetic factor loadings < 0.1 to zero (cut-off)
cutoff<- 0.1
IPC.A.v.free.common <- ifelse(c(abs(efa.fit@Model@GLIST$lambda))>cutoff,1,0)
IPC.A.v.free.common
#Fit all EFA specific genetic factor loadings 
IPC.A.v.free.specific <- rep(1, length(rownames(covMat))) #i.e. number of phenotypes
IPC.A.v.free <- c(IPC.A.v.free.common,IPC.A.v.free.specific)
IPC.A.v.free
#Set all starting values for constrained factor loadings to 0
IPC.A.v.startval <- replace(IPC.A.v.startval, (IPC.A.v.free==0), 0)
IPC.A.v.startval
#Residual starting values
IPC.E.v.startval <- out$model.out[out$model.in$part=="E",]$estimates
print(IPC.E.v.startval)
#Fit IPC model
model.IPC <- "IPC"
TIME3<-proc.time() 
out.ipc<-grmsem.fit(ph, G, model = model.IPC, cores = cores, n.AC = IPC.n.AC, A.v.free = IPC.A.v.free, A.v.startval = round(IPC.A.v.startval,2), E.v.startval = IPC.E.v.startval, LogL = TRUE, estSE = TRUE, estmu = FALSE)
print(out.ipc)
TIME4<-proc.time() - TIME3
TIME4
#Estimate the variance based on unstandardised factor loadings
var.out.ipc<-grmsem.var(out.ipc)
print(var.out.ipc)
#Estimate absolute measure of fit
out.ipc.srmr <- grmsem.srmr(ph,var.out.ipc)
out.ipc.srmr
#Q4) Based on SRMR, is the IPC model a good fit?
#Standardise factor loadings so that phenotype sigma^2=1
stout.ipc<-grmsem.stpar(out.ipc)
print(stout.ipc)
#Estimate the variance based on standardised factor loadings
stvar.out.ipc <- grmsem.var(stout.ipc)
print(stvar.out.ipc)
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
#Step 4: Evaluate the relative goodness of fit 
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
#Saturated model
out$model.fit$Chi
out$model.fit$LL
out$model.fit$LL.AIC
out$model.fit$LL.BIC
out$model.fit$LL.c.AIC
out$model.fit$LL.c.BIC
#IPC model
out.ipc$model.fit$Chi
out.ipc$model.fit$LL
out.ipc$model.fit$LL.AIC
out.ipc$model.fit$LL.BIC
out.ipc$model.fit$LL.c.AIC
out.ipc$model.fit$LL.c.BIC
#Likelihood ratio test (LRT) Cholesky versus IPC
# LR = -2 ln (L_simple / L_saturated)
# LR = -2 (LL_simple - LL_saturated)
#Simplification
LR <- (out.ipc$model.fit$Chi - out$model.fit$Chi)
LR
#LL model formula
LR2 <- -2*(out.ipc$model.fit$LL - out$model.fit$LL)
LR2
# difference in df
df.diff <- out.ipc$model.fit$df - out$model.fit$df
# LRT p-value
LRT.p <- pchisq(LR, df.diff, lower.tail = FALSE)
LRT.p
#Q5 Extract AIC and BIC and SRMR indices for Cholesky and IPC models? What is your conclusion? 
#Q6 Now inspect the "genuine" genetic and residual covariance matrices and compare with the genetic variance predicted by the Cholesky (var.out$VA) and the IPC model.
#What is your conclusion, especially taking Q5 into account?
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
#Step 5: Assess the prediction accuracy of EFA with GRM-SEM factor loadings with ?
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
cor_efa_grmsem <- cor(IPC.A.v.startval,out.ipc$model.out$estimates[out.ipc$model.in$part == "A"], use="pairwise.complete.obs")
cor_efa_grmsem
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
#Step 6: What can we do with the model?
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
#Factorial co-heritability:  proportion of genetic trait variance explained by a genetic factor
grmsem.fcoher(out.ipc)
grmsem.fcoher(out.ipc)$fcoher.model.out[,c(1,7:10)]
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#