#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~# #Genes & SEM workshop - Day 3 #GRM-SEM #R practical #Beate St Pourcain #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~# #Software installation (do not run): #devtools::install_git('https://gitlab.gwdg.de/beate.stpourcain/grmsem') #Note that this installation is not fully identical with the current one as we use R.4.0.3 #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~# #Download data and vignette for workshop: #https://gitlab.gwdg.de/beate.stpourcain/genesandsem #Linux code git clone https://gitlab.gwdg.de/beate.stpourcain/genesandsem #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~# #Start vanilla R cd genesandsem R #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~# #R #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~# rm(list = ls()) require(ucminf) require(numDeriv) require(msm) require(optimParallel) require(stats) require(grmsem) require(lavaan) require(Matrix) #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~# #Step 0: Load and inspect data #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~# #Load small dataset: # 6 phenotypes, 2 uncorrelated factors, 5000 causal SNPs, 60 individuals load("DATA_Y6_2f_j5000_N60_a+e_muzero.5.RData") #2 factors, 6 min, uncorrelated factorsv 0.09, best choice # Inspect similated genetic and resodual covariance ls() #Dimensions of the GRM matrix G dim(G) #Dimension of the six-variate phenotype ph (Y1 to Y6) dim(ph) summary(ph) #Simulated genetic variance A Sigma.A #Simulated residual variance E Sigma.E #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~# #Step 1: Fit Cholesky model to the data (saturated model) #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~# set.seed(123456) cores<-1 #Time the fit TIME1<-proc.time() #Fitting the model out <- grmsem.fit(ph, G, model="Cholesky", LogL = TRUE, estSE = TRUE, estmu = FALSE) out TIME2<-proc.time() - TIME1 TIME2 #Estimate the variance based on unstandardised factor loadings var.out<-grmsem.var(out) print(var.out) #Estimate absolute measure of fit out.chol.srmr <- grmsem.srmr(ph,var.out) out.chol.srmr #Standardise factor loadings so that phenotype sigma^2=1 stout<-grmsem.stpar(out) print(stout) #Estimate the variance based on standardised factor loadings stvar.out <- grmsem.var(stout) print(stvar.out) #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~# #Step 2: Estimate principal components and predict factor structure with EFA #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~# #Genetic part ## Dissect predicted correlation matrix from the Cholesky model corrMat <- var.out$RG colnames(corrMat) <- rownames(corrMat) ## Spectral decomposition of the correlation matrix to calculate eigenvalues pca<-eigen(corrMat) IPC.n.AC <- ifelse(length(which(pca$values >= 1))>1,length(which(pca$values >= 1)),1) IPC.n.AC #Q1: How many genetic eigenvalues are in the data? ## Covariance matrix covMat <- var.out$VA colnames(covMat) <- rownames(covMat) #Nearest positive definite matrix covMat.smooth <- as.matrix((nearPD(covMat))$mat) # Weight matrix for DWLS (diagonally weighted least square) wMat.tmp <- 1/((var.out$VA.se)^2) wMat <- diag(diag(wMat.tmp)) colnames(wMat) <- rownames(wMat) <- rownames(covMat) wMat # use lavaan to fit EFA with DWLS weight matrix: based on number eigenvalues IPC.n.AC twofactormodel <- ' efa("efa")*f1 + efa("efa")*f2 =~ Y1 + Y2 + Y3 + Y4 + Y5 + Y6 f1~~0*f2 f1~~1*f1 f2~~1*f2 ' #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #Full weight matrix - DWLS - orthogonal rotation efa.fit <- cfa(twofactormodel, sample.cov = covMat.smooth, rotation = "varimax", estimator='DWLS', WLS.V = wMat, information = "expected", sample.nobs = out$n.ind, rotation.args = list(orthogonal = TRUE)) sum.efa.fit<-summary(efa.fit) #Observed sample covariance efa.fit.obs <- lavInspect(efa.fit, "observed") efa.fit.obs #Fitted sample covariance efa.fit.fitted <- lavInspect(efa.fit, "fitted") efa.fit.fitted #Note that the assessment of the EFA model fit may not be meaningful as estimated genetic covariance may have negative #residual covariance #due to estimation error #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #Full weight matrix - DWLS - oblique rotation efa.fit.oblimin <- cfa(twofactormodel, sample.cov = covMat.smooth, rotation = "oblimin", estimator='DWLS', WLS.V = wMat, information = "expected", sample.nobs = out$n.ind, rotation.args = list(orthogonal = FALSE)) summary(efa.fit.oblimin) #Observed sample covariance efa.fit.oblimin.obs <- lavInspect(efa.fit.oblimin, "observed") efa.fit.oblimin.obs #Fitted sample covariance efa.fit.oblimin.fitted <- lavInspect(efa.fit.oblimin, "fitted") efa.fit.oblimin.fitted #Q2) What are the correlations between the genetic factors predicted by efa.fit.oblimin? Should you fit the correlation between genetic factors with GRM-SEM? #Q3) What model could you fit to prove that you don't need to fit the correlations? #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~# #Step 3: Fit IPC model #Define starting values and constraints #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~# #Define starting values #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #Specific genetic variance IPC.AS <- diag(efa.fit@Model@GLIST$theta) #Set Heywood cases to zero IPC.AS <- ifelse(IPC.AS<0,0,IPC.AS) #Define starting values for specific genetic factor loadings based on EFA (lambda) and sqrt specific genetic variance IPC.A.v.startval <- c(c(efa.fit@Model@GLIST$lambda),c(sqrt(IPC.AS))) IPC.A.v.startval #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #Define constraints #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #Set GRM-SEM factor loadings for EFA common genetic factor loadings < 0.1 to zero (cut-off) cutoff<- 0.1 IPC.A.v.free.common <- ifelse(c(abs(efa.fit@Model@GLIST$lambda))>cutoff,1,0) IPC.A.v.free.common #Fit all EFA specific genetic factor loadings IPC.A.v.free.specific <- rep(1, length(rownames(covMat))) #i.e. number of phenotypes IPC.A.v.free <- c(IPC.A.v.free.common,IPC.A.v.free.specific) IPC.A.v.free #Set all starting values for constrained factor loadings to 0 IPC.A.v.startval <- replace(IPC.A.v.startval, (IPC.A.v.free==0), 0) IPC.A.v.startval #Residual starting values IPC.E.v.startval <- out$model.out[out$model.in$part=="E",]$estimates print(IPC.E.v.startval) #Fit IPC model model.IPC <- "IPC" TIME3<-proc.time() out.ipc<-grmsem.fit(ph, G, model = model.IPC, cores = cores, n.AC = IPC.n.AC, A.v.free = IPC.A.v.free, A.v.startval = round(IPC.A.v.startval,2), E.v.startval = IPC.E.v.startval, LogL = TRUE, estSE = TRUE, estmu = FALSE) print(out.ipc) TIME4<-proc.time() - TIME3 TIME4 #Estimate the variance based on unstandardised factor loadings var.out.ipc<-grmsem.var(out.ipc) print(var.out.ipc) #Estimate absolute measure of fit out.ipc.srmr <- grmsem.srmr(ph,var.out.ipc) out.ipc.srmr #Q4) Based on SRMR, is the IPC model a good fit? #Standardise factor loadings so that phenotype sigma^2=1 stout.ipc<-grmsem.stpar(out.ipc) print(stout.ipc) #Estimate the variance based on standardised factor loadings stvar.out.ipc <- grmsem.var(stout.ipc) print(stvar.out.ipc) #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~# #Step 4: Evaluate the relative goodness of fit #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~# #Saturated model out$model.fit$Chi out$model.fit$LL out$model.fit$LL.AIC out$model.fit$LL.BIC out$model.fit$LL.c.AIC out$model.fit$LL.c.BIC #IPC model out.ipc$model.fit$Chi out.ipc$model.fit$LL out.ipc$model.fit$LL.AIC out.ipc$model.fit$LL.BIC out.ipc$model.fit$LL.c.AIC out.ipc$model.fit$LL.c.BIC #Likelihood ratio test (LRT) Cholesky versus IPC # LR = -2 ln (L_simple / L_saturated) # LR = -2 (LL_simple - LL_saturated) #Simplification LR <- (out.ipc$model.fit$Chi - out$model.fit$Chi) LR #LL model formula LR2 <- -2*(out.ipc$model.fit$LL - out$model.fit$LL) LR2 # difference in df df.diff <- out.ipc$model.fit$df - out$model.fit$df # LRT p-value LRT.p <- pchisq(LR, df.diff, lower.tail = FALSE) LRT.p #Q5 Extract AIC and BIC and SRMR indices for Cholesky and IPC models? What is your conclusion? #Q6 Now inspect the "genuine" genetic and residual covariance matrices and compare with the genetic variance predicted by the Cholesky (var.out$VA) and the IPC model. #What is your conclusion, especially taking Q5 into account? #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~# #Step 5: Assess the prediction accuracy of EFA with GRM-SEM factor loadings with ? #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~# cor_efa_grmsem <- cor(IPC.A.v.startval,out.ipc$model.out$estimates[out.ipc$model.in$part == "A"], use="pairwise.complete.obs") cor_efa_grmsem #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~# #Step 6: What can we do with the model? #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~# #Factorial co-heritability: proportion of genetic trait variance explained by a genetic factor grmsem.fcoher(out.ipc) grmsem.fcoher(out.ipc)$fcoher.model.out[,c(1,7:10)] #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#