#### Predictive model using stratified k-fold cross validation:
## split the data into k sections and run models holding one out at a time
## assess how models generalize to the remaining fold
## stratified = make sure folds are equal across predictors of interest
## runs various combinations of parameters for multiple repetitions
## set up packages
library(lme4)
source('I:/shared/Laurel/PartnerNaming/scripts/crossvalidation_funcs.R')
######## Specify Inputs #######
##read in data. to calculate MSPE, all NA values in predictors (or DVs) should be omitted.
## Load the voice dataset (header row expected). header=TRUE (not the
## reassignable shorthand T) per R best practice; behavior unchanged.
vs <- read.table('I:/shared/Laurel/PartnerNaming/pnp/analysis/pnp_voiceData.txt',header=TRUE)
## a list of all models to run over folds, including the basic one.
## left column is the name of the model (user specified), and the right column is the parameters to add in.
## here, I am running 3 series of models-- the first 7 get compared to the intercept only base model.
## the next 2 sets of 7 get compared to the first 7, respectively. each has an added parameter.
## to make it easy to pull these last ones out, I started all their names with the same character
## Lookup table of candidate models. Column "model" is the user-chosen
## name; column "added_params" holds the fixed-effect terms appended to
## the base formula. Three families of 7 models: plain, occluded-added
## ("O" prefix) and context-added ("Bl" prefix); the name prefixes are
## used later to pull each family out of the results table.
mL <- matrix(
  c("intercept",       "",
    "a_on_m",          "a_on",
    "a_off_m",         "a_off",
    "r_a_on_m",        "rolling_a_on",
    "r_a_off_m",       "rolling_a_off",
    "a_on_off_m",      "a_on + a_off",
    "r_a_on_of_m",     "rolling_a_on + rolling_a_off",
    "Occl_m",          "occluded",
    "O_a_on_m",        "a_on + occluded",
    "O_a_off_m",       "a_off + occluded",
    "O_a_on_off_m",    "a_on + a_off + occluded",
    "O_r_a_on_m",      "rolling_a_on + occluded",
    "O_r_a_off_m",     "rolling_a_off + occluded",
    "O_r_a_on_off_m",  "rolling_a_on + rolling_a_off + occluded",
    "Bl_m",            "context",
    "Bl_a_on_m",       "a_on + context",
    "Bl_a_off_m",      "a_off + context",
    "Bl_a_on_off_m",   "a_on + a_off + context",
    "Bl_r_a_on_m",     "rolling_a_on + context",
    "Bl_r_a_off_m",    "rolling_a_off + context",
    "Bl_r_a_on_off_m", "rolling_a_on + rolling_a_off + context"),
  ncol = 2, byrow = TRUE,
  dimnames = list(NULL, c("model", "added_params"))
)
#### Run Function ####
## first three arguments are specified above:
## 1. the data set to use, called 'vs' -- this must have that name.
## then 2. the base model to compare to and 3. the lookup table for the models to run, containing two columns
## if base model is contained in lookup table, will compare MSPE for all models to it
## if not, that column will be NA in output
## stratified_by is the column or columns in data set to divide out equally across cross-validation folds.
## dv is the common dependent measure for all models
## reps = repetitions of cross-validation procedure
## folds = number of divisions of data to cross-validate over. 10 is a good number.
## output is a list with two tables in it
## [[1]]: output table of BIC, MSPE for each model and the comparisons to specified comparison model
## [[2]]: the results from the last model run. Useful for plotting comparison to actual data
## options for stratified_by: NA (for no stratifying), a single variable, or a list of variables
## model type and formula for simplest, most basic model to add stuff to
## model needs to have at least one + for this code to work. We will add the extra params after the final +
## Intercept-only base model for the turn-gap DV (b_offA), with a
## random intercept per subject. Per the note above, the formula must
## contain at least one '+': extra parameters are appended after it.
mB <- lmer(b_offA ~ 1 + (1|subj), data = vs)
# mB <- lmer(evl ~ 1 + (1|subj), data = vs)  # alternative DV (eye-voice lead)
# In the works: a parallel version of the code... not yet fully implemented.
os <- cvLmer(
  vs,
  mBase         = mB,
  modelLookup   = mL,
  stratified_by = c("a_syl", "context", "occluded"),
  dv            = "b_offA",
  reps          = 1,
  folds         = 10
)
outa <- os[[1]]  # comparison table (BIC / MSPE and deltas per model)
vs2  <- os[[2]]  # data from the last model run, useful for plotting
### for eye-voice lead
## to omit all rows with NA values:
## Drop every row containing an NA so the MSPE is computable for evl.
vs <- na.omit(vs)
## Intercept-only base model for eye-voice lead, random intercept per subject.
mB <- lmer(evl ~ 1 + (1|subj), data = vs)
os <- cvLmer(
  vs,
  mBase         = mB,
  modelLookup   = mL,
  stratified_by = c("a_syl", "context", "occluded"),
  dv            = "evl",
  reps          = 500,
  folds         = 10
)
outb <- os[[1]]  # comparison table for the evl models
vs2  <- os[[2]]  # data + predictions from the last model run
# #### DOES NOT YET WORK
# ## parallel version for multiple runs. requires the libraries doParallel and foreach.
# ##this computer has 4 cores, so use 3 for simulating
# os <- cvLmerP(vs,mBase=mB,modelLookup=mL,
# stratified_by= c("a_syl", "context", "occluded"),
# dv="flB",
# reps=500,folds=10,NCores=3)
# out <- os[[1]]
# vs2 <- os[[2]]
### Glmer version: thinking about whether people look at unnecessary pic
## answer: well, (1) they don't, and (2) we can't predict it well.
## The dv atleast1 is cases where there was at least 1 look to the prompt pic A-- analyze with binomial regression
## The outputs in out[[1]] include BIC and the correct classification rate for held-out fold.
## The outputs added to the original data set in out[[2]] are predicted values on the default log-odds scale
## Lookup table for the binomial models: same two-column layout as mL.
mL2 <- rbind(
  c("intercept", ""),
  c("a_on_m",    "a_on"),
  c("a_off_m",   "a_off"),
  c("a_onoff_m", "a_on + a_off")
)
## FIX: this line originally read colnames(mL) <- ..., which re-named the
## columns of the *first* lookup table and left mL2 with no column names
## (the "model"/"added_params" names that cvGlmer presumably looks up --
## confirm against crossvalidation_funcs.R).
colnames(mL2) <- c("model", "added_params")
## Intercept-only binomial base model for atleast1A (was there at least
## one look to prompt picture A), random intercept per subject.
mB2 <- glmer(atleast1A ~ 1 + (1|subj), data = vs, family = 'binomial')
os <- cvGlmer(vs, mBase = mB2, modelLookup = mL2,
              stratified_by = "subj",
              dv = "atleast1A",
              reps = 2, folds = 10, fam = 'binomial')
out <- os[[1]]  # BIC + correct classification rate for held-out folds
vs2 <- os[[2]]  # data + predicted values on the default log-odds scale
### I have not yet tested this code for other model families (like poisson)-- but I *think* it should work.
## The function just passes the argument directly on to the family in glmer()
#### For this paper, I have three sets of comparisons to make for whichever DV. I'm going to print them separately ####
## the first are comparisons to the pre-specified 'base' model.
## pull these out based on how I have named them. the consistency is that the ones that need to be pulled out start with the same character, 'O'
## and these ones all don't start with O
## pull them out, printing in order of the delta BIC, which has to be coerced into a number
## Comparison set 1: models whose names start with neither 'O' (occluded)
## nor 'B' (context/block) -- these compare against the intercept base.
## NOTE(review): `out` is whichever results table was assigned last
## (here the glmer run); reassign out <- outa / outb to inspect those runs.
o1 <- out[substr(out$model,1,1)!='O' & substr(out$model,1,1)!='B',]
## print sorted by descending delta BIC (stored non-numerically, so coerce)
o1[order(-(as.numeric(as.character(o1$DeltaBIC)))),]
## Comparison set 2: the occluded models (names start with 'O'). Each is
## compared to the o1 model with the same spec minus 'occluded'; every
## 'O' model has exactly one such counterpart, so rows pair up 1-to-1.
o2<- out[substr(out$model,1,1)=='O',]
## model spec with the 'occluded' parameter stripped out
o2$c_added_params <- gsub("occluded","",o2$added_params)
## also drop the trailing " + " left behind by the substitution
o2$c_added_params <- gsub(" \\+ $","",o2$c_added_params)
## sort both frames on the (stripped) spec so rows align pairwise
o1 <- o1[order(o1$added_params),]
o2 <- o2[order(o2$c_added_params),]
## recompute the deltas against the matched o1 model.
## NOTE(review): positional indexing assumes columns 3/5 are BIC/MSPE and
## 4/6 their deltas -- confirm against the cvLmer/cvGlmer output layout.
o2[,4]<- as.numeric(as.character(o1[,3]))-as.numeric(as.character(o2[,3]))
o2[,6]<- as.numeric(as.character(o1[,5]))-as.numeric(as.character(o2[,5]))
## print ordered by descending corrected delta BIC
o2[order(-(as.numeric(as.character(o2$DeltaBIC)))),]
## the block ones all do start with B
## for these, I actually want to compare them to the models in o1 that don't have the parameter 'context'
## do as above
## Comparison set 3: the context/block models (names start with 'B'),
## each compared to the o1 model with the same spec minus 'context'.
o3<- out[substr(out$model,1,1)=='B',]
## model spec with the 'context' parameter stripped out
o3$c_added_params <- gsub("context","",o3$added_params)
## also drop the trailing " + " left behind by the substitution
o3$c_added_params <- gsub(" \\+ $","",o3$c_added_params)
## sort both frames on the (stripped) spec so rows align pairwise
## (o1 was already sorted this way above; the re-sort is a harmless no-op)
o1 <- o1[order(o1$added_params),]
o3 <- o3[order(o3$c_added_params),]
## recompute the deltas against the matched o1 model.
## NOTE(review): positional indexing assumes columns 3/5 are BIC/MSPE and
## 4/6 their deltas -- confirm against the cvLmer/cvGlmer output layout.
o3[,4]<- as.numeric(as.character(o1[,3]))-as.numeric(as.character(o3[,3]))
o3[,6]<- as.numeric(as.character(o1[,5]))-as.numeric(as.character(o3[,5]))
## print ordered by descending corrected delta BIC
o3[order(-(as.numeric(as.character(o3$DeltaBIC)))),]
#### Model fitting has allowed me to choose the model called "O_a_on_off_m" as the best ####
## It has the lowest overall BIC and MSPE.
## For this model, calculate some fit diagnostics:
## get the quantiles of the original data
## Fit diagnostics for the chosen model, split by region of the DV.
## The 5th/95th quantiles of the observed data define the middle 90%.
qs <- quantile(vs2$b_offA, c(0.05, .95))
in_mid   <- vs2$b_offA > qs[1] & vs2$b_offA < qs[2]
in_left  <- vs2$b_offA < qs[1]
in_right <- vs2$b_offA > qs[2]
## middle 90%: mean of the per-row error column (presumably squared
## prediction errors, i.e. MSPE -- confirm against cvLmer), then the
## mean of its square roots
mean(vs2[in_mid, ]$O_a_on_off_mErr)
mean(sqrt(vs2[in_mid, ]$O_a_on_off_mErr))
## same two summaries for the left tail
mean(vs2[in_left, ]$O_a_on_off_mErr)
mean(sqrt(vs2[in_left, ]$O_a_on_off_mErr))
## and for the right tail
mean(vs2[in_right, ]$O_a_on_off_mErr)
mean(sqrt(vs2[in_right, ]$O_a_on_off_mErr))
### plot the best model by fold from the outputs saved from last run of the model, using ggplot:
library(ggplot2)
## Fold id as a factor so shape/linetype/colour can vary per fold.
vs2$Fold <- as.factor(vs2$subset)
## colour ramp from cyan to MPI green, one colour per fold
fold_cols <- colorRampPalette(c("cyan","#007869"))(max(vs2$subset))
## Predicted vs observed turn gap, one smooth per cross-validation fold;
## the unit diagonal marks perfect prediction.
ggplot(vs2, aes(x = b_offA, y = O_a_on_off_m,
                pch = Fold, lty = Fold, color = Fold)) +
  geom_abline(intercept = 0, slope = 1) +
  geom_point(alpha = .3) +
  geom_smooth(se = FALSE) +
  scale_x_continuous("Observed Turn Gap",
                     limits = c(-500, 1150),
                     breaks = seq(-400, 1100, 100)) +
  scale_y_continuous("Predicted Turn Gap",
                     limits = c(-500, 1150),
                     breaks = seq(-400, 1100, 100)) +
  theme_bw() +
  scale_color_manual(values = fold_cols) +
  ## only 6 distinct open shapes are used, so they recycle over 10 folds
  scale_shape_manual(values = c(1, 2, 3, 4, 5, 6, 1, 2, 3, 4)) +
  theme(panel.grid.major = element_blank(),
        panel.grid.minor = element_blank()) +
  guides(colour = guide_legend(override.aes = list(alpha = 1)))