library(lme4)
source("~/Documents/RStats/bruteAMOC.R")
### simulations to demo break-point modeling for discrete/gradient predictor categorization
## Laurel Brehm, November 2015
## (you'll need the above materials. you can download bruteAMOC from my website!)
## Simulated data
# random terms for subjects and items
## Simulated data: 40 subjects x 40 items = 1600 observations
# Random intercept terms for subjects and items
eS <- rnorm(40, sd = .1)
eI <- rnorm(40, sd = .1)
# Preallocate: d holds item/subject codes, the predictor, and the two DVs;
# p holds the per-trial binomial probability
d <- matrix(nrow = 1600, ncol = 5)
p <- vector(length = 1600)
for (k in 1:40) { # subjects
  sk <- runif(1, min = 12, max = 28)
  # Data generated from a 30-70 split with a break point that varies by
  # person between items 12 and 28. rep() truncates the fractional sk,
  # so the two runs together cover the 40 items used below.
  aa <- c(rep(.3, sk), rep(.7, 41 - sk))
  for (j in 1:40) { # items
    i <- (k - 1) * 40 + j # row index
    d[i, 1] <- paste('i', j, sep = ".") # item codes (column "item")
    d[i, 2] <- paste('s', k, sep = ".") # subject codes (column "subj")
    # Probability of the binomial outcome from predictor aa plus the
    # random terms, clamped to the [0, 1] range
    p[i] <- aa[j] + eS[k] + eI[j] + rnorm(1, sd = .05)
    p[i] <- min(max(p[i], 0), 1)
    # Add uniform noise to aa so it takes the whole range, and center it
    d[i, 3] <- aa[j] + runif(1, min = -.2, max = .2) - .5
    # Use the probability to draw the binomial DV
    d[i, 4] <- rbinom(1, 1, p[i])
    # Sum the terms, with some more noise, for the continuous DV
    d[i, 5] <- 1000 + aa[j] * 1000 + (eS[k] + eI[j]) * 1000 + rnorm(1, sd = 200)
  }
}
d <- as.data.frame(d)
colnames(d) <- c("item", "subj", "aa", "Resp", "RT")
# The matrix was character, so convert the numeric columns back
d$aa <- round(as.numeric(as.character(d$aa)), 2)
d$RT <- round(as.numeric(as.character(d$RT)), 0)
# Bug fix: Resp also came out of the character matrix as character;
# glmer(family = "binomial") needs a numeric 0/1 (or factor) response
d$Resp <- as.numeric(as.character(d$Resp))
### Demo: find a break point in the continuous DV with a latent categorical predictor
# The categorical model has one break point with a slope of 0 on either side;
# the continuous model has zero break points and a non-zero slope.
## Fit the comparison (unbroken) model first: continuous predictor aa with
## crossed random intercepts for subjects and items
RTComp <- lmer(RT ~ 1 + aa + (1 | subj) + (1 | item), data = d)
summary(RTComp)
# Next, fit a set of piecewise models via grid search for one break in the
# model. Here we look for breaks in factor aa in the middle 20% of the data
# (onset/offset are proportions -- confirm exact semantics against bruteAMOC.R).
# Note: the warning 'fixed-effect model matrix is rank deficient' is OK; the
# rank deficiency comes from the discontinuity we allowed in the piecewise model.
RTBreaks <- runAMOC(cmodel = RTComp, breakfactor = 'aa', onset = .4, offset = .6)
RTBreaks
# Re-run the best piecewise model and save it separately:
# take the model with the highest LL that meets all convergence criteria.
# NOTE(review): assumes element 5 of the runAMOC output stores the model
# call to be re-evaluated -- confirm against bruteAMOC.R
RTBroken <- eval(RTBreaks[5][[1]])
# Model coefficients, formatted for readability, with profile CIs appended
# (bug fix: TRUE instead of T -- T is a reassignable alias, TRUE is reserved)
labeledSummary(RTBroken, 'aa', getCI = TRUE)
# Reprint the original model to compare
summary(RTComp)
# Compared to the unbroken model, the broken model has a higher overall estimate
# of intercept and a lower overall estimate of predictor aa. That's because these
# effects are actually confined to the left half of the model -- there's a large
# and reliable adjustment to the intercept when aa is below the break point,
# and no reliable adjustment to the slope of aa. Together these show recovery of
# the categorical predictor we used to generate the data.
# Now perform model comparison to confirm, pitting the original model against
# the broken one: highest log likelihood and lowest BIC indicate the better
# model; confirm with a chi-square test on the log likelihood ratio
adjABIC(RTBroken, RTComp)
# by likelihood tests, the broken model is better too
###########
##### Demo: find a break point for the binomial DV
# The categorical-predictor model has one break point with a slope of 0 on
# either side; the continuous-predictor model has zero break points and a
# non-zero slope.
## Fit the comparison (unbroken) logistic model first
RespComp <- glmer(Resp ~ 1 + aa + (1 | subj) + (1 | item),
                  family = "binomial", data = d)
summary(RespComp)
# Next, fit a set of piecewise models via grid search for one break in the
# model. Here we look for breaks in factor aa in the middle 20% of the data.
# Note: the warning 'fixed-effect model matrix is rank deficient' is OK; the
# rank deficiency comes from the discontinuity we allowed in the piecewise model.
# glm generally throws more errors than lme -- especially with more variables --
# due to the logistic link function; an error-free model has Warnings=0 in the
# tabulated output.
# If using real data with a more complicated set of predictors, you'll probably
# need a fair number of observations (how many has not been fully investigated).
RespBreaks <- runAMOC(cmodel = RespComp, breakfactor = 'aa', onset = .4, offset = .6)
RespBreaks
# Get more info about any warnings that were thrown
warnings()
# Re-run the best piecewise model (with full convergence) and save it separately.
# NOTE(review): assumes element 5 of the runAMOC output stores the model
# call to be re-evaluated -- confirm against bruteAMOC.R
RespBroken <- eval(RespBreaks[5][[1]])
# Model coefficients, formatted for readability; set getCI = TRUE for profile
# CIs (warning: super slow code!).
# (bug fix: FALSE instead of F -- F is a reassignable alias, FALSE is reserved)
labeledSummary(RespBroken, 'aa', getCI = FALSE)
## Note that again, predictor aa clearly has a categorical effect on the outcome:
## it has an effect in only half of the data. This is confirmed in the broken
## model by the overall intercept (higher than unbroken), the overall estimate
## of aa (lower than unbroken), and the highly reliable adjustment to the
## intercept. The intercept differs between the two halves of the model -- we
## have recovered the categorical predictor the data were generated from.
# Now perform model comparison to confirm, pitting the original model against
# the broken one: highest log likelihood and lowest BIC indicate the better
# model; confirm with a chi-square test on the log likelihood ratio
adjABIC(RespBroken, RespComp)
# And the same with likelihood tests: the broken model fits the data better