library(plyr)
library(dplyr)
##
## Attaching package: 'dplyr'
##
## The following objects are masked from 'package:plyr':
##
## arrange, desc, failwith, id, mutate, summarise, summarize
##
## The following objects are masked from 'package:stats':
##
## filter, lag
##
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(mgcv)
## Loading required package: nlme
##
## Attaching package: 'nlme'
##
## The following object is masked from 'package:dplyr':
##
## collapse
##
## This is mgcv 1.7-29. For overview type 'help("mgcv-package")'.
The data:
nwords
= total number of words in speaker’s textgridUH
and UM
= number of “uhs” and “ums”sex
= sex of speakeryear
and age
= year of interview and age at interviewethnicity
= … too many single character codes to deal withschooling
= (estimated) years of schoolingtranscribed
= how many seconds of transcribed interview there istotal
= total duration of interview in secondsnvowels
= number of primary stressed vowelsum <- read.delim("um.txt")
head(um)
## nwords UH UM sex year age ethnicity schooling transcribed total nvowels
## 1 9292 90 60 m 2000 21 i/r 14 2811 2814 3078
## 2 8772 17 19 f 2000 52 o/i 12 2813 2826 3702
## 3 16462 22 69 m 2000 18 i 12 3706 3718 5885
## 4 11835 18 52 m 2000 18 i 12 2778 3957 3884
## 5 10212 43 7 m 2000 70 i 12 2740 2759 4119
## 6 3432 44 5 f 2000 65 a 12 1201 2689 1376
Fitting two quick linear models for each word. One has a three way interaction between age
, sex
, and year
, and one has only all two way interactions.
mod_um_max <- glm(UM ~ age*year*sex, offset = log(nwords), data = um, family = "poisson")
mod_uh_max <- glm(UH ~ age*year*sex, offset = log(nwords), data = um, family = "poisson")
mod_um_2way <- glm(UM ~ age*year + sex*age + sex*year,
offset = log(nwords), data = um, family = "poisson")
mod_uh_2way<- glm(UH ~ age*year + sex*age + sex*year,
offset = log(nwords), data = um, family = "poisson")
Liklihood Ratio tests and AIC comparsions seem like “UH” needs the three way interaction, while maybe “UM” doesn’t. I’ll do all the plotting with the 3 way interactions for both.
anova(mod_um_max, mod_um_2way, test = "Chisq")
## Analysis of Deviance Table
##
## Model 1: UM ~ age * year * sex
## Model 2: UM ~ age * year + sex * age + sex * year
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 387 5517
## 2 388 5518 -1 -0.511 0.47
anova(mod_uh_max, mod_uh_2way, test = "Chisq")
## Analysis of Deviance Table
##
## Model 1: UH ~ age * year * sex
## Model 2: UH ~ age * year + sex * age + sex * year
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 387 9155
## 2 388 9189 -1 -34.4 4.4e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
AIC(mod_uh_max, mod_uh_2way) %>%
do(data.frame(model = rownames(.),.))%>%
mutate(AIC_diff = c(NA, diff(AIC)))
## model df AIC AIC_diff
## 1 mod_uh_max 8 11174 NA
## 2 mod_uh_2way 7 11206 32.43
AIC(mod_um_max, mod_um_2way) %>%
do(data.frame(model = rownames(.),.))%>%
mutate(AIC_diff = c(NA, diff(AIC)))
## model df AIC AIC_diff
## 1 mod_um_max 8 6875 NA
## 2 mod_um_2way 7 6874 -1.489
Dummy data for prediction.
um_grid <- expand.grid(age = c(20,30,40,50,60),
year = c(1973,1983,1993,2003,2013),
nwords = 1000,
sex = c("f","m"),
word = "UM") %>% mutate(dob = year-age)
uh_grid <- expand.grid(age = c(20,30,40,50,60),
year = c(1973,1983,1993,2003,2013),
nwords = 1000,
sex = c("f","m"),
word = "UH") %>% mutate(dob = year-age)
um_grid$response <- predict(mod_um_max, newdata = um_grid, type = "resp")
uh_grid$response <- predict(mod_uh_max, newdata = uh_grid, type = "resp")
First, predicted frequency of UM per 1000 words, with the lines joined up according to year of study.
ggplot(um_grid, aes(age, response))+
geom_line(aes(group = year, color = year))+
ylab("expected UM per 1000 words")+
facet_grid(.~sex, labeller = label_both)
You can see the apparent-time change, which shifts upwards every year of the study.
Connecting up the predicted values by Date of Birth cohorts gives a different view.
ggplot(um_grid, aes(age, response))+
geom_line(aes(group = dob, color = dob))+
ylab("expected UM per 1000 words")+
facet_grid(.~sex, labeller = label_both)
It looks like if anything, across a DOB cohort’s lifespan, they shift more towards more UM, which is also the direction of the real & apparent time change.
And here are some UM plots, emphasizing the sex differences.
ggplot(um_grid, aes(age, response))+
geom_line(aes(group = sex, color = sex))+
ylab("expected UM per 1000 words")+
facet_grid(~year, labeller = label_both)
um_grid %>% filter(!dob %in% c(1913, 1993))%>%
ggplot(., aes(age, response))+
geom_line(aes(group = sex, color = sex))+
ylab("expected UM per 1000 words")+
facet_grid(~dob, labeller = label_both)
Similarly, here’s the plots for UH. Again, there’s the apparent time trend towards less UH, which advances in that direction over each year of the study.
ggplot(uh_grid, aes(age, response))+
geom_line(aes(group = year, color = year))+
ylab("expected UH per 1000 words")+
facet_grid(.~sex, labeller = label_both)
Connecting speakers up by DOB is… more complicated. Is the curved shape “real”? Probably not, I’d probably want to say these are effectively flat.
ggplot(uh_grid, aes(age, response))+
geom_line(aes(group = dob, color = dob))+
ylab("expected UH per 1000 words")+
facet_grid(.~sex, labeller = label_both)
ggplot(uh_grid, aes(age, response))+
geom_line(aes(group = sex, color = sex))+
ylab("expected UH per 1000 words")+
facet_grid(~year, labeller = label_both)
uh_grid %>% filter(!dob %in% c(1913, 1993))%>%
ggplot(., aes(age, response))+
geom_line(aes(group = sex, color = sex))+
ylab("expected UH per 1000 words")+
facet_grid(~dob, labeller = label_both)