Setup

library(plyr)
library(dplyr)

## 
## Attaching package: 'dplyr'
## 
## The following objects are masked from 'package:plyr':
## 
##     arrange, desc, failwith, id, mutate, summarise, summarize
## 
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## 
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(ggplot2)
library(mgcv)

## Loading required package: nlme
## 
## Attaching package: 'nlme'
## 
## The following object is masked from 'package:dplyr':
## 
##     collapse
## 
## This is mgcv 1.7-29. For overview type 'help("mgcv-package")'.

The data:

One row per speaker
nwords = total number of words in speaker’s textgrid
UH and UM = number of “uhs” and “ums”
sex= sex of speaker
year and age = year of interview and age at interview
ethnicity = … too many single character codes to deal with
schooling = (estimated) years of schooling
transcribed = how many seconds of transcribed interview there is
total = total duration of interview in seconds
nvowels = number of primary stressed vowels

um <- read.delim("um.txt")
head(um)

##   nwords UH UM sex year age ethnicity schooling transcribed total nvowels
## 1   9292 90 60   m 2000  21       i/r        14        2811  2814    3078
## 2   8772 17 19   f 2000  52       o/i        12        2813  2826    3702
## 3  16462 22 69   m 2000  18         i        12        3706  3718    5885
## 4  11835 18 52   m 2000  18         i        12        2778  3957    3884
## 5  10212 43  7   m 2000  70         i        12        2740  2759    4119
## 6   3432 44  5   f 2000  65         a        12        1201  2689    1376

Modelling

Fitting two quick linear models for each word. One has a three way interaction between age, sex, and year, and one has only all two way interactions.

mod_um_max <- glm(UM ~ age*year*sex, offset = log(nwords), data = um, family = "poisson")
mod_uh_max <- glm(UH ~ age*year*sex, offset = log(nwords), data = um, family = "poisson")

mod_um_2way <- glm(UM ~ age*year + sex*age + sex*year, 
                  offset = log(nwords), data = um, family = "poisson")
mod_uh_2way<- glm(UH ~ age*year + sex*age + sex*year, 
                  offset = log(nwords), data = um, family = "poisson")

Liklihood Ratio tests and AIC comparsions seem like “UH” needs the three way interaction, while maybe “UM” doesn’t. I’ll do all the plotting with the 3 way interactions for both.

anova(mod_um_max, mod_um_2way, test = "Chisq")

## Analysis of Deviance Table
## 
## Model 1: UM ~ age * year * sex
## Model 2: UM ~ age * year + sex * age + sex * year
##   Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1       387       5517                     
## 2       388       5518 -1   -0.511     0.47

anova(mod_uh_max, mod_uh_2way, test = "Chisq")

## Analysis of Deviance Table
## 
## Model 1: UH ~ age * year * sex
## Model 2: UH ~ age * year + sex * age + sex * year
##   Resid. Df Resid. Dev Df Deviance Pr(>Chi)    
## 1       387       9155                         
## 2       388       9189 -1    -34.4  4.4e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

AIC(mod_uh_max, mod_uh_2way) %>%
    do(data.frame(model = rownames(.),.))%>%
    mutate(AIC_diff = c(NA, diff(AIC)))

##         model df   AIC AIC_diff
## 1  mod_uh_max  8 11174       NA
## 2 mod_uh_2way  7 11206    32.43

AIC(mod_um_max, mod_um_2way) %>%
  do(data.frame(model = rownames(.),.))%>%
  mutate(AIC_diff = c(NA, diff(AIC)))

##         model df  AIC AIC_diff
## 1  mod_um_max  8 6875       NA
## 2 mod_um_2way  7 6874   -1.489

Dummy data for prediction.

um_grid <- expand.grid(age = c(20,30,40,50,60),
                       year = c(1973,1983,1993,2003,2013),
                       nwords = 1000,
                       sex = c("f","m"),
                       word = "UM") %>% mutate(dob = year-age)

uh_grid <- expand.grid(age = c(20,30,40,50,60),
                       year = c(1973,1983,1993,2003,2013),
                       nwords = 1000,
                       sex = c("f","m"),
                       word = "UH") %>% mutate(dob = year-age)

um_grid$response <- predict(mod_um_max, newdata = um_grid, type = "resp")
uh_grid$response <- predict(mod_uh_max, newdata = uh_grid, type = "resp")

Plotting

UM

First, predicted frequency of UM per 1000 words, with the lines joined up according to year of study.

ggplot(um_grid, aes(age, response))+
  geom_line(aes(group = year, color = year))+
  ylab("expected UM per 1000 words")+
  facet_grid(.~sex, labeller = label_both)

plot of chunk unnamed-chunk-6

You can see the apparent-time change, which shifts upwards every year of the study.

Connecting up the predicted values by Date of Birth cohorts gives a different view.

ggplot(um_grid, aes(age, response))+
  geom_line(aes(group = dob, color = dob))+
  ylab("expected UM per 1000 words")+
  facet_grid(.~sex, labeller = label_both)

plot of chunk unnamed-chunk-7

It looks like if anything, across a DOB cohort’s lifespan, they shift more towards more UM, which is also the direction of the real & apparent time change.

And here are some UM plots, emphasizing the sex differences.

ggplot(um_grid, aes(age, response))+
  geom_line(aes(group = sex, color = sex))+
  ylab("expected UM per 1000 words")+
  facet_grid(~year, labeller = label_both)

plot of chunk unnamed-chunk-8

um_grid %>% filter(!dob %in% c(1913, 1993))%>%
  ggplot(., aes(age, response))+
  geom_line(aes(group = sex, color = sex))+
  ylab("expected UM per 1000 words")+
  facet_grid(~dob, labeller = label_both)

plot of chunk unnamed-chunk-9

UH

Similarly, here’s the plots for UH. Again, there’s the apparent time trend towards less UH, which advances in that direction over each year of the study.

ggplot(uh_grid, aes(age, response))+
  geom_line(aes(group = year, color = year))+
  ylab("expected UH per 1000 words")+
  facet_grid(.~sex, labeller = label_both)

plot of chunk unnamed-chunk-10

Connecting speakers up by DOB is… more complicated. Is the curved shape “real”? Probably not, I’d probably want to say these are effectively flat.

ggplot(uh_grid, aes(age, response))+
  geom_line(aes(group = dob, color = dob))+
  ylab("expected UH per 1000 words")+
  facet_grid(.~sex, labeller = label_both)

plot of chunk unnamed-chunk-11

ggplot(uh_grid, aes(age, response))+
  geom_line(aes(group = sex, color = sex))+
  ylab("expected UH per 1000 words")+
  facet_grid(~year, labeller = label_both)

plot of chunk unnamed-chunk-12

uh_grid %>% filter(!dob %in% c(1913, 1993))%>%
  ggplot(., aes(age, response))+
    geom_line(aes(group = sex, color = sex))+
    ylab("expected UH per 1000 words")+
    facet_grid(~dob, labeller = label_both)

plot of chunk unnamed-chunk-13

Filled Pauses in the PNC

Josef Fruehwald

August 13, 2014

Setup

Modelling

Plotting

UM

UH