#
# Simulation size: number of distinct word types and number of sampled tokens
NTYPES <- 500
NTOKENS <- 1e6
# One-column holders for the per-type counts, filled in after sampling:
# WC = all tokens, WLC = tokens with modifier L, WDC = tokens with modifier D
WC <- matrix(NA, nrow = NTYPES, ncol = 1)
WLC <- matrix(NA, nrow = NTYPES, ncol = 1)
WDC <- matrix(NA, nrow = NTYPES, ncol = 1)
# Word probabilities follow a 1/F law: weight 1/rank, normalised to sum to 1
FF <- seq_len(NTYPES)
inv_rank <- 1 / FF
FF1 <- inv_rank / sum(inv_rank)
# Conditional probabilities of modification use the same 1/F weights, but
# shuffled so that they are independent of word frequency.
# sample() with no size argument returns a random permutation of its input.
LL <- seq_len(NTYPES)
LL1 <- sample(FF1)
# A second, separately shuffled set, as if for a different modifier
DD <- seq_len(NTYPES)
DD1 <- sample(FF1)
#
# Draw NTOKENS word-type indices (with replacement) according to FF1
X <- sample.int(NTYPES, size = NTOKENS, replace = TRUE, prob = FF1)
# A token is modified when its uniform draw falls below the modification
# probability of its word type
XL <- LL1[X] > runif(NTOKENS)
XD <- DD1[X] > runif(NTOKENS)
# Tally, for every word type, how many tokens it has in the sample (WC) and
# how many of those tokens are modified by L (WLC) or by D (WDC).
# tabulate() counts every type in a single pass over the sample instead of
# rescanning all tokens once per type; nbins = NTYPES keeps a (zero) slot for
# any type that happens not to occur, so position n always refers to type n.
type_ids <- seq_len(NTYPES)
WC[type_ids] <- tabulate(X, nbins = NTYPES)
# Subsetting X by the logical modification flags keeps only modified tokens
WLC[type_ids] <- tabulate(X[XL], nbins = NTYPES)
WDC[type_ids] <- tabulate(X[XD], nbins = NTYPES)
# Rank word types by how many of their tokens are modified by L (descending).
# SWLC$x holds the sorted L counts and SWLC$ix the permutation that produces
# them; the permutation is reused below so WC and WDC follow the same order.
SWLC <- sort.int(WLC, decreasing=TRUE, index.return=TRUE, method="quick")
SLC <- WC[SWLC$ix] # empirical "word" frequencies, in that same order
# Positions of the first 100 types in that ordering (fewer if there are fewer
# than 100 types).
# NOTE(review): the subtitles call these the "100 commonest", but they are
# selected by L-modification count, not by raw frequency -- confirm wording.
top <- seq_len(min(100, length(SLC)))
# The "%d" in the file name makes png() write one numbered file per plot.
png(filename="LiterallyRandom%d.png", width=600, height=600)
# Plot 1: proportion of tokens modified by L against log frequency
plot(log(SLC), SWLC$x/SLC, xlab="log frequency", ylab="P(modification)",
     main="Random modification test 1: 500 types, 1M tokens",
     sub="Blue o's are all word types, Red x's are 100 commonest",
     type="p", pch="o", col="blue")
# Overlay the top-ranked types in red. points() draws onto the existing plot
# and takes no axis labels (passing xlab/ylab only triggers warnings).
points(log(SLC[top]), (SWLC$x)[top]/SLC[top],
       type="p", pch="x", col="red")
# Plot 2: same types in the same order, but counting the alternative
# modifier D, which played no part in the ranking
plot(log(SLC), WDC[SWLC$ix]/SLC, xlab="log frequency", ylab="P(modification)",
     main="Random modification test 2: 500 types, 1M tokens",
     sub="Blue o's are all word types, Red x's are 100 commonest",
     type="p", pch="o", col="blue")
points(log(SLC[top]), (WDC[SWLC$ix]/SLC)[top],
       type="p", pch="x", col="red")
# Close the PNG device so both image files are completely written even when
# the script is source()d from a session that keeps running.
invisible(dev.off())