#
NTYPES=500
NTOKENS=1000000
WC <- matrix(nrow=NTYPES,ncol=1) 
WLC <- matrix(nrow=NTYPES,ncol=1)
WDC <- matrix(nrow=NTYPES,ncol=1)
# 1/F distribution of word probabilities
FF <- 1:NTYPES
FF1 <- (1/FF)/sum(1/FF)
# [Independent] 1/F conditional probabilities of modification
#    Note: sample() here accomplishes a random permutation
LL <- 1:NTYPES
LL1 <- sample((1/LL)/sum(1/LL), replace=FALSE)
# And a different one, as if for a different modifier
DD <- 1:NTYPES
DD1 <- sample((1/DD)/sum(1/DD), replace=FALSE)
#
# Get a random sample of size NTOKENS
X <- sample(1:NTYPES, NTOKENS, replace=TRUE, prob=FF1)
# Which of them are modified?
XL <- runif(NTOKENS) < LL1[X]
XD <- runif(NTOKENS) < DD1[X]
# Sort according to how often they are modified by L,
#      along with their counts (with and without modification)
for(n in 1:NTYPES){
 Wwhich <- (X == n)
 WC[n] <- sum(Wwhich)
 WLC[n] <- sum(XL[Wwhich])
 WDC[n] <- sum(XD[Wwhich])
}

# counts of modification

SWLC <- sort.int(WLC, decreasing=TRUE, index.return=TRUE, method="quick")

SLC <- WC[SWLC$ix]  # empirical "word" frequencies

png(filename="LiterallyRandom%d.png", width=600, height=600)

plot(log(SLC),SWLC$x/SLC, xlab="log frequency", ylab="P(modification)",
     main="Random modification test 1: 500 types, 1M tokens",
     sub="Blue o's are all word types, Red x's are 100 commonest", 
     type="p", pch="o", col="blue")

points(log(SLC[1:100]), (SWLC$x)[1:100]/SLC[1:100], 
        xlab="log frequency", ylab="P(modification)",
        type="p", pch="x", col="red")

# Now plot as if for alternative modifier
plot(log(SLC),WDC[SWLC$ix]/SLC, xlab="log frequency", ylab="P(modification)",
     main="Random modification test 2: 500 types, 1M tokens",
     sub="Blue o's are all word types, Red x's are 100 commonest", 
     type="p", pch="o", col="blue")

points(log(SLC[1:100]), (WDC[SWLC$ix]/SLC)[1:100], 
        xlab="log frequency", ylab="P(modification)",
        type="p", pch="x", col="red")