# NTYPES=500 NTOKENS=1000000 WC <- matrix(nrow=NTYPES,ncol=1) WLC <- matrix(nrow=NTYPES,ncol=1) WDC <- matrix(nrow=NTYPES,ncol=1) # 1/F distribution of word probabilities FF <- 1:NTYPES FF1 <- (1/FF)/sum(1/FF) # [Independent] 1/F conditional probabilities of modification # Note: sample() here accomplishes a random permutation LL <- 1:NTYPES LL1 <- sample((1/LL)/sum(1/LL), replace=FALSE) # And a different one, as if for a different modifier DD <- 1:NTYPES DD1 <- sample((1/DD)/sum(1/DD), replace=FALSE) # # Get a random sample of size NTOKENS X <- sample(1:NTYPES, NTOKENS, replace=TRUE, prob=FF1) # Which of them are modified? XL <- runif(NTOKENS) < LL1[X] XD <- runif(NTOKENS) < DD1[X] # Sort according to how often they are modified by L, # along with their counts (with and without modification) for(n in 1:NTYPES){ Wwhich <- (X == n) WC[n] <- sum(Wwhich) WLC[n] <- sum(XL[Wwhich]) WDC[n] <- sum(XD[Wwhich]) } # counts of modification SWLC <- sort.int(WLC, decreasing=TRUE, index.return=TRUE, method="quick") SLC <- WC[SWLC$ix] # empirical "word" frequencies png(filename="LiterallyRandom%d.png", width=600, height=600) plot(log(SLC),SWLC$x/SLC, xlab="log frequency", ylab="P(modification)", main="Random modification test 1: 500 types, 1M tokens", sub="Blue o's are all word types, Red x's are 100 commonest", type="p", pch="o", col="blue") points(log(SLC[1:100]), (SWLC$x)[1:100]/SLC[1:100], xlab="log frequency", ylab="P(modification)", type="p", pch="x", col="red") # Now plot as if for alternative modifier plot(log(SLC),WDC[SWLC$ix]/SLC, xlab="log frequency", ylab="P(modification)", main="Random modification test 2: 500 types, 1M tokens", sub="Blue o's are all word types, Red x's are 100 commonest", type="p", pch="o", col="blue") points(log(SLC[1:100]), (WDC[SWLC$ix]/SLC)[1:100], xlab="log frequency", ylab="P(modification)", type="p", pch="x", col="red")