# Vowel classification using knn on MFCCs
# This one compares
# (1) MFCCs at 50%
# (2) MFCCs at 50% with duration and mean F0
# (3) MFCCs at 20% and 80%
# (4) MFCCs at 20% and 80% with duration and mean F0
# (5) MFCCs at 20% 50% 80%
# (6) MFCC at 20%, 50%, 80% with duration and mean F0
# Data from Hillenbrand 1995
# http://homepages.wmich.edu/~hillenbr/Papers/HillenbrandGettyClarkWheeler.pdf
# http://homepages.wmich.edu/~hillenbr/voweldata.html
# MFCCs calculated by downsampling to 8 kHz, then computing 12 mel-frequency cepstral coefficients
# (see MFCCjob1, Xconfig.mfcc)
# Use 10-fold cross validation with random assignments of speakers to 10 groups
# FNN provides knn() etc.
# library() stops right here if FNN is not installed; require() would only
# return FALSE with a warning and let the script fail later on.
library(FNN)
KN <- 5 # K for K-nearest-neighbor
# Presumably defines kclassify(), which the cross-validation loop calls
# (the file is not visible from this script -- confirm).
source("kclassify.R")
# Key file Hkey.dat
# column 1 - file ID - i.e. b01ae b01ah etc.
# col2: 1=man 2=woman 3=boy 4=girl
# col3: 1=ae 2=ah 3=aw 4=eh 5=er 6=ey
# 7=ih 8=iy 9=oa 10=oo 11=uh 12=uw
# col4: unique speaker ID, 1 to 139
# Load the key table, label its four columns, and index rows by file ID.
Keys <- read.table("Hkey.dat")
names(Keys) <- c("File", "SpeakerType", "Vowel", "Speaker")
row.names(Keys) <- Keys[["File"]]
#
# MFCCs + duration + pitch data file H4KMFCC.dat
# col1: file prefix
# col2 to col13: mfccs at 20%
# col14 to col25: mfccs at 50%
# col26 to col37: mfccs at 80%
# col38: duration in msec
# col39: f0 at "steady state"
# Load the feature table and index its rows by file prefix (column 1).
Data <- read.table("H4KMFCC.dat")
Ndatacols <- ncol(Data)
row.names(Data) <- Data[[1]]
# Column labels: "File", then m1..m12 with suffix _2, _5 and _8 (the 20%, 50%
# and 80% measurement points), then "Dur" and "F0" -- 39 names in total.
names(Data) <- c(
  "File",
  paste0("m", rep(1:12, times = 3), "_", rep(c(2, 5, 8), each = 12)),
  "Dur",
  "F0"
)
# z-score normalization of data: every feature column (2 through 39) is
# shifted to zero mean and scaled to unit standard deviation, using statistics
# taken over all rows of Data.
FeatureCols <- 2:39
Data[FeatureCols] <- lapply(
  Data[FeatureCols],
  function(v) (v - mean(v)) / sd(v)
)
# Sanity check on data and keys: row i of Data and row i of Keys must describe
# the same file, because the cross-validation code pairs features and labels
# purely by row position. Stop rather than silently classify misaligned rows.
if (nrow(Data) != nrow(Keys)) {
  stop(
    "Data has ", nrow(Data), " rows but Keys has ", nrow(Keys), " rows",
    call. = FALSE
  )
}
# Mismatches is the number of rows whose file IDs disagree (0 when aligned).
# IDs are compared as character so the test also works when read.table()
# returned factors with different level sets.
Mismatches <- sum(as.character(Data[, 1]) != as.character(Keys[, 1]))
if (!isTRUE(Mismatches == 0)) {
  stop(
    Mismatches, " file IDs differ between Data and Keys",
    call. = FALSE
  )
}
#
# Division into 10 groups for cross validation
# Speaker IDs are assumed to run 1..Nspeakers (see the Hkey.dat notes above).
Nspeakers <- max(Keys[, 4])
Nrows <- nrow(Keys)
# Fold boundaries: fold n takes positions m of the shuffled speaker list with
# GroupLeads[n] <= m < GroupLeads[n+1], so the ten folds together cover
# positions 1..Nspeakers without overlap.
GroupLeads <- c(1, round(((1:10) / 10) * (Nspeakers + 1)))
# Speaker IDs are shuffled to mix up speaker types across folds.
# No seed is set here, so the split differs from run to run.
PermutedSpeakers <- sample(1:Nspeakers)
#
# Groups[i, n] is TRUE when row i of Keys belongs to a speaker in fold n.
Groups <- matrix(nrow = Nrows, ncol = 10, data = FALSE)
for (n in 1:10) {
  spanstart <- GroupLeads[n]
  spanend <- GroupLeads[n + 1] - 1
  # seq_len() yields an empty span when a fold has no speakers, whereas
  # spanstart:spanend would count downwards and pick the wrong positions.
  span <- spanstart + seq_len(max(spanend - spanstart + 1, 0)) - 1
  Groups[, n] <- Keys[, "Speaker"] %in% PermutedSpeakers[span]
}
# Sanity checks on group assignments
# InCommon[n, m] is the number of rows that folds n and m share: the diagonal
# holds each fold's size and off-diagonal entries are 0 when folds are disjoint.
InCommon <- crossprod(Groups * 1)
# GSTypes[n, m] is the number of rows in fold n whose SpeakerType code is m
# (m = 1..4).
GSTypes <- t(vapply(
  seq_len(10),
  function(fold) {
    types <- Keys[Groups[, fold], "SpeakerType"]
    vapply(seq_len(4), function(code) sum(types == code), numeric(1))
  },
  numeric(4)
))
#
# Iterate over 10 train/test splits,
# for each of six choices of input data
# Note that we eliminate first column, which is "file_id"
# (1) MFCCs at 50%
# cols 14:25
# (2) MFCCs at 50% with duration and mean F0
# cols 14:25, 38, 39
# (3) MFCCs at 20% and 80%
# cols 2:13, 26:37
# (4) MFCCs at 20% and 80% with duration and mean F0
# cols 2:13, 26:37, 38, 39
# (5) MFCCs at 20% 50% 80%
# cols 2:37
# (6) MFCC at 20%, 50%, 80% with duration and mean F0
# cols 2:39
# Ncorrect[trial, group]: sum of the confusion-matrix diagonal for that fold.
# Nitems[trial, group]: number of test items in that fold.
Ncorrect <- matrix(0, nrow = 6, ncol = 10)
Nitems <- matrix(0, nrow = 6, ncol = 10)
allgroups <- rep(TRUE, 10)
# Column selections of Data for the six feature sets, indexed by trial number.
TrialCols <- list(
  14:25,
  c(14:25, 38, 39),
  c(2:13, 26:37),
  c(2:13, 26:37, 38, 39),
  2:37,
  2:39
)
for (trial in seq_along(TrialCols)) {
  ThisData <- Data[, TrialCols[[trial]]]
  #
  for (group in seq_len(10)) {
    # Train on every fold except the held-out one.
    traingroups <- allgroups
    traingroups[group] <- FALSE
    trainwhich <- rowSums(Groups[, traingroups]) > 0
    testwhich <- Groups[, group]
    traindata <- ThisData[trainwhich, ]
    testdata <- ThisData[testwhich, ]
    trainkey <- Keys[trainwhich, "Vowel"]
    testkey <- Keys[testwhich, "Vowel"]
    # ncats is not passed to kclassify(); it is kept in case kclassify.R
    # reads it from the global environment -- TODO confirm.
    ncats <- max(trainkey)
    Confusion <- kclassify(traindata, trainkey, testdata, testkey, KN)
    Ncorrect[trial, group] <- sum(diag(Confusion))
    Nitems[trial, group] <- length(testkey)
  }
}