% Test of "Running Duration Measure"
% from Kochanski, Grabe, Coleman, and Rosner, "Loudness..." JASA 2005
% http://www.ling.upenn.edu/courses/Fall_2008/cogs501/Kochanski2005.pdf
%
% Test data from TIMIT TEST/DR1/FAKS0/SA1
% She had your dark suit in greasy wash water all year
% first four vowel centers are at 75 90 114 130 10-msec frames
%
Wsec = .030; % 30 msec window
Winc = .010; % 10 msec window increment
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
[x, FS, BITS] = wavread('SA1.wav');
%
% Make hamming window:
Wsamp = round(Wsec*FS); Hw = hamming(Wsamp);
%
% Input length in samples and in seconds:
Nsamp = length(x); Nsec = Nsamp/FS;
%
% How many Winc increments of a Wsec window will fit in Nsec?
% We want the biggest M such that round(M*Winc*FS)+Wsamp < Nsamp
% thus, round(M*Winc*FS) < Nsamp-Wsamp
Nframes = floor((Nsamp-Wsamp)/(Winc*FS));
%
% FFT size --
Nfft = 2^nextpow2(Wsamp);
% Frequency increment per fft sample:
Finc = FS/Nfft;
%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% make the power spectra...
xx = zeros(Nfft,1);
% Matrix for the power spectra, one column per analysis frame:
PowSpec = zeros(Nfft/2, Nframes);
for (n = 1:Nframes)
samp1 = round((n-1)*Winc*FS)+1;
xx(1:Wsamp) = Hw.*x(samp1:(samp1+Wsamp-1));
XX = abs(fft(xx));
XXX = XX(1:(Nfft/2)).^2; XXX = XXX/sum(XXX); % normalized power
PowSpec(:,n) = XXX;
end
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Now make perceptual spectra,
% sum from 300 to 5500 Hz in 1 Bark wide bins on 0.5 Bark center
% z = (26.81/(1 + 1960/f)) - 0.53 (works ~ 200 to 6700 Hz)
%
% So 300 Hz = 3.0288 bark
% and 5500 Hz = 19.236 bark
% Thus edges in Bark are
BE = 3:0.5:19.5;
% and with f = 1960/(26.81/(z + 0.53) - 1)
% edges in Hz are
HBE = 1960./(26.81./(BE+0.53) -1);
% and since the value in Hz for index m is f = (m-1)*Finc
% we have m = (f+Finc)/Finc
% and so edges in vector indices are
IBE = round((HBE+Finc)/Finc);
Nbands = length(IBE)-1; % number of bands in the perceptual spectrum
%
% Matrix for the perceptual spectra, one column per frame:
PerSpec = zeros(Nbands,Nframes);
% sum and take 1/3 power
for(f = 1:Nframes)
for(b = 1:(Nbands-1))
PerSpec(b,f) = sum(PowSpec(IBE(b):(IBE(b+1)-1),f))^(1/3);
end
end
%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% plot log power spectrum & perceptual spectrum -- they look OK:
colormap(gray); figure(1)
imagesc(log(PowSpec((Nfft/2):-1:1,:)));
figure(2)
imagesc(PerSpec(Nbands:-1:1,:));
%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Now accumulate (forward)
% E(i) = sum over j=(i+1):Nframes of of summed squared diffs in PerSpec
% and
% D(i) = sum over j=(i+1):Nframes of exp(-E/C) where C=600
C = 600;
Ef = zeros(Nframes,1); Df = zeros(Nframes,1);
for(f = 1:(Nframes-1))
Esum = 0; Dsum = 0;
for(g = (f+1):Nframes)
diff = PerSpec(:,f)-PerSpec(:,g);
Esum = Esum + diff'*diff;
Dsum = Dsum + exp(-Esum/C);
end
Ef(f) = Esum; Df(f) = Dsum;
end
%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% And now accumulate backward:
Eb = zeros(Nframes,1); Db = zeros(Nframes,1);
for(f= Nframes:-1:2)
Esum = 0; Dsum = 0;
for(g = (f-1):-1:1)
diff = PerSpec(:,f)-PerSpec(:,g);
Esum = Esum + diff'*diff;
Dsum = Dsum + exp(-Esum/C);
end
Eb(f) = Esum; Db(f) = Dsum;
end
% Add the forward and backward results:
D = Df+Db;
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% plot it:
figure(3)
plot(D)
% first four vowel centers are at 75 90 114 130 frames
% TIMIT durations (in msec) are 96 130 73 97
hold on
line([75 75], ylim, 'color', 'red')
line([90 90], ylim, 'color', 'red')
line([114 114], ylim, 'color', 'red')
line([130 130], ylim, 'color', 'red')
hold off
% But D values don't seem to correspond in any way to durations...
% ?????