% Test of "Running Duration Measure" % from Kochanski, Grabe, Coleman, and Rosner, "Loudness..." JASA 2005 % http://www.ling.upenn.edu/courses/Fall_2008/cogs501/Kochanski2005.pdf % % Test data from TIMIT TEST/DR1/FAKS0/SA1 % She had your dark suit in greasy wash water all year % first four vowel centers are at 75 90 114 130 10-msec frames % Wsec = .030; % 30 msec window Winc = .010; % 10 msec window increment %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% [x, FS, BITS] = wavread('SA1.wav'); % % Make hamming window: Wsamp = round(Wsec*FS); Hw = hamming(Wsamp); % % Input length in samples and in seconds: Nsamp = length(x); Nsec = Nsamp/FS; % % How many Winc increments of a Wsec window will fit in Nsec? % We want the biggest M such that round(M*Winc*FS)+Wsamp < Nsamp % thus, round(M*Winc*FS) < Nsamp-Wsamp Nframes = floor((Nsamp-Wsamp)/(Winc*FS)); % % FFT size -- Nfft = 2^nextpow2(Wsamp); % Frequency increment per fft sample: Finc = FS/Nfft; % %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% % make the power spectra... xx = zeros(Nfft,1); % Matrix for the power spectra, one column per analysis frame: PowSpec = zeros(Nfft/2, Nframes); for (n = 1:Nframes) samp1 = round((n-1)*Winc*FS)+1; xx(1:Wsamp) = Hw.*x(samp1:(samp1+Wsamp-1)); XX = abs(fft(xx)); XXX = XX(1:(Nfft/2)).^2; XXX = XXX/sum(XXX); % normalized power PowSpec(:,n) = XXX; end %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% % Now make perceptual spectra, % sum from 300 to 5500 Hz in 1 Bark wide bins on 0.5 Bark center % z = (26.81/(1 + 1960/f)) - 0.53 (works ~ 200 to 6700 Hz) % % So 300 Hz = 3.0288 bark % and 5500 Hz = 19.236 bark % Thus edges in Bark are BE = 3:0.5:19.5; % and with f = 1960/(26.81/(z + 0.53) - 1) % edges in Hz are HBE = 1960./(26.81./(BE+0.53) -1); % and since the value in Hz for index m is f = (m-1)*Finc % we have m = (f+Finc)/Finc % and so edges in vector indices are IBE = round((HBE+Finc)/Finc); Nbands = length(IBE)-1; % number of bands in the perceptual spectrum % % Matrix for the perceptual spectra, one column per frame: PerSpec = zeros(Nbands,Nframes); % sum and take 1/3 power for(f = 1:Nframes) for(b = 1:(Nbands-1)) PerSpec(b,f) = sum(PowSpec(IBE(b):(IBE(b+1)-1),f))^(1/3); end end % %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% % plot log power spectrum & perceptual spectrum -- they look OK: colormap(gray); figure(1) imagesc(log(PowSpec((Nfft/2):-1:1,:))); figure(2) imagesc(PerSpec(Nbands:-1:1,:)); % %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% % Now accumulate (forward) % E(i) = sum over j=(i+1):Nframes of of summed squared diffs in PerSpec % and % D(i) = sum over j=(i+1):Nframes of exp(-E/C) where C=600 C = 600; Ef = zeros(Nframes,1); Df = zeros(Nframes,1); for(f = 1:(Nframes-1)) Esum = 0; Dsum = 0; for(g = (f+1):Nframes) diff = PerSpec(:,f)-PerSpec(:,g); Esum = Esum + diff'*diff; Dsum = Dsum + exp(-Esum/C); end Ef(f) = Esum; Df(f) = Dsum; end % %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% % And now accumulate backward: Eb = zeros(Nframes,1); Db = zeros(Nframes,1); for(f= Nframes:-1:2) Esum = 0; Dsum = 0; for(g = (f-1):-1:1) diff = PerSpec(:,f)-PerSpec(:,g); Esum = Esum + diff'*diff; Dsum = Dsum + exp(-Esum/C); end Eb(f) = Esum; Db(f) = Dsum; end % Add the forward and backward results: D = Df+Db; %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% % plot it: figure(3) plot(D) % first four vowel centers are at 75 90 114 130 frames % TIMIT durations (in msec) are 96 130 73 97 hold on line([75 75], ylim, 'color', 'red') line([90 90], ylim, 'color', 'red') line([114 114], ylim, 'color', 'red') line([130 130], ylim, 'color', 'red') hold off % But D values don't seem to correspond in any way to durations... % ?????