-
Notifications
You must be signed in to change notification settings - Fork 72
MFCC and Mel Spectrogram
Scheme:
- Spectrum: power spectrum (default) / magnitude spectrum
- Mel Filterbank: HTK-like (default in PyTorch)
- Nonlinearity: 10*Log10()
- DCT: 2N (default)
PyTorch:
n_fft = 1024
win_length = None
hop_length = 512
n_mels = 24
n_mfcc = 13
mfcc_transform = T.MFCC(
sample_rate=sample_rate,
n_mfcc=n_mfcc,
melkwargs={
'n_fft': n_fft,
'n_mels': n_mels,
'hop_length': hop_length,
'center': False
}
)
mfcc = mfcc_transform(signal)
librosa:
# unfortunately, you'll need to call two methods separately
# (because of two different meanings of the 'norm' parameter)
melspec = librosa.feature.melspectrogram(
y=waveform.numpy()[0], sr=sample_rate, n_fft=n_fft,
win_length=win_length, hop_length=hop_length,
n_mels=n_mels, htk=True, center=False, norm=None)
mfcc_librosa = librosa.feature.mfcc(
S=librosa.core.spectrum.power_to_db(melspec),
n_mfcc=n_mfcc, dct_type=2, norm='ortho')
NWaves:
var fftSize = 1024;
var melCount = 24;
var mfccOptions = new MfccOptions
{
SamplingRate = samplingRate,
FeatureCount = 13,
FrameSize = fftSize,
HopSize = 512,
FilterBankSize = melCount,
SpectrumType = SpectrumType.Power,
NonLinearity = NonLinearityType.ToDecibel,
DctType = "2N",
FftSize = fftSize,
Window = WindowType.Hann,
LogFloor = 1e-10f
};
var extractor = new MfccExtractor(mfccOptions);
var mfccVectors = extractor.ComputeFrom(signal);
// or
// var mfccVectors = extractor.ParallelComputeFrom(signal);
If your calculations are based on method librosa.feature.mfcc
like this:
librosa.feature.mfcc(
signal,
sample_rate,
n_mfcc=13,
dct_type=2,
norm='ortho',
window='hann',
htk=True,
n_mels=24,
n_fft=1024,
hop_length=512,
center=False)
and you want a full compliance with it, you can specifiy the following settings in NWaves:
var fftSize = 1024;
var melCount = 24;
var freqs = FilterBanks.MelBands(melCount, samplingRate);
var fbank = FilterBanks.Triangular(fftSize, samplingRate, freqs);
FilterBanks.Normalize(melCount, freqs, fbank);
var mfccOptions = new MfccOptions
{
SamplingRate = samplingRate,
FeatureCount = 13,
FrameSize = fftSize,
HopSize = 512,
FilterBank = fbank,
SpectrumType = SpectrumType.Power,
NonLinearity = NonLinearityType.ToDecibel,
DctType = "2N",
FftSize = fftSize,
Window = WindowType.Hann,
LogFloor = 1e-10f
};
var extractor = new MfccExtractor(mfccOptions);
var mfccVectors = extractor.ComputeFrom(signal);
// or
// var mfccVectors = extractor.ParallelComputeFrom(signal);
Scheme:
- Spectrum: power spectrum (default) / magnitude spectrum
- Mel Filterbank: Slaney (default in librosa)
- Nonlinearity: 10*Log10()
- DCT: 2N (default)
librosa (htk=False
, i.e. Slaney mel-filterbank):
librosa.feature.mfcc(
signal,
sample_rate,
n_mfcc=13,
dct_type=2,
norm='ortho',
window='hann',
htk=False,
n_mels=40,
n_fft=1024,
hop_length=512,
center=False)
PyTorch:
n_fft = 1024
win_length = None
hop_length = 512
n_mels = 40
n_mfcc = 13
mfcc_transform = T.MFCC(
sample_rate=sample_rate,
n_mfcc=n_mfcc,
melkwargs={
'n_fft': n_fft,
'n_mels': n_mels,
'hop_length': hop_length,
'mel_scale': 'slaney',
'norm': 'slaney',
'center': False
}
)
mfcc = mfcc_transform(signal)
NWaves:
var fftSize = 1024;
var melCount = 40;
var mfccOptions = new MfccOptions
{
SamplingRate = samplingRate,
FeatureCount = 13,
FrameSize = fftSize,
HopSize = 512,
FilterBank = FilterBanks.MelBankSlaney(melCount, fftSize, samplingRate),
SpectrumType = SpectrumType.Power,
NonLinearity = NonLinearityType.ToDecibel,
DctType = "2N",
FftSize = fftSize,
Window = WindowType.Hann,
LogFloor = 1e-10f
};
var extractor = new MfccExtractor(mfccOptions);
var mfccVectors = extractor.ComputeFrom(signal);
// or
// var mfccVectors = extractor.ParallelComputeFrom(signal);
Scheme:
- Spectrum: power spectrum
- Mel Filterbank: HTK-like
- Nonlinearity: Ln()
- DCT: 2N
- and a lot of other parameters
# some parameters are omitted
# since in most cases their default values are used
torchaudio.compliance.kaldi.mfcc(
signal,
sample_frequency,
num_ceps = 13,
num_mel_bins = 23,
frame_length = 25.0,
frame_shift = 10.0,
low_freq = 20.0,
high_freq = 0.0,
window_type = 'hanning',
cepstral_lifter = 22.0,
preemphasis_coefficient = 0.97,
remove_dc_offset = False,
use_energy = False
)
Let's consider two cases separately:
remove_dc_offset=False
remove_dc_offset=True
In the former case there's no HTK-specific preprocessing of frames, so NWaves settings are simple:
var mfccOptions = new MfccOptions
{
SamplingRate = samplingRate,
FeatureCount = 13,
FrameDuration = 0.025,
HopDuration = 0.01,
FilterBankSize = 23,
LowFrequency = 20,
NonLinearity = NonLinearityType.LogE,
DctType = "2N",
Window = WindowType.Hann,
PreEmphasis = 0.97,
LifterSize = 22
};
var extractor = new MfccExtractor(mfccOptions);
var mfccVectors = extractor.ComputeFrom(signal);
// or
// var mfccVectors = extractor.ParallelComputeFrom(signal);
In the latter case things get trickier. Use the following subclass of MfccExtractor
:
class MfccExtractorKaldi : MfccExtractor
{
private readonly new float[] _windowSamples;
private readonly float _pre;
public MfccExtractorKaldi(MfccOptions options, WindowType window) : base(options)
{
_windowSamples = Window.OfType(window, FrameSize);
_preEmphasis = 0;
_pre = (float)options.PreEmphasis;
}
public override void ProcessFrame(float[] block, float[] features)
{
var mean = block.Take(FrameSize).Average();
for (var i = 0; i < FrameSize; i++)
{
block[i] -= mean;
}
for (var k = FrameSize - 1; k >= 1; k--)
{
block[k] -= block[k - 1] * _pre;
}
block[0] *= 1 - _pre;
block.ApplyWindow(_windowSamples);
base.ProcessFrame(block, features);
}
}
Pay attention to the constructor: its second parameter is the window type. In MFCC options the rectangular window must be specified:
var mfccOptions = new MfccOptions
{
SamplingRate = samplingRate,
FeatureCount = 13,
FrameDuration = 0.025,
HopDuration = 0.01,
FilterBankSize = 23,
LowFrequency = 20,
NonLinearity = NonLinearityType.LogE,
DctType = "2N",
Window = WindowType.Rectangular, // !
PreEmphasis = 0.97,
LifterSize = 22
};
var extractor = new MfccExtractorKaldi(mfccOptions, WindowType.Hann);
var mfccVectors = extractor.ComputeFrom(signal);
// or
// var mfccVectors = extractor.ParallelComputeFrom(signal);
Scheme:
- Spectrum: power spectrum (normalized)
- Mel Filterbank: essentially HTK-like (but slightly different)
- Nonlinearity: Ln()
- DCT: 2N
# s = s / 32768 # if samples are not in range [-1, 1]
# (or don't normalize signal in NWaves)
mfcc(
signal,
samplerate=16000,
winlen=0.025,
winstep=0.01,
numcep=16,
nfilt=26,
nfft=512,
lowfreq=0,
highfreq=None,
preemph=0.97,
ceplifter=22,
appendEnergy=False,
winfunc=numpy.hanning
)
NWaves:
var samplingRate = 16000;
var fftSize = 512;
var melCount = 26;
var mfccOptions = new MfccOptions
{
SamplingRate = samplingRate,
FeatureCount = 16,
FrameDuration = 0.025,
HopDuration = 0.01,
FilterBankSize = melCount,
SpectrumType = SpectrumType.PowerNormalized,
NonLinearity = NonLinearityType.LogE,
DctType = "2N",
FftSize = fftSize,
PreEmphasis = 0.97,
LifterSize = 22,
IncludeEnergy = false,
Window = WindowType.Hann,
};
- Correction for the first coefficient:
- if
appendEnergy=False (IncludeEnergy = false)
// call this on already computed mfccVectors:
for (var i = 0; i < mfccVectors.Count; i++)
{
mfccVectors[i][0] -= (float)(Math.Log(2) * Math.Sqrt(melCount));
}
- if
appendEnergy=True (IncludeEnergy = true)
// call this on already computed mfccVectors:
for (var i = 0; i < mfccVectors.Count; i++)
{
mfccVectors[i][0] -= (float)Math.Log(2);
}
- Set the filterbank explicitly (instead of
FilterBankSize = melCount
)
FilterBank = PsfFilterbank(samplingRate, melCount, fftSize)
.
The code of PsfFilterbank
function:
/// <summary>
/// Generates filterbank with weights identical to python_speech_features.
/// </summary>
float[][] PsfFilterbank(int samplingRate, int filterbankSize, int fftSize, double lowFreq = 0, double highFreq = 0)
{
var filterbank = new float[filterbankSize][];
if (highFreq <= lowFreq)
{
highFreq = samplingRate / 2;
}
var low = NWaves.Utils.Scale.HerzToMel(lowFreq);
var high = NWaves.Utils.Scale.HerzToMel(highFreq);
var res = (fftSize + 1) / (float)samplingRate;
var bins = Enumerable
.Range(0, filterbankSize + 2)
.Select(i => (float)Math.Floor(res * NWaves.Utils.Scale.MelToHerz(low + i * (high - low) / (filterbankSize + 1))))
.ToArray();
for (var i = 0; i < filterbankSize; i++)
{
filterbank[i] = new float[fftSize / 2 + 1];
for (var j = (int)bins[i]; j < (int)bins[i + 1]; j++)
{
filterbank[i][j] = (j - bins[i]) / (bins[i + 1] - bins[i]);
}
for (var j = (int)bins[i + 1]; j < (int)bins[i + 2]; j++)
{
filterbank[i][j] = (bins[i + 2] - j) / (bins[i + 2] - bins[i + 1]);
}
}
return filterbank;
}
pyTorch:
import torchaudio.transforms as T
n_fft = 1024
win_length = None
hop_length = 512
n_mels = 40
mel_spectrogram = T.MelSpectrogram(
sample_rate=sample_rate,
n_fft=n_fft,
win_length=win_length,
hop_length=hop_length,
power=2.0,
n_mels=n_mels,
center=False,
norm=None, # or 'slaney' if in librosa norm='slaney'
mel_scale='htk' # or 'slaney' if in librosa htk=False
)
melspec = mel_spectrogram(waveform)
librosa:
n_fft = 1024
win_length = None
hop_length = 512
n_mels = 40
melspec_librosa = librosa.feature.melspectrogram(
signal,
sr=sample_rate,
n_fft=n_fft,
hop_length=hop_length,
win_length=win_length,
center=False,
power=2.0,
n_mels=n_mels,
norm=None,
htk=True,
)
NWaves:
var fftSize = 1024;
var hopSize = 512;
var melCount = 40;
var mfccExtractor = new FilterbankExtractor(
new FilterbankOptions
{
SamplingRate = samplingRate,
FrameSize = fftSize,
FftSize = fftSize,
HopSize = hopSize,
Window = WindowType.Hann,
FilterBank = FilterBanks.Triangular(fftSize, samplingRate, FilterBanks.MelBands(melCount, samplingRate)),
// if power = 1.0
// SpectrumType = SpectrumType.Magnitude
});
Where possible, just set center=False
in librosa.
In librosa center=True
means that the input signal is padded so that frame t is centered at y[t * hop_length]. Hence, this parameter does not affect MFCC algorithm per se, it simply changes the offset of frames. The simplest solution in NWaves is to set the signal offset for computations and skip first MFCC vectors in librosa (computed from padded part of the signal). The offset can be calculated as follows:
var offset = hopSize - (fftSize2 / 2) % hopSize;
var mfccVectors = extractor.ComputeFrom(signal, offset, signal.Length);
// var mfccVectors = extractor.ParallelComputeFrom(signal, offset, signal.Length);
// for 1-to-1 comparison, skip the first skipCount vectors in librosa results:
// var skipCount = (offset + fftSize / 2) / hopSize;
// for example:
// fftSize = 1024
// hopSize = 400
// offset = 288
// skipCount = 2
// i.e. mfccVectors[0] corresponds to librosa_mfcc[:, 2]
// mfccVectors[1] corresponds to librosa_mfcc[:, 3]
// etc.