-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathmel_fbank.py
More file actions
294 lines (248 loc) · 12.1 KB
/
mel_fbank.py
File metadata and controls
294 lines (248 loc) · 12.1 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
# -*- coding:utf-8 -*-
# Author:凌逆战 | Never.Ling
# Date: 2022/9/11
"""
f, t, zxx = signal.stft(wav_preprocessed, fs, window, nperseg, noverlap=nperseg / 2)
"""
import numpy as np
import scipy
import torch
class MelFilterBank():
def __init__(self, nfilter=20, nfft=512, sr=16000, fmin=0, fmax=None, transpose=False):
"""
:param nfilter: filterbank中的滤波器数量
:param nfft: FFT size
:param sr: 采样率
:param fmin: Mel-filter的最低频带边缘
:param highfreq: Mel-filter的最高频带边缘,默认samplerate/2
"""
self.nfilter = nfilter
self.freq_bins = nfft // 2 + 1 # 频点数
fmax = fmax or sr / 2
self.transpose = transpose
# 按梅尔均匀间隔计算 点
lowmel, highmel = self.hz2mel(fmin), self.hz2mel(fmax)
melpoints = np.linspace(lowmel, highmel, nfilter + 2)
self.hz_points = self.mel2hz(melpoints) # 将mel频率再转到hz频率
# print("hz_points", len(self.hz_points), self.hz_points)
# bin = sr/2 / (NFFT+1)/2=sample_rate/(NFFT+1) # 每个频点的频率数
# bins = hz_points/bin=hz_points*NFFT/ sample_rate # hz_points对应第几个fft频点
self.bins_list = np.floor((nfft + 1) * self.hz_points / sr).astype(np.int32)
# print("self.bins_list", len(self.bins_list), self.bins_list)
def hz2mel(self, hz):
""" Hz to Mels """
return 2595 * np.log10(1 + hz / 700.0)
def mel2hz(self, mel):
""" Mels to HZ """
return 700 * (10 ** (mel / 2595.0) - 1)
def get_fbank(self):
""" 计算一个Mel-filterbank (M,F) """
fbank = np.zeros([self.nfilter, self.freq_bins], dtype="float32") # (m,f)
for i in range(0, self.nfilter):
for j in range(self.bins_list[i], self.bins_list[i + 1]):
fbank[i, j] = (j - self.bins_list[i]) / (self.bins_list[i + 1] - self.bins_list[i])
for j in range(self.bins_list[i + 1], self.bins_list[i + 2]):
fbank[i, j] = (self.bins_list[i + 2] - j) / (self.bins_list[i + 2] - self.bins_list[i + 1])
if self.transpose:
fbank = fbank.transpose(-1, -2)
return fbank
def interp_band_gain_numpy(self, mask):
""" 将22个点的mel特征,插值成241个点 (M,T)-->(F,T) """
restore_features = np.zeros((self.freq_bins, mask.shape[-1])) # (F,T)
for i in range(0, self.nfilter):
for j in range(self.bins_list[i], self.bins_list[i + 1]):
side = (j - self.bins_list[i]) / (self.bins_list[i + 1] - self.bins_list[i]) # 升
restore_features[j] += mask[i] * side
for k in range(self.bins_list[i + 1], self.bins_list[i + 2]):
side = (self.bins_list[i + 2] - k) / (self.bins_list[i + 2] - self.bins_list[i + 1]) # 降
restore_features[k] += mask[i] * side
return restore_features
def interp_band_gain(self, mask):
""" 将22个点的mel特征,插值成241个点 (B,M,T)-->(B,F,T) """
restore_features = torch.zeros((mask.shape[0], self.freq_bins, mask.shape[-1]), device=mask.device) # (F,T)
for i in range(0, self.nfilter):
for j in range(self.bins_list[i], self.bins_list[i + 1]):
side = (j - self.bins_list[i]) / (self.bins_list[i + 1] - self.bins_list[i]) # 升
restore_features[:, j] += mask[:, i] * side
for k in range(self.bins_list[i + 1], self.bins_list[i + 2]):
side = (self.bins_list[i + 2] - k) / (self.bins_list[i + 2] - self.bins_list[i + 1]) # 降
restore_features[:, k] += mask[:, i] * side
return restore_features
def get_mfcc(FBank_spec, num_cep=13, dct_type=2, norm='ortho', lifter=22):
"""
:param FBank_spec: (M,T)
:param num_cep: the number of cepstrum to return, default 13
:param dct_type:
:param norm:
:param lifter: apply a lifter to cepstral. 0 is no lifter. Default is 22.
由于系数低位的值过小,需要向前“抬升”,增加了高频DCT系数的幅度。源于线性预测分析。
原文:
he principal advantage of cepstral coefficients is that they are generally decorrelated and this allows
diagonal covariances to be used in the HMMs. However, one minor problem with them is that the higher order
cepstra are numerically quite small and this results in a very wide range of variances when going from the
low to high cepstral coefficients . HTK does not have a problem with this but for pragmatic reasons such as
displaying model parameters, flooring variances, etc, it is convenient to re-scale the cepstral coefficients
to have similar magnitudes. This is done by setting the configuration parameter CEPLIFTER to some value L
to lifter the cepstra according to the following formula.
来源:
http://www.ee.columbia.edu/ln/LabROSA/doc/HTKBook21/node53.html#eceplifter
:return:
"""
log_MelSpec = 10 * np.log10(FBank_spec ** 2 + 1e-10)
mfcc = scipy.fftpack.dct(log_MelSpec, axis=-2, type=dct_type, norm=norm)[:num_cep, :]
if lifter > 0:
ncoeff, nframes = mfcc.shape
# shape lifter for broadcasting
n = np.arange(ncoeff, dtype=mfcc.dtype)
LI = np.sin(np.pi * n / lifter)
LI = librosa.util.expand_to(LI, ndim=FBank_spec.ndim, axes=-2)
mfcc *= 1 + (lifter / 2) * LI
return mfcc
elif lifter == 0:
return mfcc
else:
raise f"MFCC lifter={lifter} must be a non-negative number"
def cal_delta(feat, N=2):
"""
:param feat: MFCC或一阶差分系数 (T,M)
:param N: 间隔帧数,一般为1或2; 对于每个帧,根据前面和N帧计算delta特征
:returns: 差分系数矩阵
https://github.com/Byshx/Poccala/blob/07d59a595eda27bebd04fd215f9de4f5583a777a/StatisticalModel/AudioProcessing.py
https://github.com/dannis999/trained_SpeechRecognition/blob/master/speech_features/base.py
"""
if N < 1:
raise ValueError('N must be an integer >= 1')
num_frames = len(feat)
denominator = 2 * sum([i ** 2 for i in range(1, N + 1)])
delta_feat = np.empty_like(feat)
padded = np.pad(feat, ((N, N), (0, 0)), mode='edge') # padded version of feat
for t in range(num_frames):
delta_feat[t] = np.dot(np.arange(-N, N + 1),
padded[t: t + 2 * N + 1]) / denominator # [t : t+2*N+1] == [(N+t)-N : (N+t)+N+1]
return delta_feat
def get_delta(mfcc, d1=True, d2=None):
assert d1 or d2, "d1 and d2 cannot be None at the same time"
# 计算一阶差分系数
if d1 is True:
delta = cal_delta(mfcc)
coefficient = np.concatenate((mfcc, delta), 1)
# 计算二阶差分系数
if d2 is True:
delta_ = cal_delta(delta)
coefficient = np.concatenate((coefficient, delta_), 1)
return coefficient
def delta(feat, N):
""" 计算差分系数
:param feat: 标准梅尔倒谱系数或一阶差分系数 (M, T)
:param N: 间隔帧数,一般为1或2; 对于每个帧,根据前面和N帧计算delta特征
:returns: 差分系数矩阵
"""
if N < 1:
raise ValueError('N must be an integer >= 1')
feat = feat.T
NUMFRAMES = len(feat)
denominator = 2 * sum([i ** 2 for i in range(1, N + 1)])
delta_feat = np.empty_like(feat)
padded = np.pad(feat, ((N, N), (0, 0)), mode='edge') # padded version of feat
for t in range(NUMFRAMES):
delta_feat[t] = np.dot(np.arange(-N, N + 1),
padded[t: t + 2 * N + 1]) / denominator # [t : t+2*N+1] == [(N+t)-N : (N+t)+N+1]
return delta_feat.T
def Test_Fbank():
sr = 16000
num_filter = 40
NFFT = 512
mel_filter = MelFilterBank(nfilter=num_filter, nfft=NFFT, sr=sr, fmin=0, fmax=None)
FBank = mel_filter.get_fbank() # (M,F)
plt.title(f'mel_fbank_nfft={NFFT}_num_filter={num_filter}')
freqs_num = NFFT // 2 + 1
fs_bin = sr // 2 / (NFFT // 2) # 一个频点多少Hz
x = np.linspace(0, freqs_num, freqs_num)
plt.plot(x * fs_bin, FBank.T)
plt.grid(linestyle='--')
plt.show()
def Test_interp_band():
sr = 16000
num_filter = 40
window_len = NFFT = 512
EPS = 1e-10
mel_filter = MelFilterBank(nfilter=num_filter, nfft=NFFT, samplerate=sr, lowfreq=0, highfreq=None)
FBank = mel_filter.get_fbank() # (M,F)
# filterbank = np.sum(melfilterbank.numpy(), axis=0,keepdims=True) # (M,F)
# filterbank = filterbank/np.sum(melfilterbank, axis=0, keepdims=True)
print("FBank", FBank.shape)
wav = librosa.load("../wav_data/TIMIT.wav", sr=sr)[0]
# spec = librosa.stft(wav, n_fft=NFFT, hop_length=window_len // 2, win_length=window_len)
# mag = np.abs(spec) # (F,T) (257, 183)
freq_axis, time_axis, spec = signal.stft(wav, sr, window="hann", nperseg=window_len, noverlap=window_len // 2)
mag = np.abs(spec) # (257, 184)
# 矩阵相乘获得mel频谱 --------------------------------------
FBank_spec = np.matmul(FBank, mag) # (M,T)
FBank_spec_inverse = np.matmul(FBank.T, FBank_spec) # (F,T)(257, 4)
FBank_spec_inter = mel_filter.interp_band_gain_numpy(FBank_spec) # (M,T)-->(F,T)
print("转置和滤波器插值是否相等", np.allclose(FBank_spec_inverse, FBank_spec_inter))
fig = plt.figure(figsize=(8, 15))
fig.suptitle('mel_fbank_nfft={}_num_filter={}'.format(NFFT, num_filter))
plt.subplot(3, 1, 1)
plt.title("mag")
plt.imshow(10 * np.log10(mag ** 2 + EPS), cmap='jet', aspect='auto', origin='lower') # norm=norm,restore
plt.xlabel('Time/s', fontsize=14)
plt.ylabel('FreqBins/kHz', fontsize=14)
plt.xticks(fontsize=14)
plt.yticks(fontsize=14)
plt.subplot(3, 1, 2)
plt.title("FBank_spec")
plt.imshow(10 * np.log10(FBank_spec ** 2 + EPS), cmap='jet', aspect='auto', origin='lower') # norm=norm
plt.xlabel('Time/s', fontsize=14)
plt.ylabel('FreqBands/kHz', fontsize=14)
plt.xticks(fontsize=14)
plt.yticks(fontsize=14)
plt.subplot(3, 1, 3)
plt.title("FBank_spec_inverse")
plt.imshow(10 * np.log10(FBank_spec_inverse ** 2 + EPS), cmap='jet', aspect='auto', origin='lower') # norm=norm
plt.xlabel('Time/s', fontsize=14)
plt.ylabel('FreqBins/kHz', fontsize=14)
plt.xticks(fontsize=14)
plt.yticks(fontsize=14)
plt.tight_layout()
plt.show()
def Test_mfcc():
sr = 16000
num_filter = 40
num_cep = 40
window_len = NFFT = 512
mel_filter = MelFilterBank(nfilter=num_filter, nfft=NFFT, sr=sr)
FBank = mel_filter.get_fbank() # (M,F) (40,257)
# filterbank = np.sum(melfilterbank.numpy(), axis=0,keepdims=True) # (M,F)
# filterbank = filterbank/np.sum(melfilterbank, axis=0, keepdims=True)
print("FBank", FBank.shape)
wav = librosa.load("../wav_data/TIMIT.wav", sr=sr)[0]
spec = librosa.stft(wav, n_fft=NFFT, hop_length=window_len // 2, win_length=window_len)
mag = np.abs(spec) # (F,T) (257, 183)
# 矩阵相乘获得mel频谱 --------------------------------------
FBank_spec = np.matmul(FBank, mag) # (M,F)*(F,T)=(M,T)
mfcc = get_mfcc(FBank_spec, num_cep=num_cep, dct_type=2, norm='ortho', lifter=22) # (13, 184)
print("mfcc", mfcc.shape) # (13, 183)
# 替换第一个道倒谱系数为对数能量
mfcc[0] = np.log(np.sum(mag, axis=0) + 1e-10)
mfcc_d = delta(mfcc, 2) # 一阶差分
mfcc_dd = delta(mfcc_d, 2) # 二阶差分
# 返回值分别是mfcc特征向量的矩阵及其一阶差分和二阶差分矩阵
wav_feature = np.column_stack((mfcc, mfcc_d, mfcc_dd))
# mfcc = librosa.feature.mfcc(y=wav, sr=sr, n_mfcc=40, n_mels=40, n_fft=NFFT, hop_length=window_len // 2,
# win_length=window_len) # (M,T) (13, 184)
# print("mfcc", mfcc.shape)
plt.imshow(wav_feature, cmap='jet', aspect='auto', origin='lower')
plt.show()
if __name__ == "__main__":
from scipy import signal
import librosa
import librosa.feature
import librosa.display
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import torch
# Test_Fbank()
# Test_interp_band()
Test_mfcc()