-
Notifications
You must be signed in to change notification settings - Fork 1
Open
Description
I found your repo from this issue: jik876/hifi-gan#63
I am still confused about the mismatch in mel spectrogram generation between repos. I collected several methods from different TTS repos, and they differ in ways such as:
- STFT from torch vs librosa
- Log mel with base e vs base 10
- Difference in padding
- Use center or not
def get_mel_librosa1(wave):
    """Mel spectrogram computed in one call to ``librosa.feature.melspectrogram``.

    The waveform is divided by ``max_wav_value`` and cast to float32 before
    being handed to librosa. The result is returned as-is; no log or dB
    compression is applied in this function.
    """
    scaled = (wave / max_wav_value).astype('float32')
    stft_kwargs = dict(
        n_fft=fft_size,
        hop_length=hop_size,
        win_length=win_length,
        window=window_librosa,
    )
    # center, pad_mode and power are left at librosa's defaults (the author
    # noted them as center=True, pad_mode='constant', power=2.0 -- these are
    # version dependent, confirm against the installed librosa).
    return librosa.feature.melspectrogram(
        y=scaled, sr=sampling_rate, n_mels=num_mels, **stft_kwargs
    )
def get_mel_librosa2(wave):
    """Mel spectrogram in dB built from an explicit librosa STFT magnitude.

    Steps: scale by ``max_wav_value`` and cast to float32, take the STFT,
    keep only its magnitude, project it onto the mel scale, then convert to
    decibels using the minimum value as the reference (``ref=np.min``).
    """
    normalized = (wave / max_wav_value).astype('float32')
    complex_stft = librosa.stft(
        normalized,
        n_fft=fft_size,
        hop_length=hop_size,
        win_length=win_length,
        window=window_librosa,
    )
    # magphase returns (magnitude, phase); only the magnitude is needed.
    magnitude = librosa.magphase(complex_stft)[0]
    mel_magnitude = librosa.feature.melspectrogram(
        S=magnitude,
        sr=sampling_rate,
        n_mels=num_mels,
        n_fft=fft_size,
        hop_length=hop_size,
        win_length=win_length,
        window=window_librosa,
    )
    return librosa.amplitude_to_db(mel_magnitude, ref=np.min)
def get_mel_parallelwavegan(wave):
    """Log10 mel spectrogram in the style collected from ParallelWaveGAN.

    The amplitude STFT (librosa, ``center=True``, reflect padding) is
    multiplied by the transposed ``melbasis`` filterbank, floored at ``eps``
    and passed through ``log10``. The result is transposed back before
    returning.
    """
    samples = (wave / max_wav_value).astype('float32')

    # Amplitude spectrogram, laid out as (#frames, #bins).
    stft_matrix = librosa.stft(
        samples,
        n_fft=fft_size,
        hop_length=hop_size,
        win_length=win_length,
        window=window_librosa,
        center=True,
        pad_mode="reflect",
    )
    amplitude = np.abs(stft_matrix).T

    # Floor at eps so log10 never receives zero.
    mel_frames = np.maximum(eps, np.dot(amplitude, melbasis.T))
    log_mel = np.log10(mel_frames)
    return log_mel.T
def get_mel_tacotron2(wave):
    """Mel spectrogram computed with Tacotron 2's ``TacotronSTFT`` helper.

    Args:
        wave: waveform samples; converted to a float tensor and divided by
            ``max_wav_value`` before processing.

    Returns:
        The mel spectrogram as a numpy array with the batch axis squeezed
        out.
    """
    audio_norm = torch.FloatTensor(wave) / max_wav_value
    # TacotronSTFT is fed a batch, so add a leading batch axis of size 1.
    # The old torch.autograd.Variable wrapper is gone: tensors and Variables
    # were merged, and a fresh tensor already has requires_grad=False.
    audio_norm = audio_norm.unsqueeze(0)
    # NOTE(review): the third argument is fft_size, whereas the other
    # extractors in this comparison pass win_length for the window size --
    # confirm whether that is intentional.
    stft = TacotronSTFT(fft_size, hop_size, fft_size, num_mels, sampling_rate, fmin, fmax)
    melspec = stft.mel_spectrogram(audio_norm)
    melspec = torch.squeeze(melspec, 0)
    return melspec.cpu().detach().numpy()
def get_mel_hifigan_origin(y):
    """Natural-log mel spectrogram using the original HiFi-GAN padding.

    The signal is reflect-padded by ``int((fft_size - hop_size) / 2)``
    samples on each side and then framed by ``torch.stft`` with
    ``center=False``. This is the variant reported in the comparison to
    produce one frame fewer than the other methods.

    Args:
        y: waveform samples (assumed to be a 1-D numpy array -- TODO
            confirm); divided by ``max_wav_value`` before processing.

    Returns:
        numpy array holding the log-mel spectrogram of the single batch item.
    """
    y = y / max_wav_value
    # Build the (1, T) float32 batch straight from the array. Wrapping an
    # ndarray in a Python list for torch.FloatTensor goes through a slow
    # conversion path and triggers a warning in recent PyTorch.
    y = torch.as_tensor(y, dtype=torch.float32).unsqueeze(0).to(device)

    pad = int((fft_size - hop_size) / 2)
    y = torch.nn.functional.pad(y.unsqueeze(1), (pad, pad), mode='reflect').squeeze(1)

    # return_complex=False is deprecated in torch.stft; ask for the complex
    # result and view it as (..., 2) real/imag pairs, which is the same
    # layout. pad_mode only matters when center=True, so it is inert here.
    spec = torch.view_as_real(
        torch.stft(
            y,
            fft_size,
            hop_length=hop_size,
            win_length=win_length,
            window=window_torch,
            center=False,
            pad_mode='reflect',
            normalized=False,
            onesided=True,
            return_complex=True,
        )
    )
    # Magnitude, with a small constant under the square root.
    spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-9)

    mel_basis = torch.from_numpy(melbasis).float().to(device)
    spec = torch.matmul(mel_basis, spec)
    # Clamp before the log so it never sees values below 1e-5.
    spec = torch.log(torch.clamp(spec, min=1e-5))
    return spec.cpu().detach().numpy()[0]
def get_mel_hifigan_center(y):
    """Natural-log mel spectrogram, HiFi-GAN style but with ``center=True``.

    Unlike the original HiFi-GAN recipe, no manual padding is applied; the
    padding is delegated to ``torch.stft`` via ``center=True`` with
    ``pad_mode='reflect'``.

    Args:
        y: waveform samples (assumed to be a 1-D numpy array -- TODO
            confirm); divided by ``max_wav_value`` before processing.

    Returns:
        numpy array holding the log-mel spectrogram of the single batch item.
    """
    y = y / max_wav_value
    # Build the (1, T) float32 batch straight from the array. Wrapping an
    # ndarray in a Python list for torch.FloatTensor goes through a slow
    # conversion path and triggers a warning in recent PyTorch.
    y = torch.as_tensor(y, dtype=torch.float32).unsqueeze(0).to(device)

    # No explicit torch.nn.functional.pad here: center=True makes torch.stft
    # pad the signal itself using pad_mode.
    # return_complex=False is deprecated in torch.stft; ask for the complex
    # result and view it as (..., 2) real/imag pairs, which is the same
    # layout.
    spec = torch.view_as_real(
        torch.stft(
            y,
            fft_size,
            hop_length=hop_size,
            win_length=win_length,
            window=window_torch,
            center=True,
            pad_mode='reflect',
            normalized=False,
            onesided=True,
            return_complex=True,
        )
    )
    # Magnitude, with a small constant under the square root.
    spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-9)

    mel_basis = torch.from_numpy(melbasis).float().to(device)
    spec = torch.matmul(mel_basis, spec)
    # Clamp before the log so it never sees values below 1e-5.
    spec = torch.log(torch.clamp(spec, min=1e-5))
    return spec.cpu().detach().numpy()[0]
def get_mel_hifigan_change_pad(y):
    """Natural-log mel spectrogram, HiFi-GAN style with a wider manual pad.

    Same pipeline as the original HiFi-GAN recipe, except the reflect
    padding is ``int(fft_size / 2)`` samples per side instead of
    ``(fft_size - hop_size) / 2``.
    See https://github.com/jik876/hifi-gan/issues/63

    Args:
        y: waveform samples (assumed to be a 1-D numpy array -- TODO
            confirm); divided by ``max_wav_value`` before processing.

    Returns:
        numpy array holding the log-mel spectrogram of the single batch item.
    """
    y = y / max_wav_value
    # Build the (1, T) float32 batch straight from the array. Wrapping an
    # ndarray in a Python list for torch.FloatTensor goes through a slow
    # conversion path and triggers a warning in recent PyTorch.
    y = torch.as_tensor(y, dtype=torch.float32).unsqueeze(0).to(device)

    pad = int(fft_size / 2)
    y = torch.nn.functional.pad(y.unsqueeze(1), (pad, pad), mode='reflect').squeeze(1)

    # return_complex=False is deprecated in torch.stft; ask for the complex
    # result and view it as (..., 2) real/imag pairs, which is the same
    # layout. pad_mode only matters when center=True, so it is inert here.
    spec = torch.view_as_real(
        torch.stft(
            y,
            fft_size,
            hop_length=hop_size,
            win_length=win_length,
            window=window_torch,
            center=False,
            pad_mode='reflect',
            normalized=False,
            onesided=True,
            return_complex=True,
        )
    )
    # Magnitude, with a small constant under the square root.
    spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-9)

    mel_basis = torch.from_numpy(melbasis).float().to(device)
    spec = torch.matmul(mel_basis, spec)
    # Clamp before the log so it never sees values below 1e-5.
    spec = torch.log(torch.clamp(spec, min=1e-5))
    return spec.cpu().detach().numpy()[0]
# Run every extraction variant on the same waveform, in the listed order,
# and bind the results to mel0 .. mel6.
mel0, mel1, mel2, mel3, mel4, mel5, mel6 = [
    extract(wave)
    for extract in (
        get_mel_librosa1,
        get_mel_librosa2,
        get_mel_parallelwavegan,
        get_mel_tacotron2,
        get_mel_hifigan_origin,
        get_mel_hifigan_center,
        get_mel_hifigan_change_pad,
    )
]
(80, 487)
(80, 487)
(80, 487)
(80, 487)
(80, 486)
(80, 487)
(80, 487)
Only the original method from the HiFi-GAN repo gives a different shape: get_mel_hifigan_origin
Do you have any comments on this? When I compare the element values, none of these methods match each other exactly.
One more question: is there any benchmark for these vocoders?
Reactions are currently unavailable
Metadata
Metadata
Assignees
Labels
No labels