# Source code for diffsptk.modules.plp

# ------------------------------------------------------------------------ #
# Copyright 2022 SPTK Working Group                                        #
#                                                                          #
# Licensed under the Apache License, Version 2.0 (the "License");          #
# you may not use this file except in compliance with the License.         #
# You may obtain a copy of the License at                                  #
#                                                                          #
#     http://www.apache.org/licenses/LICENSE-2.0                           #
#                                                                          #
# Unless required by applicable law or agreed to in writing, software      #
# distributed under the License is distributed on an "AS IS" BASIS,        #
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. #
# See the License for the specific language governing permissions and      #
# limitations under the License.                                           #
# ------------------------------------------------------------------------ #

import numpy as np
import torch
import torch.nn as nn

from ..misc.utils import numpy_to_torch
from ..misc.utils import replicate1
from .fbank import MelFilterBankAnalysis
from .levdur import LevinsonDurbin
from .mgc2mgc import MelGeneralizedCepstrumToMelGeneralizedCepstrum


class PerceptualLinearPredictiveCoefficientsAnalysis(nn.Module):
    """See `this page <https://sp-nitech.github.io/sptk/latest/main/plp.html>`_
    for details.

    Parameters
    ----------
    plp_order : int >= 1
        Order of PLP, :math:`M`.

    n_channel : int >= 1
        Number of mel-filter banks, :math:`C`.

    fft_length : int >= 2
        Number of FFT bins, :math:`L`.

    sample_rate : int >= 1
        Sample rate in Hz.

    lifter : int >= 1
        Liftering coefficient.

    compression_factor : float > 0
        Amplitude compression factor.

    n_fft : int >> M
        Number of FFT bins. Accurate conversion requires the large value.

    out_format : ['y', 'yE', 'yc', 'ycE']
        `y` is PLP, `c` is C0, and `E` is energy.

    **fbank_kwargs : additional keyword arguments
        Forwarded to the mel-filter bank analysis, e.g. `f_min` (minimum
        frequency in Hz), `f_max` (maximum frequency in Hz), and `floor`
        (minimum mel-filter bank output in linear scale).

    References
    ----------
    .. [1] S. Young et al., "The HTK Book," *Cambridge University Press*, 2006.

    """

    def __init__(
        self,
        plp_order,
        n_channel,
        fft_length,
        sample_rate,
        lifter=1,
        compression_factor=0.33,
        n_fft=512,
        out_format="y",
        **fbank_kwargs,
    ):
        super(PerceptualLinearPredictiveCoefficientsAnalysis, self).__init__()

        # The analysis order must leave at least one spare channel so that
        # the Levinson-Durbin recursion below is well-posed.
        assert 1 <= plp_order < n_channel
        assert 1 <= lifter
        assert 0 < compression_factor

        self.plp_order = plp_order
        self.compression_factor = compression_factor
        self.formatter = self._formatter(out_format)

        # Mel-filter bank stage operating on the power spectrum; also
        # returns the frame energy (out_format="y,E").
        self.fbank = MelFilterBankAnalysis(
            n_channel,
            fft_length,
            sample_rate,
            use_power=True,
            out_format="y,E",
            **fbank_kwargs,
        )
        # Autocorrelation -> LPC via Levinson-Durbin.
        self.levdur = LevinsonDurbin(self.plp_order)
        # LPC -> cepstrum (in_gamma=-1 selects the all-pole input domain).
        self.lpc2c = MelGeneralizedCepstrumToMelGeneralizedCepstrum(
            self.plp_order,
            self.plp_order,
            in_gamma=-1,
            in_norm=True,
            in_mul=True,
            n_fft=n_fft,
        )

        # Equal-loudness pre-emphasis curve E(f) evaluated at the filter
        # center frequencies (last channel dropped); constants follow the
        # standard PLP formulation.
        f = self.fbank.center_frequencies[:-1] ** 2
        e = (f / (f + 1.6e5)) ** 2 * (f + 1.44e6) / (f + 9.61e6)
        self.register_buffer("equal_loudness_curve", numpy_to_torch(e))

        # HTK-style sinusoidal liftering; the C0 weight is fixed to 2.
        m = np.arange(self.plp_order + 1)
        v = 1 + (lifter / 2) * np.sin((np.pi / lifter) * m)
        v[0] = 2
        self.register_buffer("liftering_vector", numpy_to_torch(v))

    def forward(self, x):
        """Compute PLP.

        Parameters
        ----------
        x : Tensor [shape=(..., L/2+1)]
            Power spectrum.

        Returns
        -------
        y : Tensor [shape=(..., M)]
            PLP without C0.

        E : Tensor [shape=(..., 1)] (optional)
            Energy.

        c : Tensor [shape=(..., 1)] (optional)
            C0.

        Examples
        --------
        >>> x = diffsptk.ramp(19)
        >>> stft = diffsptk.STFT(frame_length=10, frame_period=10, fft_length=32)
        >>> plp = diffsptk.PLP(4, 8, 32, 8000)
        >>> y = plp(stft(x))
        >>> y
        tensor([[-0.2896, -0.2356, -0.0586, -0.0387],
                [ 0.4468, -0.5820,  0.0104, -0.0505]])

        """
        y, E = self.fbank(x)
        # Undo the filter bank's log, apply equal-loudness weighting, then
        # compress amplitudes (intensity-loudness power law).
        y = (torch.exp(y) * self.equal_loudness_curve) ** self.compression_factor
        # Mirror the spectrum so the inverse FFT yields autocorrelation
        # coefficients of a real, even sequence.
        y = replicate1(y)
        y = torch.fft.hfft(y, norm="forward")[..., : self.plp_order + 1].real
        y = self.levdur(y)
        y = self.lpc2c(y)
        y *= self.liftering_vector
        # Split off C0 from the liftered cepstrum.
        c, y = torch.split(y, [1, self.plp_order], dim=-1)
        return self.formatter(y, c, E)

    @staticmethod
    def _formatter(out_format):
        # Map the requested output layout to a function assembling the
        # (PLP, C0, energy) pieces; both integer and string selectors are
        # accepted for SPTK compatibility.
        if out_format == 0 or out_format == "y":
            return lambda y, c, E: y
        elif out_format == 1 or out_format == "yE":
            return lambda y, c, E: torch.cat((y, E), dim=-1)
        elif out_format == 2 or out_format == "yc":
            return lambda y, c, E: torch.cat((y, c), dim=-1)
        elif out_format == 3 or out_format == "ycE":
            return lambda y, c, E: torch.cat((y, c, E), dim=-1)
        raise ValueError(f"out_format {out_format} is not supported.")