# Source code for diffsptk.modules.mfcc

# ------------------------------------------------------------------------ #
# Copyright 2022 SPTK Working Group                                        #
#                                                                          #
# Licensed under the Apache License, Version 2.0 (the "License");          #
# you may not use this file except in compliance with the License.         #
# You may obtain a copy of the License at                                  #
#                                                                          #
#     http://www.apache.org/licenses/LICENSE-2.0                           #
#                                                                          #
# Unless required by applicable law or agreed to in writing, software      #
# distributed under the License is distributed on an "AS IS" BASIS,        #
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. #
# See the License for the specific language governing permissions and      #
# limitations under the License.                                           #
# ------------------------------------------------------------------------ #

import numpy as np
import torch
import torch.nn as nn

from ..misc.utils import numpy_to_torch
from .dct import DiscreteCosineTransform
from .fbank import MelFilterBankAnalysis



class MelFrequencyCepstralCoefficientsAnalysis(nn.Module):
    """See `this page <https://sp-nitech.github.io/sptk/latest/main/mfcc.html>`_
    for details.

    The module chains a mel-filter bank analysis, a discrete cosine transform,
    and a liftering step, then assembles the requested output layout.

    Parameters
    ----------
    mfcc_order : int >= 1
        Order of MFCC, :math:`M`. Must be smaller than ``n_channel``.

    n_channel : int >= 1
        Number of mel-filter banks, :math:`C`.

    fft_length : int >= 2
        Number of FFT bins, :math:`L`.

    sample_rate : int >= 1
        Sample rate in Hz.

    lifter : int >= 1
        Liftering coefficient.

    out_format : ['y', 'yE', 'yc', 'ycE'] or [0, 1, 2, 3]
        `y` is MFCC, `c` is C0, and `E` is energy. The integers 0 to 3 are
        accepted as aliases of the four strings, in the listed order.

    f_min : float >= 0
        Minimum frequency in Hz (forwarded through ``**fbank_kwargs``).

    f_max : float <= sample_rate // 2
        Maximum frequency in Hz (forwarded through ``**fbank_kwargs``).

    floor : float > 0
        Minimum mel-filter bank output in linear scale (forwarded through
        ``**fbank_kwargs``).

    References
    ----------
    .. [1] S. Young et al., "The HTK Book," *Cambridge University Press*, 2006.

    """

    def __init__(
        self,
        mfcc_order,
        n_channel,
        fft_length,
        sample_rate,
        lifter=1,
        out_format="y",
        **fbank_kwargs,
    ):
        super().__init__()

        # forward() keeps the first M + 1 transform outputs (C0 plus M
        # coefficients), so M must be strictly less than the channel count.
        assert 1 <= mfcc_order < n_channel, "mfcc_order must be in [1, n_channel)"
        assert 1 <= lifter, "lifter must be >= 1"

        self.mfcc_order = mfcc_order
        self.formatter = self._formatter(out_format)

        # The filter bank is always asked for both its output and the energy;
        # the formatter decides later whether the energy is kept.
        self.fbank = MelFilterBankAnalysis(
            n_channel, fft_length, sample_rate, out_format="y,E", **fbank_kwargs
        )
        self.dct = DiscreteCosineTransform(n_channel)

        # Liftering weights for indices 0..M. Index 0 multiplies C0, which is
        # not liftered: its weight is overwritten with sqrt(2).
        m = np.arange(self.mfcc_order + 1)
        v = 1 + (lifter / 2) * np.sin((np.pi / lifter) * m)
        v[0] = np.sqrt(2)
        self.register_buffer("liftering_vector", numpy_to_torch(v))

    def forward(self, x):
        """Compute MFCC.

        Parameters
        ----------
        x : Tensor [shape=(..., L/2+1)]
            Power spectrum.

        Returns
        -------
        out : Tensor [shape=(..., M), (..., M+1), or (..., M+2)]
            A single tensor built along the last dimension according to
            ``out_format``: the MFCC without C0 (`y`, M values), optionally
            followed by C0 (`c`, 1 value), optionally followed by the energy
            (`E`, 1 value).

        Examples
        --------
        >>> x = diffsptk.ramp(19)
        >>> stft = diffsptk.STFT(frame_length=10, frame_period=10, fft_length=32)
        >>> mfcc = diffsptk.MFCC(4, 8, 32, 8000)
        >>> y = mfcc(stft(x))
        >>> y
        tensor([[-7.7745e-03, -1.4447e-02,  1.6157e-02,  1.1069e-03],
                [ 2.8049e+00, -1.6257e+00, -2.3566e-02,  1.2804e-01]])

        """
        y, E = self.fbank(x)
        y = self.dct(y)
        # Keep C0 and the first M coefficients, then apply the liftering weights.
        y = y[..., : self.mfcc_order + 1] * self.liftering_vector
        # The leading element is C0; the remaining M elements are the MFCC.
        c, y = torch.split(y, [1, self.mfcc_order], dim=-1)
        return self.formatter(y, c, E)

    @staticmethod
    def _formatter(out_format):
        """Return a callable ``(y, c, E) -> Tensor`` for the given layout.

        Parameters
        ----------
        out_format : ['y', 'yE', 'yc', 'ycE'] or [0, 1, 2, 3]
            Requested output layout.

        Returns
        -------
        formatter : callable
            Function that selects and concatenates its arguments along the
            last dimension.

        Raises
        ------
        ValueError
            If ``out_format`` is not one of the supported values.

        """
        if out_format in (0, "y"):
            return lambda y, c, E: y
        if out_format in (1, "yE"):
            return lambda y, c, E: torch.cat((y, E), dim=-1)
        if out_format in (2, "yc"):
            return lambda y, c, E: torch.cat((y, c), dim=-1)
        if out_format in (3, "ycE"):
            return lambda y, c, E: torch.cat((y, c, E), dim=-1)
        raise ValueError(f"out_format {out_format} is not supported.")