Source code for diffsptk.modules.mcpf

# ------------------------------------------------------------------------ #
# Copyright 2022 SPTK Working Group                                        #
#                                                                          #
# Licensed under the Apache License, Version 2.0 (the "License");          #
# you may not use this file except in compliance with the License.         #
# You may obtain a copy of the License at                                  #
#                                                                          #
#     http://www.apache.org/licenses/LICENSE-2.0                           #
#                                                                          #
# Unless required by applicable law or agreed to in writing, software      #
# distributed under the License is distributed on an "AS IS" BASIS,        #
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. #
# See the License for the specific language governing permissions and      #
# limitations under the License.                                           #
# ------------------------------------------------------------------------ #

import inspect

import torch
from torch import nn

from ..typing import Callable, Precomputed
from ..utils.private import get_layer, get_values
from .b2mc import MLSADigitalFilterCoefficientsToMelCepstrum
from .base import BaseFunctionalModule
from .c2acr import CepstrumToAutocorrelation
from .freqt import FrequencyTransform
from .mc2b import MelCepstrumToMLSADigitalFilterCoefficients


class MelCepstrumPostfiltering(BaseFunctionalModule):
    """See `this page <https://sp-nitech.github.io/sptk/latest/main/mcpf.html>`_
    for details.

    Parameters
    ----------
    cep_order : int >= 0
        The order of the mel-cepstrum, :math:`M`.

    alpha : float in (-1, 1)
        The frequency warping factor, :math:`\\alpha`.

    beta : float
        The intensity parameter, :math:`\\beta`.

    onset : int >= 0
        The onset index.

    ir_length : int >= 1
        The length of the impulse response.

    References
    ----------
    .. [1] T. Yoshimura et al., "Incorporating a mixed excitation model and
           postfilter into HMM-based text-to-speech synthesis," *Systems and
           Computers in Japan*, vol. 36, no. 12, pp. 43-50, 2005.

    """

    def __init__(
        self,
        cep_order: int,
        alpha: float = 0,
        beta: float = 0,
        onset: int = 2,
        ir_length: int = 128,
    ) -> None:
        super().__init__()

        _, layers, tensors = self._precompute(*get_values(locals()))
        self.layers = nn.ModuleList(layers)
        self.register_buffer("weight", tensors[0])
    def forward(self, mc: torch.Tensor) -> torch.Tensor:
        """Perform mel-cepstrum postfiltering.

        Parameters
        ----------
        mc : Tensor [shape=(..., M+1)]
            The input mel-cepstral coefficients.

        Returns
        -------
        out : Tensor [shape=(..., M+1)]
            The postfiltered mel-cepstral coefficients.

        Examples
        --------
        >>> X = diffsptk.nrand(4).square()
        >>> X
        tensor([0.2725, 2.5650, 0.3552, 0.3757, 0.1904])
        >>> mcep = diffsptk.MelCepstralAnalysis(3, 8, 0.1)
        >>> mcpf = diffsptk.MelCepstrumPostfiltering(3, 0.1, 0.2)
        >>> mc1 = mcep(X)
        >>> mc1
        tensor([-0.2819,  0.3486, -0.2487, -0.3600])
        >>> mc2 = mcpf(mc1)
        >>> mc2
        tensor([-0.3256,  0.3486, -0.2984, -0.4320])

        """
        return self._forward(mc, *self.layers, **self._buffers)
    @staticmethod
    def _func(mc: torch.Tensor, *args, **kwargs) -> torch.Tensor:
        _, layers, tensors = MelCepstrumPostfiltering._precompute(
            mc.size(-1) - 1, *args, **kwargs, device=mc.device, dtype=mc.dtype
        )
        return MelCepstrumPostfiltering._forward(mc, *layers, *tensors)

    @staticmethod
    def _takes_input_size() -> bool:
        return True

    @staticmethod
    def _check(onset: int) -> None:
        if onset < 0:
            raise ValueError("onset must be non-negative.")

    @staticmethod
    def _precompute(
        cep_order: int,
        alpha: float,
        beta: float,
        onset: int,
        ir_length: int,
        device: torch.device | None = None,
        dtype: torch.dtype | None = None,
    ) -> Precomputed:
        MelCepstrumPostfiltering._check(onset)
        module = inspect.stack()[1].function != "_func"

        freqt = get_layer(
            module,
            FrequencyTransform,
            dict(
                in_order=cep_order,
                out_order=ir_length - 1,
                alpha=-alpha,
            ),
        )
        c2acr = get_layer(
            module,
            CepstrumToAutocorrelation,
            dict(
                cep_order=ir_length - 1,
                acr_order=0,
                n_fft=ir_length,
            ),
        )
        mc2b = get_layer(
            module,
            MelCepstrumToMLSADigitalFilterCoefficients,
            dict(cep_order=cep_order, alpha=alpha),
        )
        b2mc = get_layer(
            module,
            MLSADigitalFilterCoefficientsToMelCepstrum,
            dict(cep_order=cep_order, alpha=alpha),
        )

        # Weight is 1 below the onset index and (1 + beta) at and beyond it.
        weight = torch.full((cep_order + 1,), 1 + beta, device=device, dtype=dtype)
        weight[:onset] = 1
        return None, (freqt, c2acr, mc2b, b2mc), (weight,)

    @staticmethod
    def _forward(
        mc: torch.Tensor,
        freqt: Callable,
        c2acr: Callable,
        mc2b: Callable,
        b2mc: Callable,
        weight: torch.Tensor,
    ) -> torch.Tensor:
        # Energy of the original mel-cepstrum (zeroth autocorrelation).
        mc1 = mc
        e1 = c2acr(freqt(mc1))

        # Emphasize the coefficients and compute the resulting energy.
        mc2 = mc * weight
        e2 = c2acr(freqt(mc2))

        # Adjust the gain term of the MLSA filter coefficients to preserve energy.
        b2 = mc2b(mc2)
        b2[..., :1] += 0.5 * torch.log(e1 / e2)

        mc2 = b2mc(b2)
        return mc2
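A minimal usage sketch, mirroring the Examples section in the forward docstring above. The constructor arguments follow the same pattern as that example; the settings (cepstral order 4, FFT length 16, alpha 0.42, beta 0.3) and variable names are purely illustrative.

    import diffsptk

    # Toy power spectrum with 16 / 2 + 1 = 9 bins.
    X = diffsptk.nrand(8).square()

    # Mel-cepstral analysis followed by postfiltering with the same warping factor.
    mcep = diffsptk.MelCepstralAnalysis(4, 16, 0.42)
    mcpf = diffsptk.MelCepstrumPostfiltering(4, alpha=0.42, beta=0.3)

    mc = mcep(X)      # mel-cepstral coefficients, shape (5,)
    mc2 = mcpf(mc)    # postfiltered coefficients, same shape

Increasing beta strengthens the emphasis of the coefficients at and beyond the onset index, while the gain correction in _forward keeps the energy of the postfiltered mel-cepstrum matched to the original.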