Source code for diffsptk.core.pitch

# ------------------------------------------------------------------------ #
# Copyright 2022 SPTK Working Group                                        #
#                                                                          #
# Licensed under the Apache License, Version 2.0 (the "License");          #
# you may not use this file except in compliance with the License.         #
# You may obtain a copy of the License at                                  #
#                                                                          #
#     http://www.apache.org/licenses/LICENSE-2.0                           #
#                                                                          #
# Unless required by applicable law or agreed to in writing, software      #
# distributed under the License is distributed on an "AS IS" BASIS,        #
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. #
# See the License for the specific language governing permissions and      #
# limitations under the License.                                           #
# ------------------------------------------------------------------------ #

from abc import ABCMeta
from abc import abstractmethod
import importlib

import torch
import torch.nn as nn

from ..misc.utils import UNVOICED_SYMBOL
from ..misc.utils import numpy_to_torch
from .frame import Frame
from .stft import ShortTermFourierTransform

class Pitch(nn.Module):
    """Pitch extraction module using external neural models.

    Parameters
    ----------
    frame_period : int >= 1 [scalar]
        Frame period, :math:`P`.

    sample_rate : int >= 1 [scalar]
        Sample rate in Hz.

    algorithm : ['crepe']
        Algorithm.

    out_format : ['pitch', 'f0', 'log-f0', 'prob', 'embed']
        Output format. The integers 0, 1, and 2 are accepted as aliases of
        'pitch', 'f0', and 'log-f0', respectively.

    f_min : float >= 0 [scalar]
        Minimum frequency in Hz.

    f_max : float <= sample_rate // 2 [scalar]
        Maximum frequency in Hz.

    voicing_threshold : float [scalar]
        Voiced/unvoiced threshold.

    silence_threshold : float [scalar]
        Silence threshold in dB.

    filter_length : int >= 1 [scalar]
        Window length of median and moving average filters.

    model : ['tiny', 'full']
        Model size.

    Notes
    -----
    Every keyword argument other than ``algorithm`` and ``out_format`` is
    forwarded unchanged to the selected extractor.

    """

    def __init__(
        self,
        frame_period,
        sample_rate,
        algorithm="crepe",
        out_format="pitch",
        **kwargs,
    ):
        super().__init__()

        assert 1 <= frame_period
        assert 1 <= sample_rate

        # Only one backend exists so far; reject anything else up front.
        if algorithm != "crepe":
            raise ValueError(f"algorithm {algorithm} is not supported")
        self.extractor = PitchExtractionByCrepe(frame_period, sample_rate, **kwargs)

        def extract(waveform, transform, fill_value=UNVOICED_SYMBOL):
            # Pitch decoding is performed without tracking gradients.
            with torch.no_grad():
                f0 = self.extractor.calc_pitch(waveform)
                voiced = f0 != UNVOICED_SYMBOL
                # Convert voiced frames only; unvoiced frames keep their
                # symbol unless a different fill value is requested.
                f0[voiced] = transform(f0[voiced])
                if fill_value != UNVOICED_SYMBOL:
                    f0[~voiced] = fill_value
            return f0

        if out_format in (0, "pitch"):
            # Pitch is expressed as a period in samples.
            self.convert = lambda x: extract(x, lambda f0: sample_rate / f0)
        elif out_format in (1, "f0"):
            self.convert = lambda x: extract(x, lambda f0: f0)
        elif out_format in (2, "log-f0"):
            # Unvoiced frames are filled with a large negative value.
            self.convert = lambda x: extract(x, torch.log, -1e10)
        elif out_format == "prob":
            self.convert = lambda x: self.extractor.calc_prob(x)
        elif out_format == "embed":
            self.convert = lambda x: self.extractor.calc_embed(x)
        else:
            raise ValueError(f"out_format {out_format} is not supported")

    def forward(self, x):
        """Compute pitch representation.

        Parameters
        ----------
        x : Tensor [shape=(B, T) or (T,)]
            Waveform.

        Returns
        -------
        y : Tensor [shape=(B, N, C) or (N, C) or (B, N) or (N,)]
            Pitch probability, embedding, or pitch, where N is the number of
            frames and C is the number of pitch classes or the dimension of
            embedding.

        Examples
        --------
        >>> x = diffsptk.sin(100, 10)
        >>> pitch = diffsptk.Pitch(80, 16000)
        >>> y = pitch(x)
        >>> y
        tensor([10.0860, 10.0860])

        """
        # A 1-D input is treated as a batch of one and unbatched afterwards.
        is_unbatched = x.dim() == 1
        if is_unbatched:
            x = x.unsqueeze(0)
        assert x.dim() == 2

        y = self.convert(x)
        return y.squeeze(0) if is_unbatched else y
class PitchExtractionInterface(metaclass=ABCMeta):
    """Abstract class for pitch extraction.

    Concrete extractors must implement all three methods below.

    """

    @abstractmethod
    def calc_prob(self, x):
        """Calculate pitch probability.

        Parameters
        ----------
        x : Tensor [shape=(B, T)]
            Waveform.

        Returns
        -------
        y : Tensor [shape=(B, N, C)]
            Probability, where C is the number of pitch classes.

        """

    @abstractmethod
    def calc_embed(self, x):
        """Calculate embedding.

        Parameters
        ----------
        x : Tensor [shape=(B, T)]
            Waveform.

        Returns
        -------
        y : Tensor [shape=(B, N, D)]
            Embedding, where D is the dimension of embedding.

        """

    @abstractmethod
    def calc_pitch(self, x):
        """Calculate pitch sequence.

        Parameters
        ----------
        x : Tensor [shape=(B, T)]
            Waveform.

        Returns
        -------
        y : Tensor [shape=(B, N)]
            F0 sequence.

        """


class PitchExtractionByCrepe(PitchExtractionInterface, nn.Module):
    """Pitch extraction by CREPE.

    The ``torchcrepe`` package is imported lazily at construction time, so it
    is required only when this extractor is actually instantiated.

    Parameters
    ----------
    frame_period : int >= 1 [scalar]
        Frame period.

    sample_rate : int >= 1 [scalar]
        Sample rate in Hz. It must equal ``torchcrepe.SAMPLE_RATE``.

    f_min : float >= 0 [scalar]
        Minimum frequency in Hz.

    f_max : float <= sample_rate / 2 [scalar]
        Maximum frequency in Hz. If None, ``torchcrepe.MAX_FMAX`` is used.

    voicing_threshold : float [scalar]
        Frames whose periodicity is below this value are unvoiced.

    silence_threshold : float [scalar]
        Frames whose loudness in dB is below this value are unvoiced.

    filter_length : int >= 1 [scalar]
        Window length of median and moving average filters.

    model : ['tiny', 'full']
        Model size.

    """

    def __init__(
        self,
        frame_period,
        sample_rate,
        f_min=0,
        f_max=None,
        voicing_threshold=1e-2,
        silence_threshold=-60,
        filter_length=3,
        model="full",
    ):
        super().__init__()

        crepe = importlib.import_module("torchcrepe")
        self.torchcrepe = crepe

        if f_max is None:
            f_max = crepe.MAX_FMAX

        self.f_min = f_min
        self.f_max = f_max
        self.voicing_threshold = voicing_threshold
        self.silence_threshold = silence_threshold
        self.filter_length = filter_length
        self.model = model

        assert 0 <= self.f_min < self.f_max <= sample_rate / 2
        assert self.model in ("tiny", "full")

        if sample_rate != crepe.SAMPLE_RATE:
            raise ValueError(f"Only {self.torchcrepe.SAMPLE_RATE} Hz is supported")

        window_size = crepe.WINDOW_SIZE

        # Framing for the network input.
        self.frame = Frame(window_size, frame_period, zmean=True)

        # Spectrum in dB, used only for the loudness-based silence decision.
        self.stft = ShortTermFourierTransform(
            window_size,
            frame_period,
            window_size,
            norm="none",
            window="hanning",
            out_format="db",
        )

        perceptual_weights = crepe.loudness.perceptual_weights().squeeze(-1)
        self.register_buffer("weights", numpy_to_torch(perceptual_weights))

    def forward(self, x, embed=True):
        """Run the CREPE network on a framed waveform.

        Parameters
        ----------
        x : Tensor [shape=(B, T)]
            Waveform.

        embed : bool [scalar]
            Forwarded to ``torchcrepe.infer`` as its ``embed`` argument.

        Returns
        -------
        y : Tensor [shape=(B, N, C) or (B, N, D)]
            Network output, reshaped to one row per frame.

        """
        # Counterpart of torchcrepe.preprocess: frame, then scale each frame
        # by its standard deviation (clipped to avoid division by zero).
        frames = self.frame(x)
        scale = torch.clip(frames.std(dim=-1, keepdim=True), min=1e-10)
        frames = frames / scale

        # Counterpart of torchcrepe.infer: the batch and frame axes are
        # flattened for the network call and restored afterwards.
        batch_size, num_frames, frame_length = frames.shape
        flat_frames = frames.reshape(-1, frame_length)
        output = self.torchcrepe.infer(flat_frames, model=self.model, embed=embed)
        return output.reshape(batch_size, num_frames, -1)

    def calc_prob(self, x):
        """Calculate pitch probability by calling ``forward`` with ``embed=False``."""
        return self.forward(x, embed=False)

    def calc_embed(self, x):
        """Calculate embedding by calling ``forward`` with ``embed=True``."""
        return self.forward(x, embed=True)

    def calc_pitch(self, x):
        """Calculate pitch sequence.

        Parameters
        ----------
        x : Tensor [shape=(B, T)]
            Waveform.

        Returns
        -------
        y : Tensor [shape=(B, N)]
            F0 sequence, where unvoiced frames are set to the unvoiced symbol.

        """
        # Compute pitch probabilities with the last two axes swapped.
        prob = self.calc_prob(x).mT

        # Decode pitch probabilities by Viterbi decoding.
        crepe = self.torchcrepe
        pitch, periodicity = crepe.postprocess(
            prob,
            fmin=self.f_min,
            fmax=self.f_max,
            decoder=crepe.decode.viterbi,
            return_harmonicity=False,
            return_periodicity=True,
        )

        # Smooth periodicity with a median filter and pitch with a mean filter.
        periodicity = crepe.filter.median(periodicity, self.filter_length)
        pitch = crepe.filter.mean(pitch, self.filter_length)

        # Perceptually weighted loudness per frame, floored at MIN_DB.
        weighted = self.stft(x) + self.weights
        weighted = torch.clip(weighted, min=crepe.loudness.MIN_DB)
        loudness = weighted.mean(-1)

        # A frame is unvoiced if it is either aperiodic or silent.
        is_aperiodic = periodicity < self.voicing_threshold
        is_silent = loudness < self.silence_threshold
        unvoiced = torch.logical_or(is_aperiodic, is_silent)
        pitch[unvoiced] = UNVOICED_SYMBOL
        return pitch