# ------------------------------------------------------------------------ #
# Copyright 2022 SPTK Working Group                                        #
#                                                                          #
# Licensed under the Apache License, Version 2.0 (the "License");          #
# you may not use this file except in compliance with the License.         #
# You may obtain a copy of the License at                                  #
#                                                                          #
#     http://www.apache.org/licenses/LICENSE-2.0                           #
#                                                                          #
# Unless required by applicable law or agreed to in writing, software      #
# distributed under the License is distributed on an "AS IS" BASIS,        #
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. #
# See the License for the specific language governing permissions and      #
# limitations under the License.                                           #
# ------------------------------------------------------------------------ #
from abc import ABCMeta
from abc import abstractmethod
import importlib

import torch
import torch.nn as nn

from ..misc.utils import UNVOICED_SYMBOL
from ..misc.utils import numpy_to_torch
from .frame import Frame
from .stft import ShortTermFourierTransform


class Pitch(nn.Module):
"""Pitch extraction module using external neural models.
Parameters
----------
frame_period : int >= 1 [scalar]
Frame period, :math:`P`.
sample_rate : int >= 1 [scalar]
Sample rate in Hz.
algorithm : ['crepe']
Algorithm.
out_format : ['pitch', 'f0', 'log-f0', 'prob', 'embed']
Output format.
f_min : float >= 0 [scalar]
Minimum frequency in Hz.
f_max : float <= sample_rate // 2 [scalar]
Maximum frequency in Hz.
voicing_threshold : float [scalar]
Voiced/unvoiced threshold.
silence_threshold : float [scalar]
Silence threshold in dB.
filter_length : int >= 1 [scalar]
Window length of median and moving average filters.
model : ['tiny', 'full']
Model size.
"""
def __init__(
self,
frame_period,
sample_rate,
algorithm="crepe",
out_format="pitch",
**kwargs,
):
super(Pitch, self).__init__()
assert 1 <= frame_period
assert 1 <= sample_rate
if algorithm == "crepe":
self.extractor = PitchExtractionByCrepe(frame_period, sample_rate, **kwargs)
else:
raise ValueError(f"algorithm {algorithm} is not supported")
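
        # Convert the extractor's F0 sequence (in Hz) to the requested
        # representation; only voiced frames are converted, and the unvoiced
        # marker is rewritten when a format-specific symbol is given.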
def calc_pitch(x, convert, unvoiced_symbol=UNVOICED_SYMBOL):
with torch.no_grad():
y = self.extractor.calc_pitch(x)
mask = y != UNVOICED_SYMBOL
y[mask] = convert(y[mask])
if unvoiced_symbol != UNVOICED_SYMBOL:
y[~mask] = unvoiced_symbol
return y
        if out_format == 0 or out_format == "pitch":
            # Pitch period in samples per cycle: fs / f0.
            self.convert = lambda x: calc_pitch(x, lambda y: sample_rate / y)
        elif out_format == 1 or out_format == "f0":
            # Fundamental frequency in Hz.
            self.convert = lambda x: calc_pitch(x, lambda y: y)
        elif out_format == 2 or out_format == "log-f0":
            # Log F0; unvoiced frames are marked with a large negative value.
            self.convert = lambda x: calc_pitch(x, lambda y: torch.log(y), -1e10)
        elif out_format == "prob":
            self.convert = lambda x: self.extractor.calc_prob(x)
        elif out_format == "embed":
            self.convert = lambda x: self.extractor.calc_embed(x)
        else:
            raise ValueError(f"out_format {out_format} is not supported")

    def forward(self, x):
"""Compute pitch representation.
Parameters
----------
x : Tensor [shape=(B, T) or (T,)]
Waveform.
Returns
-------
y : Tensor [shape=(B, N, C) or (N, C) or (B, N) or (N,)]
Pitch probability, embedding, or pitch, where N is the number of frames
and C is the number of pitch classes or the dimension of embedding.
Examples
--------
>>> x = diffsptk.sin(100, 10)
>>> pitch = diffsptk.Pitch(80, 16000)
>>> y = pitch(x)
>>> y
tensor([10.0860, 10.0860])
"""
        d = x.dim()
        if d == 1:
            # Add a temporary batch dimension for unbatched input.
            x = x.unsqueeze(0)
        assert x.dim() == 2

        y = self.convert(x)

        if d == 1:
            y = y.squeeze(0)
        return y


class PitchExtractionInterface(metaclass=ABCMeta):
"""Abstract class for pitch extraction."""

    @abstractmethod
    def calc_prob(self, x):
        """Calculate pitch probability.

        Parameters
        ----------
        x : Tensor [shape=(B, T)]
            Waveform.

        Returns
        -------
        y : Tensor [shape=(B, N, C)]
            Probability, where C is the number of pitch classes.

        """

    @abstractmethod
    def calc_embed(self, x):
        """Calculate embedding.

        Parameters
        ----------
        x : Tensor [shape=(B, T)]
            Waveform.

        Returns
        -------
        y : Tensor [shape=(B, N, D)]
            Embedding, where D is the dimension of the embedding.

        """

    @abstractmethod
    def calc_pitch(self, x):
        """Calculate pitch sequence.

        Parameters
        ----------
        x : Tensor [shape=(B, T)]
            Waveform.

        Returns
        -------
        y : Tensor [shape=(B, N)]
            F0 sequence.

        """


class PitchExtractionByCrepe(PitchExtractionInterface, nn.Module):
    """Pitch extraction by CREPE."""

def __init__(
self,
frame_period,
sample_rate,
f_min=0,
f_max=None,
voicing_threshold=1e-2,
silence_threshold=-60,
filter_length=3,
model="full",
):
super(PitchExtractionByCrepe, self).__init__()
self.torchcrepe = importlib.import_module("torchcrepe")
self.f_min = f_min
self.f_max = self.torchcrepe.MAX_FMAX if f_max is None else f_max
self.voicing_threshold = voicing_threshold
self.silence_threshold = silence_threshold
self.filter_length = filter_length
self.model = model
assert 0 <= self.f_min < self.f_max <= sample_rate / 2
assert self.model in ("tiny", "full")
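
        # The pretrained CREPE models in torchcrepe assume a fixed sample rate.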
if sample_rate != self.torchcrepe.SAMPLE_RATE:
raise ValueError(f"Only {self.torchcrepe.SAMPLE_RATE} Hz is supported")
self.frame = Frame(self.torchcrepe.WINDOW_SIZE, frame_period, zmean=True)
self.stft = ShortTermFourierTransform(
self.torchcrepe.WINDOW_SIZE,
frame_period,
self.torchcrepe.WINDOW_SIZE,
norm="none",
window="hanning",
out_format="db",
)
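        # Per-bin perceptual weights from torchcrepe, added to the dB spectrogram
        # to approximate loudness for the silence check in calc_pitch.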
weights = self.torchcrepe.loudness.perceptual_weights().squeeze(-1)
self.register_buffer("weights", numpy_to_torch(weights))

    def forward(self, x, embed=True):
# torchcrepe.preprocess
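        # Frames are zero-mean (zmean=True) and scaled here to unit variance,
        # matching torchcrepe's normalization.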
x = self.frame(x)
x = x / torch.clip(x.std(dim=-1, keepdim=True), min=1e-10)
# torchcrepe.infer
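        # Flatten the batch and frame axes so each frame is a single example.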
B, N, L = x.shape
x = x.reshape(-1, L)
y = self.torchcrepe.infer(x, model=self.model, embed=embed)
y = y.reshape(B, N, -1)
return y

    def calc_prob(self, x):
        return self.forward(x, embed=False)

    def calc_embed(self, x):
        return self.forward(x, embed=True)

def calc_pitch(self, x):
# Compute pitch probabilities.
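        # (B, N, C) -> (B, C, N), the layout expected by torchcrepe.postprocess.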
prob = self.calc_prob(x).mT
# Decode pitch probabilities.
pitch, periodicity = self.torchcrepe.postprocess(
prob,
fmin=self.f_min,
fmax=self.f_max,
decoder=self.torchcrepe.decode.viterbi,
return_harmonicity=False,
return_periodicity=True,
)
# Apply filters.
periodicity = self.torchcrepe.filter.median(periodicity, self.filter_length)
pitch = self.torchcrepe.filter.mean(pitch, self.filter_length)
# Decide voiced/unvoiced.
loudness = self.stft(x) + self.weights
loudness = torch.clip(loudness, min=self.torchcrepe.loudness.MIN_DB)
loudness = loudness.mean(-1)
mask = torch.logical_or(
periodicity < self.voicing_threshold, loudness < self.silence_threshold
)
pitch[mask] = UNVOICED_SYMBOL
return pitch