Skip to content

Commit

Permalink
dev(narugo): save current code
Browse files Browse the repository at this point in the history
  • Loading branch information
narugo1992 committed Aug 29, 2024
1 parent b73966b commit 25f459c
Show file tree
Hide file tree
Showing 33 changed files with 3,211 additions and 1 deletion.
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -1225,3 +1225,6 @@ fabric.properties
/YOLOv8
.tests
/pretrained_models
/*.wav
/mdist
/onnxs
26 changes: 26 additions & 0 deletions report_dynamo_export.sarif
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
{
"runs":[
{
"tool":{
"driver":{
"name":"torch.onnx.dynamo_export",
"contents":[
"localizedData",
"nonLocalizedData"
],
"language":"en-US",
"rules":[],
"version":"2.4.0+cu121"
}
},
"language":"en-US",
"newlineSequences":[
"\r\n",
"\n"
],
"results":[]
}
],
"version":"2.1.0",
"schemaUri":"https://docs.oasis-open.org/sarif/sarif/v2.1.0/cs01/schemas/sarif-schema-2.1.0.json"
}
3 changes: 2 additions & 1 deletion requirements-test.txt
Original file line number Diff line number Diff line change
Expand Up @@ -16,4 +16,5 @@ pytest-image-diff>=0.0.11
matplotlib
natsort
torchaudio
torch
torch
transformers
2 changes: 2 additions & 0 deletions requirements-zoo.txt
Original file line number Diff line number Diff line change
Expand Up @@ -16,3 +16,5 @@ pyarrow
pandas
langdetect
pyquery
onnxscript
transformers
2 changes: 2 additions & 0 deletions soundutils/data/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1,3 @@
from .mel import mel_to_hertz, hertz_to_mel
from .octave import hertz_to_octave
from .sound import Sound, SoundTyping
125 changes: 125 additions & 0 deletions soundutils/data/mel.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,125 @@
# coding=utf-8
# Copyright 2023 The HuggingFace Inc. team and the librosa & torchaudio authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
This module provides functions for converting between Hertz and Mel frequency scales.
The Mel scale is a perceptual scale of pitches judged by listeners to be equal in distance from one another.
This scale is often used in speech processing and audio analysis tasks.
The module supports three different Mel scale implementations:
1. HTK (Hidden Markov Model Toolkit)
2. Kaldi (Speech Recognition Toolkit)
3. Slaney (Malcolm Slaney's implementation)
Functions:
- hertz_to_mel: Convert frequency from Hertz to Mel scale
- mel_to_hertz: Convert frequency from Mel scale to Hertz
These functions support both single float values and numpy arrays as inputs.
"""

from typing import Union

import numpy as np


def hertz_to_mel(freq: Union[float, np.ndarray], mel_scale: str = "htk") -> Union[float, np.ndarray]:
    """
    Convert frequency from hertz to mels.

    Three mel formulas are available:

    - ``"htk"``: ``2595 * log10(1 + freq / 700)``
    - ``"kaldi"``: ``1127 * ln(1 + freq / 700)``
    - ``"slaney"``: linear (``3 * freq / 200``) below 1000 Hz and logarithmic at or above 1000 Hz.

    :param freq: The frequency, or multiple frequencies, in hertz (Hz). Zero-dimensional arrays are
        handled like scalars.
    :type freq: float or np.ndarray
    :param mel_scale: The mel frequency scale to use. Options are "htk", "kaldi", or "slaney".
    :type mel_scale: str, optional

    :return: The frequencies on the mel scale.
    :rtype: float or np.ndarray

    :raises ValueError: If mel_scale is not one of "htk", "slaney", or "kaldi".

    :Example:

    >>> float(hertz_to_mel(6300.0))
    2595.0
    >>> round(float(hertz_to_mel(1000.0)), 2)
    999.99
    >>> hertz_to_mel(np.array([0.0, 6300.0])).tolist()
    [0.0, 2595.0]
    >>> hertz_to_mel(200.0, mel_scale="slaney")
    3.0
    """
    if mel_scale not in ["slaney", "htk", "kaldi"]:
        raise ValueError('mel_scale should be one of "htk", "slaney" or "kaldi".')

    if mel_scale == "htk":
        return 2595.0 * np.log10(1.0 + (freq / 700.0))
    elif mel_scale == "kaldi":
        return 1127.0 * np.log(1.0 + (freq / 700.0))

    # Slaney scale: linear below ``min_log_hertz``, logarithmic from there on.
    min_log_hertz = 1000.0
    min_log_mel = 15.0
    logstep = 27.0 / np.log(6.4)

    # Only arrays with at least one dimension support masked assignment. Arithmetic on a
    # zero-dimensional array yields a NumPy scalar, so such inputs take the scalar path below.
    if isinstance(freq, np.ndarray) and freq.ndim > 0:
        mels = 3.0 * freq / 200.0
        log_region = freq >= min_log_hertz
        mels[log_region] = min_log_mel + np.log(freq[log_region] / min_log_hertz) * logstep
        return mels

    if freq >= min_log_hertz:
        return min_log_mel + np.log(freq / min_log_hertz) * logstep
    return 3.0 * freq / 200.0


def mel_to_hertz(mels: Union[float, np.ndarray], mel_scale: str = "htk") -> Union[float, np.ndarray]:
    """
    Convert frequency from mels to hertz.

    This is the inverse operation of hertz_to_mel. Three mel formulas are available:

    - ``"htk"``: ``700 * (10 ** (mels / 2595) - 1)``
    - ``"kaldi"``: ``700 * (exp(mels / 1127) - 1)``
    - ``"slaney"``: linear (``200 * mels / 3``) below 15 mels and exponential at or above 15 mels.

    :param mels: The frequency, or multiple frequencies, in mels. Zero-dimensional arrays are
        handled like scalars.
    :type mels: float or np.ndarray
    :param mel_scale: The mel frequency scale to use. Options are "htk", "kaldi", or "slaney".
    :type mel_scale: str, optional

    :return: The frequencies in hertz.
    :rtype: float or np.ndarray

    :raises ValueError: If mel_scale is not one of "htk", "slaney", or "kaldi".

    :Example:

    >>> float(mel_to_hertz(2595.0))
    6300.0
    >>> mel_to_hertz(np.array([0.0, 2595.0])).tolist()
    [0.0, 6300.0]
    >>> mel_to_hertz(3.0, mel_scale="slaney")
    200.0
    """
    if mel_scale not in ["slaney", "htk", "kaldi"]:
        raise ValueError('mel_scale should be one of "htk", "slaney" or "kaldi".')

    if mel_scale == "htk":
        return 700.0 * (np.power(10, mels / 2595.0) - 1.0)
    elif mel_scale == "kaldi":
        return 700.0 * (np.exp(mels / 1127.0) - 1.0)

    # Slaney scale: linear below ``min_log_mel``, exponential from there on.
    min_log_hertz = 1000.0
    min_log_mel = 15.0
    logstep = np.log(6.4) / 27.0

    # Only arrays with at least one dimension support masked assignment. Arithmetic on a
    # zero-dimensional array yields a NumPy scalar, so such inputs take the scalar path below.
    if isinstance(mels, np.ndarray) and mels.ndim > 0:
        freq = 200.0 * mels / 3.0
        log_region = mels >= min_log_mel
        freq[log_region] = min_log_hertz * np.exp(logstep * (mels[log_region] - min_log_mel))
        return freq

    if mels >= min_log_mel:
        return min_log_hertz * np.exp(logstep * (mels - min_log_mel))
    return 200.0 * mels / 3.0
26 changes: 26 additions & 0 deletions soundutils/data/octave.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
from typing import Union, Optional

import numpy as np


def hertz_to_octave(
    freq: Union[float, np.ndarray], tuning: Optional[float] = 0.0, bins_per_octave: Optional[int] = 12
) -> Union[float, np.ndarray]:
    """
    Convert frequency from hertz to fractional octave numbers.

    Adapted from *librosa*.

    The octave number is ``log2(freq / (A / 16))``, where ``A`` is 440 Hz shifted by ``tuning``
    bins, i.e. ``440 * 2 ** (tuning / bins_per_octave)``. With the default tuning, 27.5 Hz maps to
    octave 0 and 440 Hz to octave 4.

    Args:
        freq (`float` or `np.ndarray`):
            The frequency, or multiple frequencies, in hertz (Hz).
        tuning (`float`, *optional*, defaults to `0.`):
            Tuning deviation from the Stuttgart pitch (A440) in (fractional) bins per octave.
            `None` is treated as `0.`.
        bins_per_octave (`int`, *optional*, defaults to `12`):
            Number of bins per octave. `None` is treated as `12`.

    Returns:
        `float` or `np.ndarray`: The frequencies on the octave scale.

    Example:
        >>> float(hertz_to_octave(440.0))
        4.0
        >>> hertz_to_octave(np.array([27.5, 55.0])).tolist()
        [0.0, 1.0]
    """
    # The signature advertises ``Optional`` for both tuning arguments, so ``None`` must fall back
    # to the documented defaults instead of failing in the division below.
    if tuning is None:
        tuning = 0.0
    if bins_per_octave is None:
        bins_per_octave = 12

    stuttgart_pitch = 440.0 * 2.0 ** (tuning / bins_per_octave)
    octave = np.log2(freq / (float(stuttgart_pitch) / 16))
    return octave
Empty file.
1 change: 1 addition & 0 deletions soundutils/preprocess/transformers/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
from .whisper import WhisperFeatureExtractor
Loading

0 comments on commit 25f459c

Please sign in to comment.