From c2d283a64a7f33547952e3eb0fa6533fc375bcdd Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 3 Jan 2024 09:12:53 +0100 Subject: [PATCH 001/820] Bump tj-actions/changed-files from 22.2 to 41 in /.github/workflows (#28311) Bumps [tj-actions/changed-files](https://github.com/tj-actions/changed-files) from 22.2 to 41. - [Release notes](https://github.com/tj-actions/changed-files/releases) - [Changelog](https://github.com/tj-actions/changed-files/blob/main/HISTORY.md) - [Commits](https://github.com/tj-actions/changed-files/compare/v22.2...v41) --- updated-dependencies: - dependency-name: tj-actions/changed-files dependency-type: direct:production ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .github/workflows/self-push-caller.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/self-push-caller.yml b/.github/workflows/self-push-caller.yml index 9247848b89ec6d..14b5262426b452 100644 --- a/.github/workflows/self-push-caller.yml +++ b/.github/workflows/self-push-caller.yml @@ -25,7 +25,7 @@ jobs: - name: Get changed files id: changed-files - uses: tj-actions/changed-files@v22.2 + uses: tj-actions/changed-files@v41 - name: Was setup changed id: was_changed From 6eba901d88909b4635b02e284723a71a675ebaa0 Mon Sep 17 00:00:00 2001 From: lain <70411813+not-lain@users.noreply.github.com> Date: Wed, 3 Jan 2024 18:20:34 +0100 Subject: [PATCH 002/820] fix documentation for zero_shot_object_detection (#28267) remove broken space --- docs/source/en/tasks/zero_shot_object_detection.md | 8 -------- 1 file changed, 8 deletions(-) diff --git a/docs/source/en/tasks/zero_shot_object_detection.md b/docs/source/en/tasks/zero_shot_object_detection.md index 3dfefb3c8b5e66..7af6bc3dc38441 100644 --- a/docs/source/en/tasks/zero_shot_object_detection.md +++ b/docs/source/en/tasks/zero_shot_object_detection.md @@ -299,11 
+299,3 @@ as before except now there are no labels. Cats with bounding boxes -If you'd like to interactively try out inference with OWL-ViT, check out this demo: - - From d83ff5eeff1d5360052a66074dde1f87bd9f3a21 Mon Sep 17 00:00:00 2001 From: Connor Henderson Date: Wed, 3 Jan 2024 13:01:06 -0500 Subject: [PATCH 003/820] Add FastSpeech2Conformer (#23439) * start - docs, SpeechT5 copy and rename * add relevant code from FastSpeech2 draft, have tests pass * make it an actual conformer, demo ex. * matching inference with original repo, includes debug code * refactor nn.Sequentials, start more desc. var names * more renaming * more renaming * vocoder scratchwork * matching vocoder outputs * hifigan vocoder conversion script * convert model script, rename some config vars * replace postnet with speecht5's implementation * passing common tests, file cleanup * expand testing, add output hidden states and attention * tokenizer + passing tokenizer tests * variety of updates and tests * g2p_en pckg setup * import structure edits * docstrings and cleanup * repo consistency * deps * small cleanup * forward signature param order * address comments except for masks and labels * address comments on attention_mask and labels * address second round of comments * remove old unneeded line * address comments part 1 * address comments pt 2 * rename auto mapping * fixes for failing tests * address comments part 3 (bart-like, train loss) * make style * pass config where possible * add forward method + tests to WithHifiGan model * make style * address arg passing and generate_speech comments * address Arthur comments * address Arthur comments pt2 * lint changes * Sanchit comment * add g2p-en to doctest deps * move up self.encoder * onnx compatible tensor method * fix is symbolic * fix paper url * move models to espnet org * make style * make fix-copies * update docstring * Arthur comments * update docstring w/ new updates * add model architecture images * header size * md wording update * 
make style --- .circleci/create_circleci_config.py | 1 + README.md | 1 + README_es.md | 1 + README_hd.md | 1 + README_ja.md | 1 + README_ko.md | 1 + README_zh-hans.md | 1 + README_zh-hant.md | 1 + docs/source/en/_toctree.yml | 2 + docs/source/en/index.md | 1 + .../en/model_doc/fastspeech2_conformer.md | 134 ++ docs/source/en/tasks/text-to-speech.md | 6 +- docs/source/fr/index.md | 2 + src/transformers/__init__.py | 47 +- src/transformers/file_utils.py | 1 + src/transformers/models/__init__.py | 1 + .../models/auto/configuration_auto.py | 3 + src/transformers/models/auto/modeling_auto.py | 3 + .../models/auto/tokenization_auto.py | 13 +- .../models/fastspeech2_conformer/__init__.py | 77 + .../configuration_fastspeech2_conformer.py | 488 +++++ ..._original_pytorch_checkpoint_to_pytorch.py | 210 ++ .../fastspeech2_conformer/convert_hifigan.py | 134 ++ .../convert_model_with_hifigan.py | 102 + .../modeling_fastspeech2_conformer.py | 1686 +++++++++++++++++ .../tokenization_fastspeech2_conformer.py | 198 ++ .../models/speecht5/modeling_speecht5.py | 2 + src/transformers/testing_utils.py | 8 + src/transformers/utils/__init__.py | 1 + src/transformers/utils/dummy_pt_objects.py | 31 + src/transformers/utils/import_utils.py | 13 +- .../models/fastspeech2_conformer/__init__.py | 0 .../test_modeling_fastspeech2_conformer.py | 790 ++++++++ ...test_tokenization_fastspeech2_conformer.py | 190 ++ utils/check_config_attributes.py | 1 + utils/check_repo.py | 4 + 36 files changed, 4140 insertions(+), 16 deletions(-) create mode 100644 docs/source/en/model_doc/fastspeech2_conformer.md create mode 100644 src/transformers/models/fastspeech2_conformer/__init__.py create mode 100644 src/transformers/models/fastspeech2_conformer/configuration_fastspeech2_conformer.py create mode 100644 src/transformers/models/fastspeech2_conformer/convert_fastspeech2_conformer_original_pytorch_checkpoint_to_pytorch.py create mode 100644 src/transformers/models/fastspeech2_conformer/convert_hifigan.py 
create mode 100644 src/transformers/models/fastspeech2_conformer/convert_model_with_hifigan.py create mode 100644 src/transformers/models/fastspeech2_conformer/modeling_fastspeech2_conformer.py create mode 100644 src/transformers/models/fastspeech2_conformer/tokenization_fastspeech2_conformer.py create mode 100644 tests/models/fastspeech2_conformer/__init__.py create mode 100644 tests/models/fastspeech2_conformer/test_modeling_fastspeech2_conformer.py create mode 100644 tests/models/fastspeech2_conformer/test_tokenization_fastspeech2_conformer.py diff --git a/.circleci/create_circleci_config.py b/.circleci/create_circleci_config.py index 41e83d87438ea0..8fd237a5ae7471 100644 --- a/.circleci/create_circleci_config.py +++ b/.circleci/create_circleci_config.py @@ -515,6 +515,7 @@ def job_name(self): "pip install -U --upgrade-strategy eager -e git+https://github.com/huggingface/accelerate@main#egg=accelerate", "pip install --upgrade --upgrade-strategy eager pytest pytest-sugar", "pip install -U --upgrade-strategy eager natten", + "pip install -U --upgrade-strategy eager g2p-en", "find -name __pycache__ -delete", "find . -name \*.pyc -delete", # Add an empty file to keep the test step running correctly even no file is selected to be tested. diff --git a/README.md b/README.md index 0a45a99fd6bf7d..0e726850a8b20a 100644 --- a/README.md +++ b/README.md @@ -358,6 +358,7 @@ Current number of checkpoints: ![](https://img.shields.io/endpoint?url=https://h 1. **[ErnieM](https://huggingface.co/docs/transformers/model_doc/ernie_m)** (from Baidu) released with the paper [ERNIE-M: Enhanced Multilingual Representation by Aligning Cross-lingual Semantics with Monolingual Corpora](https://arxiv.org/abs/2012.15674) by Xuan Ouyang, Shuohuan Wang, Chao Pang, Yu Sun, Hao Tian, Hua Wu, Haifeng Wang. 1. **[ESM](https://huggingface.co/docs/transformers/model_doc/esm)** (from Meta AI) are transformer protein language models. 
**ESM-1b** was released with the paper [Biological structure and function emerge from scaling unsupervised learning to 250 million protein sequences](https://www.pnas.org/content/118/15/e2016239118) by Alexander Rives, Joshua Meier, Tom Sercu, Siddharth Goyal, Zeming Lin, Jason Liu, Demi Guo, Myle Ott, C. Lawrence Zitnick, Jerry Ma, and Rob Fergus. **ESM-1v** was released with the paper [Language models enable zero-shot prediction of the effects of mutations on protein function](https://doi.org/10.1101/2021.07.09.450648) by Joshua Meier, Roshan Rao, Robert Verkuil, Jason Liu, Tom Sercu and Alexander Rives. **ESM-2 and ESMFold** were released with the paper [Language models of protein sequences at the scale of evolution enable accurate structure prediction](https://doi.org/10.1101/2022.07.20.500902) by Zeming Lin, Halil Akin, Roshan Rao, Brian Hie, Zhongkai Zhu, Wenting Lu, Allan dos Santos Costa, Maryam Fazel-Zarandi, Tom Sercu, Sal Candido, Alexander Rives. 1. **[Falcon](https://huggingface.co/docs/transformers/model_doc/falcon)** (from Technology Innovation Institute) by Almazrouei, Ebtesam and Alobeidli, Hamza and Alshamsi, Abdulaziz and Cappelli, Alessandro and Cojocaru, Ruxandra and Debbah, Merouane and Goffinet, Etienne and Heslow, Daniel and Launay, Julien and Malartic, Quentin and Noune, Badreddine and Pannier, Baptiste and Penedo, Guilherme. +1. **[FastSpeech2Conformer](https://huggingface.co/docs/transformers/model_doc/fastspeech2_conformer)** (from ESPnet) released with the paper [Recent Developments On Espnet Toolkit Boosted By Conformer](https://arxiv.org/abs/2010.13956) by Pengcheng Guo, Florian Boyer, Xuankai Chang, Tomoki Hayashi, Yosuke Higuchi, Hirofumi Inaguma, Naoyuki Kamo, Chenda Li, Daniel Garcia-Romero, Jiatong Shi, Jing Shi, Shinji Watanabe, Kun Wei, Wangyou Zhang, and Yuekai Zhang. 1. 
**[FLAN-T5](https://huggingface.co/docs/transformers/model_doc/flan-t5)** (from Google AI) released in the repository [google-research/t5x](https://github.com/google-research/t5x/blob/main/docs/models.md#flan-t5-checkpoints) by Hyung Won Chung, Le Hou, Shayne Longpre, Barret Zoph, Yi Tay, William Fedus, Eric Li, Xuezhi Wang, Mostafa Dehghani, Siddhartha Brahma, Albert Webson, Shixiang Shane Gu, Zhuyun Dai, Mirac Suzgun, Xinyun Chen, Aakanksha Chowdhery, Sharan Narang, Gaurav Mishra, Adams Yu, Vincent Zhao, Yanping Huang, Andrew Dai, Hongkun Yu, Slav Petrov, Ed H. Chi, Jeff Dean, Jacob Devlin, Adam Roberts, Denny Zhou, Quoc V. Le, and Jason Wei 1. **[FLAN-UL2](https://huggingface.co/docs/transformers/model_doc/flan-ul2)** (from Google AI) released in the repository [google-research/t5x](https://github.com/google-research/t5x/blob/main/docs/models.md#flan-ul2-checkpoints) by Hyung Won Chung, Le Hou, Shayne Longpre, Barret Zoph, Yi Tay, William Fedus, Eric Li, Xuezhi Wang, Mostafa Dehghani, Siddhartha Brahma, Albert Webson, Shixiang Shane Gu, Zhuyun Dai, Mirac Suzgun, Xinyun Chen, Aakanksha Chowdhery, Sharan Narang, Gaurav Mishra, Adams Yu, Vincent Zhao, Yanping Huang, Andrew Dai, Hongkun Yu, Slav Petrov, Ed H. Chi, Jeff Dean, Jacob Devlin, Adam Roberts, Denny Zhou, Quoc V. Le, and Jason Wei 1. **[FlauBERT](https://huggingface.co/docs/transformers/model_doc/flaubert)** (from CNRS) released with the paper [FlauBERT: Unsupervised Language Model Pre-training for French](https://arxiv.org/abs/1912.05372) by Hang Le, Loïc Vial, Jibril Frej, Vincent Segonne, Maximin Coavoux, Benjamin Lecouteux, Alexandre Allauzen, Benoît Crabbé, Laurent Besacier, Didier Schwab. diff --git a/README_es.md b/README_es.md index 2fe82606b928c3..31af97949dc3a9 100644 --- a/README_es.md +++ b/README_es.md @@ -333,6 +333,7 @@ Número actual de puntos de control: ![](https://img.shields.io/endpoint?url=htt 1. 
**[ErnieM](https://huggingface.co/docs/transformers/model_doc/ernie_m)** (from Baidu) released with the paper [ERNIE-M: Enhanced Multilingual Representation by Aligning Cross-lingual Semantics with Monolingual Corpora](https://arxiv.org/abs/2012.15674) by Xuan Ouyang, Shuohuan Wang, Chao Pang, Yu Sun, Hao Tian, Hua Wu, Haifeng Wang. 1. **[ESM](https://huggingface.co/docs/transformers/model_doc/esm)** (from Meta AI) are transformer protein language models. **ESM-1b** was released with the paper [Biological structure and function emerge from scaling unsupervised learning to 250 million protein sequences](https://www.pnas.org/content/118/15/e2016239118) by Alexander Rives, Joshua Meier, Tom Sercu, Siddharth Goyal, Zeming Lin, Jason Liu, Demi Guo, Myle Ott, C. Lawrence Zitnick, Jerry Ma, and Rob Fergus. **ESM-1v** was released with the paper [Language models enable zero-shot prediction of the effects of mutations on protein function](https://doi.org/10.1101/2021.07.09.450648) by Joshua Meier, Roshan Rao, Robert Verkuil, Jason Liu, Tom Sercu and Alexander Rives. **ESM-2** was released with the paper [Language models of protein sequences at the scale of evolution enable accurate structure prediction](https://doi.org/10.1101/2022.07.20.500902) by Zeming Lin, Halil Akin, Roshan Rao, Brian Hie, Zhongkai Zhu, Wenting Lu, Allan dos Santos Costa, Maryam Fazel-Zarandi, Tom Sercu, Sal Candido, Alexander Rives. 1. **[Falcon](https://huggingface.co/docs/transformers/model_doc/falcon)** (from Technology Innovation Institute) by Almazrouei, Ebtesam and Alobeidli, Hamza and Alshamsi, Abdulaziz and Cappelli, Alessandro and Cojocaru, Ruxandra and Debbah, Merouane and Goffinet, Etienne and Heslow, Daniel and Launay, Julien and Malartic, Quentin and Noune, Badreddine and Pannier, Baptiste and Penedo, Guilherme. +1. 
**[FastSpeech2Conformer](https://huggingface.co/docs/transformers/model_doc/fastspeech2_conformer)** (from ESPnet) released with the paper [Recent Developments On Espnet Toolkit Boosted By Conformer](https://arxiv.org/abs/2010.13956) by Pengcheng Guo, Florian Boyer, Xuankai Chang, Tomoki Hayashi, Yosuke Higuchi, Hirofumi Inaguma, Naoyuki Kamo, Chenda Li, Daniel Garcia-Romero, Jiatong Shi, Jing Shi, Shinji Watanabe, Kun Wei, Wangyou Zhang, and Yuekai Zhang. 1. **[FLAN-T5](https://huggingface.co/docs/transformers/model_doc/flan-t5)** (from Google AI) released in the repository [google-research/t5x](https://github.com/google-research/t5x/blob/main/docs/models.md#flan-t5-checkpoints) by Hyung Won Chung, Le Hou, Shayne Longpre, Barret Zoph, Yi Tay, William Fedus, Eric Li, Xuezhi Wang, Mostafa Dehghani, Siddhartha Brahma, Albert Webson, Shixiang Shane Gu, Zhuyun Dai, Mirac Suzgun, Xinyun Chen, Aakanksha Chowdhery, Sharan Narang, Gaurav Mishra, Adams Yu, Vincent Zhao, Yanping Huang, Andrew Dai, Hongkun Yu, Slav Petrov, Ed H. Chi, Jeff Dean, Jacob Devlin, Adam Roberts, Denny Zhou, Quoc V. Le, and Jason Wei 1. **[FLAN-UL2](https://huggingface.co/docs/transformers/model_doc/flan-ul2)** (from Google AI) released in the repository [google-research/t5x](https://github.com/google-research/t5x/blob/main/docs/models.md#flan-ul2-checkpoints) by Hyung Won Chung, Le Hou, Shayne Longpre, Barret Zoph, Yi Tay, William Fedus, Eric Li, Xuezhi Wang, Mostafa Dehghani, Siddhartha Brahma, Albert Webson, Shixiang Shane Gu, Zhuyun Dai, Mirac Suzgun, Xinyun Chen, Aakanksha Chowdhery, Sharan Narang, Gaurav Mishra, Adams Yu, Vincent Zhao, Yanping Huang, Andrew Dai, Hongkun Yu, Slav Petrov, Ed H. Chi, Jeff Dean, Jacob Devlin, Adam Roberts, Denny Zhou, Quoc V. Le, and Jason Wei 1. 
**[FlauBERT](https://huggingface.co/docs/transformers/model_doc/flaubert)** (from CNRS) released with the paper [FlauBERT: Unsupervised Language Model Pre-training for French](https://arxiv.org/abs/1912.05372) by Hang Le, Loïc Vial, Jibril Frej, Vincent Segonne, Maximin Coavoux, Benjamin Lecouteux, Alexandre Allauzen, Benoît Crabbé, Laurent Besacier, Didier Schwab. diff --git a/README_hd.md b/README_hd.md index 35e21548e6063f..9d9a9b2656ec16 100644 --- a/README_hd.md +++ b/README_hd.md @@ -307,6 +307,7 @@ conda install -c huggingface transformers 1. **[ErnieM](https://huggingface.co/docs/transformers/model_doc/ernie_m)** (Baidu से) Xuan Ouyang, Shuohuan Wang, Chao Pang, Yu Sun, Hao Tian, Hua Wu, Haifeng Wang. द्वाराअनुसंधान पत्र [ERNIE-M: Enhanced Multilingual Representation by Aligning Cross-lingual Semantics with Monolingual Corpora](https://arxiv.org/abs/2012.15674) के साथ जारी किया गया 1. **[ESM](https://huggingface.co/docs/transformers/model_doc/esm)** (मेटा AI से) ट्रांसफॉर्मर प्रोटीन भाषा मॉडल हैं। **ESM-1b** पेपर के साथ जारी किया गया था [ अलेक्जेंडर राइव्स, जोशुआ मेयर, टॉम सर्कु, सिद्धार्थ गोयल, ज़ेमिंग लिन द्वारा जैविक संरचना और कार्य असुरक्षित सीखने को 250 मिलियन प्रोटीन अनुक्रमों तक स्केल करने से उभरता है] (https://www.pnas.org/content/118/15/e2016239118) जेसन लियू, डेमी गुओ, मायल ओट, सी. लॉरेंस ज़िटनिक, जेरी मा और रॉब फर्गस। **ESM-1v** को पेपर के साथ जारी किया गया था [भाषा मॉडल प्रोटीन फ़ंक्शन पर उत्परिवर्तन के प्रभावों की शून्य-शॉट भविष्यवाणी को सक्षम करते हैं] (https://doi.org/10.1101/2021.07.09.450648) जोशुआ मेयर, रोशन राव, रॉबर्ट वेरकुइल, जेसन लियू, टॉम सर्कु और अलेक्जेंडर राइव्स द्वारा। **ESM-2** को पेपर के साथ जारी किया गया था [भाषा मॉडल विकास के पैमाने पर प्रोटीन अनुक्रम सटीक संरचना भविष्यवाणी को सक्षम करते हैं](https://doi.org/10.1101/2022.07.20.500902) ज़ेमिंग लिन, हलील अकिन, रोशन राव, ब्रायन ही, झोंगकाई झू, वेंटिंग लू, ए द्वारा लान डॉस सैंटोस कोस्टा, मरियम फ़ज़ल-ज़रंडी, टॉम सर्कू, साल कैंडिडो, अलेक्जेंडर राइव्स। 1. 
**[Falcon](https://huggingface.co/docs/transformers/model_doc/falcon)** (from Technology Innovation Institute) by Almazrouei, Ebtesam and Alobeidli, Hamza and Alshamsi, Abdulaziz and Cappelli, Alessandro and Cojocaru, Ruxandra and Debbah, Merouane and Goffinet, Etienne and Heslow, Daniel and Launay, Julien and Malartic, Quentin and Noune, Badreddine and Pannier, Baptiste and Penedo, Guilherme. +1. **[FastSpeech2Conformer](https://huggingface.co/docs/transformers/model_doc/fastspeech2_conformer)** (ESPnet से) Pengcheng Guo, Florian Boyer, Xuankai Chang, Tomoki Hayashi, Yosuke Higuchi, Hirofumi Inaguma, Naoyuki Kamo, Chenda Li, Daniel Garcia-Romero, Jiatong Shi, Jing Shi, Shinji Watanabe, Kun Wei, Wangyou Zhang, and Yuekai Zhang. द्वाराअनुसंधान पत्र [Recent Developments On Espnet Toolkit Boosted By Conformer](https://arxiv.org/abs/2010.13956) के साथ जारी किया गया 1. **[FLAN-T5](https://huggingface.co/docs/transformers/model_doc/flan-t5)** (from Google AI) released in the repository [google-research/t5x](https://github.com/google-research/t5x/blob/main/docs/models.md#flan-t5-checkpoints) by Hyung Won Chung, Le Hou, Shayne Longpre, Barret Zoph, Yi Tay, William Fedus, Eric Li, Xuezhi Wang, Mostafa Dehghani, Siddhartha Brahma, Albert Webson, Shixiang Shane Gu, Zhuyun Dai, Mirac Suzgun, Xinyun Chen, Aakanksha Chowdhery, Sharan Narang, Gaurav Mishra, Adams Yu, Vincent Zhao, Yanping Huang, Andrew Dai, Hongkun Yu, Slav Petrov, Ed H. Chi, Jeff Dean, Jacob Devlin, Adam Roberts, Denny Zhou, Quoc V. Le, and Jason Wei 1. 
**[FLAN-UL2](https://huggingface.co/docs/transformers/model_doc/flan-ul2)** (from Google AI) released in the repository [google-research/t5x](https://github.com/google-research/t5x/blob/main/docs/models.md#flan-ul2-checkpoints) by Hyung Won Chung, Le Hou, Shayne Longpre, Barret Zoph, Yi Tay, William Fedus, Eric Li, Xuezhi Wang, Mostafa Dehghani, Siddhartha Brahma, Albert Webson, Shixiang Shane Gu, Zhuyun Dai, Mirac Suzgun, Xinyun Chen, Aakanksha Chowdhery, Sharan Narang, Gaurav Mishra, Adams Yu, Vincent Zhao, Yanping Huang, Andrew Dai, Hongkun Yu, Slav Petrov, Ed H. Chi, Jeff Dean, Jacob Devlin, Adam Roberts, Denny Zhou, Quoc V. Le, and Jason Wei 1. **[FlauBERT](https://huggingface.co/docs/transformers/model_doc/flaubert)** (CNRS से) साथ वाला पेपर [FlauBERT: Unsupervised Language Model Pre-training for फ़्रेंच](https://arxiv .org/abs/1912.05372) Hang Le, Loïc Vial, Jibril Frej, Vincent Segonne, Maximin Coavoux, बेंजामिन लेकोउटेक्स, अलेक्जेंड्रे अल्लाउज़ेन, बेनोइट क्रैबे, लॉरेंट बेसेसियर, डिडिएर श्वाब द्वारा। diff --git a/README_ja.md b/README_ja.md index b87767cf37156a..bf37bf8eff34db 100644 --- a/README_ja.md +++ b/README_ja.md @@ -367,6 +367,7 @@ Flax、PyTorch、TensorFlowをcondaでインストールする方法は、それ 1. **[ErnieM](https://huggingface.co/docs/transformers/model_doc/ernie_m)** (Baidu から) Xuan Ouyang, Shuohuan Wang, Chao Pang, Yu Sun, Hao Tian, Hua Wu, Haifeng Wang. から公開された研究論文 [ERNIE-M: Enhanced Multilingual Representation by Aligning Cross-lingual Semantics with Monolingual Corpora](https://arxiv.org/abs/2012.15674) 1. **[ESM](https://huggingface.co/docs/transformers/model_doc/esm)** (Meta AI から) はトランスフォーマープロテイン言語モデルです. **ESM-1b** は Alexander Rives, Joshua Meier, Tom Sercu, Siddharth Goyal, Zeming Lin, Jason Liu, Demi Guo, Myle Ott, C. Lawrence Zitnick, Jerry Ma, and Rob Fergus から公開された研究論文: [Biological structure and function emerge from scaling unsupervised learning to 250 million protein sequences](https://www.pnas.org/content/118/15/e2016239118). 
**ESM-1v** は Joshua Meier, Roshan Rao, Robert Verkuil, Jason Liu, Tom Sercu and Alexander Rives から公開された研究論文: [Language models enable zero-shot prediction of the effects of mutations on protein function](https://doi.org/10.1101/2021.07.09.450648). **ESM-2** と **ESMFold** は Zeming Lin, Halil Akin, Roshan Rao, Brian Hie, Zhongkai Zhu, Wenting Lu, Allan dos Santos Costa, Maryam Fazel-Zarandi, Tom Sercu, Sal Candido, Alexander Rives から公開された研究論文: [Language models of protein sequences at the scale of evolution enable accurate structure prediction](https://doi.org/10.1101/2022.07.20.500902) 1. **[Falcon](https://huggingface.co/docs/transformers/model_doc/falcon)** (from Technology Innovation Institute) by Almazrouei, Ebtesam and Alobeidli, Hamza and Alshamsi, Abdulaziz and Cappelli, Alessandro and Cojocaru, Ruxandra and Debbah, Merouane and Goffinet, Etienne and Heslow, Daniel and Launay, Julien and Malartic, Quentin and Noune, Badreddine and Pannier, Baptiste and Penedo, Guilherme. +1. **[FastSpeech2Conformer](https://huggingface.co/docs/transformers/model_doc/fastspeech2_conformer)** (ESPnet から) Pengcheng Guo, Florian Boyer, Xuankai Chang, Tomoki Hayashi, Yosuke Higuchi, Hirofumi Inaguma, Naoyuki Kamo, Chenda Li, Daniel Garcia-Romero, Jiatong Shi, Jing Shi, Shinji Watanabe, Kun Wei, Wangyou Zhang, and Yuekai Zhang. から公開された研究論文 [Recent Developments On Espnet Toolkit Boosted By Conformer](https://arxiv.org/abs/2010.13956) 1. **[FLAN-T5](https://huggingface.co/docs/transformers/model_doc/flan-t5)** (Google AI から) Hyung Won Chung, Le Hou, Shayne Longpre, Barret Zoph, Yi Tay, William Fedus, Eric Li, Xuezhi Wang, Mostafa Dehghani, Siddhartha Brahma, Albert Webson, Shixiang Shane Gu, Zhuyun Dai, Mirac Suzgun, Xinyun Chen, Aakanksha Chowdhery, Sharan Narang, Gaurav Mishra, Adams Yu, Vincent Zhao, Yanping Huang, Andrew Dai, Hongkun Yu, Slav Petrov, Ed H. 
Chi, Jeff Dean, Jacob Devlin, Adam Roberts, Denny Zhou, Quoc V から公開されたレポジトリー [google-research/t5x](https://github.com/google-research/t5x/blob/main/docs/models.md#flan-t5-checkpoints) Le, and Jason Wei 1. **[FLAN-UL2](https://huggingface.co/docs/transformers/model_doc/flan-ul2)** (from Google AI) released in the repository [google-research/t5x](https://github.com/google-research/t5x/blob/main/docs/models.md#flan-ul2-checkpoints) by Hyung Won Chung, Le Hou, Shayne Longpre, Barret Zoph, Yi Tay, William Fedus, Eric Li, Xuezhi Wang, Mostafa Dehghani, Siddhartha Brahma, Albert Webson, Shixiang Shane Gu, Zhuyun Dai, Mirac Suzgun, Xinyun Chen, Aakanksha Chowdhery, Sharan Narang, Gaurav Mishra, Adams Yu, Vincent Zhao, Yanping Huang, Andrew Dai, Hongkun Yu, Slav Petrov, Ed H. Chi, Jeff Dean, Jacob Devlin, Adam Roberts, Denny Zhou, Quoc V. Le, and Jason Wei 1. **[FlauBERT](https://huggingface.co/docs/transformers/model_doc/flaubert)** (CNRS から) Hang Le, Loïc Vial, Jibril Frej, Vincent Segonne, Maximin Coavoux, Benjamin Lecouteux, Alexandre Allauzen, Benoît Crabbé, Laurent Besacier, Didier Schwab から公開された研究論文: [FlauBERT: Unsupervised Language Model Pre-training for French](https://arxiv.org/abs/1912.05372) diff --git a/README_ko.md b/README_ko.md index cd71488d1f455b..aac582a82c5c5a 100644 --- a/README_ko.md +++ b/README_ko.md @@ -282,6 +282,7 @@ Flax, PyTorch, TensorFlow 설치 페이지에서 이들을 conda로 설치하는 1. **[ErnieM](https://huggingface.co/docs/transformers/model_doc/ernie_m)** (Baidu 에서 제공)은 Xuan Ouyang, Shuohuan Wang, Chao Pang, Yu Sun, Hao Tian, Hua Wu, Haifeng Wang.의 [ERNIE-M: Enhanced Multilingual Representation by Aligning Cross-lingual Semantics with Monolingual Corpora](https://arxiv.org/abs/2012.15674)논문과 함께 발표했습니다. 1. **[ESM](https://huggingface.co/docs/transformers/model_doc/esm)** (from Meta AI) are transformer protein language models. 
**ESM-1b** was released with the paper [Biological structure and function emerge from scaling unsupervised learning to 250 million protein sequences](https://www.pnas.org/content/118/15/e2016239118) by Alexander Rives, Joshua Meier, Tom Sercu, Siddharth Goyal, Zeming Lin, Jason Liu, Demi Guo, Myle Ott, C. Lawrence Zitnick, Jerry Ma, and Rob Fergus. **ESM-1v** was released with the paper [Language models enable zero-shot prediction of the effects of mutations on protein function](https://doi.org/10.1101/2021.07.09.450648) by Joshua Meier, Roshan Rao, Robert Verkuil, Jason Liu, Tom Sercu and Alexander Rives. **ESM-2** was released with the paper [Language models of protein sequences at the scale of evolution enable accurate structure prediction](https://doi.org/10.1101/2022.07.20.500902) by Zeming Lin, Halil Akin, Roshan Rao, Brian Hie, Zhongkai Zhu, Wenting Lu, Allan dos Santos Costa, Maryam Fazel-Zarandi, Tom Sercu, Sal Candido, Alexander Rives. 1. **[Falcon](https://huggingface.co/docs/transformers/model_doc/falcon)** (from Technology Innovation Institute) by Almazrouei, Ebtesam and Alobeidli, Hamza and Alshamsi, Abdulaziz and Cappelli, Alessandro and Cojocaru, Ruxandra and Debbah, Merouane and Goffinet, Etienne and Heslow, Daniel and Launay, Julien and Malartic, Quentin and Noune, Badreddine and Pannier, Baptiste and Penedo, Guilherme. +1. **[FastSpeech2Conformer](https://huggingface.co/docs/transformers/model_doc/fastspeech2_conformer)** (ESPnet 에서 제공)은 Pengcheng Guo, Florian Boyer, Xuankai Chang, Tomoki Hayashi, Yosuke Higuchi, Hirofumi Inaguma, Naoyuki Kamo, Chenda Li, Daniel Garcia-Romero, Jiatong Shi, Jing Shi, Shinji Watanabe, Kun Wei, Wangyou Zhang, and Yuekai Zhang.의 [Recent Developments On Espnet Toolkit Boosted By Conformer](https://arxiv.org/abs/2010.13956)논문과 함께 발표했습니다. 1. 
**[FLAN-T5](https://huggingface.co/docs/transformers/model_doc/flan-t5)** (from Google AI) released in the repository [google-research/t5x](https://github.com/google-research/t5x/blob/main/docs/models.md#flan-t5-checkpoints) by Hyung Won Chung, Le Hou, Shayne Longpre, Barret Zoph, Yi Tay, William Fedus, Eric Li, Xuezhi Wang, Mostafa Dehghani, Siddhartha Brahma, Albert Webson, Shixiang Shane Gu, Zhuyun Dai, Mirac Suzgun, Xinyun Chen, Aakanksha Chowdhery, Sharan Narang, Gaurav Mishra, Adams Yu, Vincent Zhao, Yanping Huang, Andrew Dai, Hongkun Yu, Slav Petrov, Ed H. Chi, Jeff Dean, Jacob Devlin, Adam Roberts, Denny Zhou, Quoc V. Le, and Jason Wei 1. **[FLAN-UL2](https://huggingface.co/docs/transformers/model_doc/flan-ul2)** (from Google AI) released in the repository [google-research/t5x](https://github.com/google-research/t5x/blob/main/docs/models.md#flan-ul2-checkpoints) by Hyung Won Chung, Le Hou, Shayne Longpre, Barret Zoph, Yi Tay, William Fedus, Eric Li, Xuezhi Wang, Mostafa Dehghani, Siddhartha Brahma, Albert Webson, Shixiang Shane Gu, Zhuyun Dai, Mirac Suzgun, Xinyun Chen, Aakanksha Chowdhery, Sharan Narang, Gaurav Mishra, Adams Yu, Vincent Zhao, Yanping Huang, Andrew Dai, Hongkun Yu, Slav Petrov, Ed H. Chi, Jeff Dean, Jacob Devlin, Adam Roberts, Denny Zhou, Quoc V. Le, and Jason Wei 1. **[FlauBERT](https://huggingface.co/docs/transformers/model_doc/flaubert)** (from CNRS) released with the paper [FlauBERT: Unsupervised Language Model Pre-training for French](https://arxiv.org/abs/1912.05372) by Hang Le, Loïc Vial, Jibril Frej, Vincent Segonne, Maximin Coavoux, Benjamin Lecouteux, Alexandre Allauzen, Benoît Crabbé, Laurent Besacier, Didier Schwab. diff --git a/README_zh-hans.md b/README_zh-hans.md index 4f3258ecde1860..3cb3143f28f7d9 100644 --- a/README_zh-hans.md +++ b/README_zh-hans.md @@ -306,6 +306,7 @@ conda install -c huggingface transformers 1. 
**[ErnieM](https://huggingface.co/docs/transformers/model_doc/ernie_m)** (来自 Baidu) 伴随论文 [ERNIE-M: Enhanced Multilingual Representation by Aligning Cross-lingual Semantics with Monolingual Corpora](https://arxiv.org/abs/2012.15674) 由 Xuan Ouyang, Shuohuan Wang, Chao Pang, Yu Sun, Hao Tian, Hua Wu, Haifeng Wang 发布。 1. **[ESM](https://huggingface.co/docs/transformers/model_doc/esm)** (from Meta AI) are transformer protein language models. **ESM-1b** was released with the paper [Biological structure and function emerge from scaling unsupervised learning to 250 million protein sequences](https://www.pnas.org/content/118/15/e2016239118) by Alexander Rives, Joshua Meier, Tom Sercu, Siddharth Goyal, Zeming Lin, Jason Liu, Demi Guo, Myle Ott, C. Lawrence Zitnick, Jerry Ma, and Rob Fergus. **ESM-1v** was released with the paper [Language models enable zero-shot prediction of the effects of mutations on protein function](https://doi.org/10.1101/2021.07.09.450648) by Joshua Meier, Roshan Rao, Robert Verkuil, Jason Liu, Tom Sercu and Alexander Rives. **ESM-2** was released with the paper [Language models of protein sequences at the scale of evolution enable accurate structure prediction](https://doi.org/10.1101/2022.07.20.500902) by Zeming Lin, Halil Akin, Roshan Rao, Brian Hie, Zhongkai Zhu, Wenting Lu, Allan dos Santos Costa, Maryam Fazel-Zarandi, Tom Sercu, Sal Candido, Alexander Rives. 1. **[Falcon](https://huggingface.co/docs/transformers/model_doc/falcon)** (from Technology Innovation Institute) by Almazrouei, Ebtesam and Alobeidli, Hamza and Alshamsi, Abdulaziz and Cappelli, Alessandro and Cojocaru, Ruxandra and Debbah, Merouane and Goffinet, Etienne and Heslow, Daniel and Launay, Julien and Malartic, Quentin and Noune, Badreddine and Pannier, Baptiste and Penedo, Guilherme. +1. 
**[FastSpeech2Conformer](https://huggingface.co/docs/transformers/model_doc/fastspeech2_conformer)** (来自 ESPnet) 伴随论文 [Recent Developments On Espnet Toolkit Boosted By Conformer](https://arxiv.org/abs/2010.13956) 由 Pengcheng Guo, Florian Boyer, Xuankai Chang, Tomoki Hayashi, Yosuke Higuchi, Hirofumi Inaguma, Naoyuki Kamo, Chenda Li, Daniel Garcia-Romero, Jiatong Shi, Jing Shi, Shinji Watanabe, Kun Wei, Wangyou Zhang, and Yuekai Zhang 发布。 1. **[FLAN-T5](https://huggingface.co/docs/transformers/model_doc/flan-t5)** (from Google AI) released in the repository [google-research/t5x](https://github.com/google-research/t5x/blob/main/docs/models.md#flan-t5-checkpoints) by Hyung Won Chung, Le Hou, Shayne Longpre, Barret Zoph, Yi Tay, William Fedus, Eric Li, Xuezhi Wang, Mostafa Dehghani, Siddhartha Brahma, Albert Webson, Shixiang Shane Gu, Zhuyun Dai, Mirac Suzgun, Xinyun Chen, Aakanksha Chowdhery, Sharan Narang, Gaurav Mishra, Adams Yu, Vincent Zhao, Yanping Huang, Andrew Dai, Hongkun Yu, Slav Petrov, Ed H. Chi, Jeff Dean, Jacob Devlin, Adam Roberts, Denny Zhou, Quoc V. Le, and Jason Wei 1. **[FLAN-UL2](https://huggingface.co/docs/transformers/model_doc/flan-ul2)** (from Google AI) released in the repository [google-research/t5x](https://github.com/google-research/t5x/blob/main/docs/models.md#flan-ul2-checkpoints) by Hyung Won Chung, Le Hou, Shayne Longpre, Barret Zoph, Yi Tay, William Fedus, Eric Li, Xuezhi Wang, Mostafa Dehghani, Siddhartha Brahma, Albert Webson, Shixiang Shane Gu, Zhuyun Dai, Mirac Suzgun, Xinyun Chen, Aakanksha Chowdhery, Sharan Narang, Gaurav Mishra, Adams Yu, Vincent Zhao, Yanping Huang, Andrew Dai, Hongkun Yu, Slav Petrov, Ed H. Chi, Jeff Dean, Jacob Devlin, Adam Roberts, Denny Zhou, Quoc V. Le, and Jason Wei 1. 
**[FlauBERT](https://huggingface.co/docs/transformers/model_doc/flaubert)** (来自 CNRS) 伴随论文 [FlauBERT: Unsupervised Language Model Pre-training for French](https://arxiv.org/abs/1912.05372) 由 Hang Le, Loïc Vial, Jibril Frej, Vincent Segonne, Maximin Coavoux, Benjamin Lecouteux, Alexandre Allauzen, Benoît Crabbé, Laurent Besacier, Didier Schwab 发布。 diff --git a/README_zh-hant.md b/README_zh-hant.md index 407c4e952b763a..6d57842e2e6a2d 100644 --- a/README_zh-hant.md +++ b/README_zh-hant.md @@ -318,6 +318,7 @@ conda install -c huggingface transformers 1. **[ErnieM](https://huggingface.co/docs/transformers/model_doc/ernie_m)** (from Baidu) released with the paper [ERNIE-M: Enhanced Multilingual Representation by Aligning Cross-lingual Semantics with Monolingual Corpora](https://arxiv.org/abs/2012.15674) by Xuan Ouyang, Shuohuan Wang, Chao Pang, Yu Sun, Hao Tian, Hua Wu, Haifeng Wang. 1. **[ESM](https://huggingface.co/docs/transformers/model_doc/esm)** (from Meta AI) are transformer protein language models. **ESM-1b** was released with the paper [Biological structure and function emerge from scaling unsupervised learning to 250 million protein sequences](https://www.pnas.org/content/118/15/e2016239118) by Alexander Rives, Joshua Meier, Tom Sercu, Siddharth Goyal, Zeming Lin, Jason Liu, Demi Guo, Myle Ott, C. Lawrence Zitnick, Jerry Ma, and Rob Fergus. **ESM-1v** was released with the paper [Language models enable zero-shot prediction of the effects of mutations on protein function](https://doi.org/10.1101/2021.07.09.450648) by Joshua Meier, Roshan Rao, Robert Verkuil, Jason Liu, Tom Sercu and Alexander Rives. **ESM-2** was released with the paper [Language models of protein sequences at the scale of evolution enable accurate structure prediction](https://doi.org/10.1101/2022.07.20.500902) by Zeming Lin, Halil Akin, Roshan Rao, Brian Hie, Zhongkai Zhu, Wenting Lu, Allan dos Santos Costa, Maryam Fazel-Zarandi, Tom Sercu, Sal Candido, Alexander Rives. 1. 
**[Falcon](https://huggingface.co/docs/transformers/model_doc/falcon)** (from Technology Innovation Institute) by Almazrouei, Ebtesam and Alobeidli, Hamza and Alshamsi, Abdulaziz and Cappelli, Alessandro and Cojocaru, Ruxandra and Debbah, Merouane and Goffinet, Etienne and Heslow, Daniel and Launay, Julien and Malartic, Quentin and Noune, Badreddine and Pannier, Baptiste and Penedo, Guilherme. +1. **[FastSpeech2Conformer](https://huggingface.co/docs/transformers/model_doc/fastspeech2_conformer)** (from ESPnet) released with the paper [Recent Developments On Espnet Toolkit Boosted By Conformer](https://arxiv.org/abs/2010.13956) by Pengcheng Guo, Florian Boyer, Xuankai Chang, Tomoki Hayashi, Yosuke Higuchi, Hirofumi Inaguma, Naoyuki Kamo, Chenda Li, Daniel Garcia-Romero, Jiatong Shi, Jing Shi, Shinji Watanabe, Kun Wei, Wangyou Zhang, and Yuekai Zhang. 1. **[FLAN-T5](https://huggingface.co/docs/transformers/model_doc/flan-t5)** (from Google AI) released in the repository [google-research/t5x](https://github.com/google-research/t5x/blob/main/docs/models.md#flan-t5-checkpoints) by Hyung Won Chung, Le Hou, Shayne Longpre, Barret Zoph, Yi Tay, William Fedus, Eric Li, Xuezhi Wang, Mostafa Dehghani, Siddhartha Brahma, Albert Webson, Shixiang Shane Gu, Zhuyun Dai, Mirac Suzgun, Xinyun Chen, Aakanksha Chowdhery, Sharan Narang, Gaurav Mishra, Adams Yu, Vincent Zhao, Yanping Huang, Andrew Dai, Hongkun Yu, Slav Petrov, Ed H. Chi, Jeff Dean, Jacob Devlin, Adam Roberts, Denny Zhou, Quoc V. Le, and Jason Wei 1. 
**[FLAN-UL2](https://huggingface.co/docs/transformers/model_doc/flan-ul2)** (from Google AI) released in the repository [google-research/t5x](https://github.com/google-research/t5x/blob/main/docs/models.md#flan-ul2-checkpoints) by Hyung Won Chung, Le Hou, Shayne Longpre, Barret Zoph, Yi Tay, William Fedus, Eric Li, Xuezhi Wang, Mostafa Dehghani, Siddhartha Brahma, Albert Webson, Shixiang Shane Gu, Zhuyun Dai, Mirac Suzgun, Xinyun Chen, Aakanksha Chowdhery, Sharan Narang, Gaurav Mishra, Adams Yu, Vincent Zhao, Yanping Huang, Andrew Dai, Hongkun Yu, Slav Petrov, Ed H. Chi, Jeff Dean, Jacob Devlin, Adam Roberts, Denny Zhou, Quoc V. Le, and Jason Wei 1. **[FlauBERT](https://huggingface.co/docs/transformers/model_doc/flaubert)** (from CNRS) released with the paper [FlauBERT: Unsupervised Language Model Pre-training for French](https://arxiv.org/abs/1912.05372) by Hang Le, Loïc Vial, Jibril Frej, Vincent Segonne, Maximin Coavoux, Benjamin Lecouteux, Alexandre Allauzen, Benoît Crabbé, Laurent Besacier, Didier Schwab. diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index 5116e4219fbcb1..739353d4d34a52 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -332,6 +332,8 @@ title: ESM - local: model_doc/falcon title: Falcon + - local: model_doc/fastspeech2_conformer + title: FastSpeech2Conformer - local: model_doc/flan-t5 title: FLAN-T5 - local: model_doc/flan-ul2 diff --git a/docs/source/en/index.md b/docs/source/en/index.md index f63922d7f854a0..bf87c991a996ef 100644 --- a/docs/source/en/index.md +++ b/docs/source/en/index.md @@ -132,6 +132,7 @@ Flax), PyTorch, and/or TensorFlow. 
| [ESM](model_doc/esm) | ✅ | ✅ | ❌ | | [FairSeq Machine-Translation](model_doc/fsmt) | ✅ | ❌ | ❌ | | [Falcon](model_doc/falcon) | ✅ | ❌ | ❌ | +| [FastSpeech2Conformer](model_doc/fastspeech2_conformer) | ✅ | ❌ | ❌ | | [FLAN-T5](model_doc/flan-t5) | ✅ | ✅ | ✅ | | [FLAN-UL2](model_doc/flan-ul2) | ✅ | ✅ | ✅ | | [FlauBERT](model_doc/flaubert) | ✅ | ✅ | ❌ | diff --git a/docs/source/en/model_doc/fastspeech2_conformer.md b/docs/source/en/model_doc/fastspeech2_conformer.md new file mode 100644 index 00000000000000..3995036eff0ca4 --- /dev/null +++ b/docs/source/en/model_doc/fastspeech2_conformer.md @@ -0,0 +1,134 @@ + + +# FastSpeech2Conformer + +## Overview + +The FastSpeech2Conformer model was proposed with the paper [Recent Developments On Espnet Toolkit Boosted By Conformer](https://arxiv.org/abs/2010.13956) by Pengcheng Guo, Florian Boyer, Xuankai Chang, Tomoki Hayashi, Yosuke Higuchi, Hirofumi Inaguma, Naoyuki Kamo, Chenda Li, Daniel Garcia-Romero, Jiatong Shi, Jing Shi, Shinji Watanabe, Kun Wei, Wangyou Zhang, and Yuekai Zhang. + +The abstract from the original FastSpeech2 paper is the following: + +*Non-autoregressive text to speech (TTS) models such as FastSpeech (Ren et al., 2019) can synthesize speech significantly faster than previous autoregressive models with comparable quality. The training of FastSpeech model relies on an autoregressive teacher model for duration prediction (to provide more information as input) and knowledge distillation (to simplify the data distribution in output), which can ease the one-to-many mapping problem (i.e., multiple speech variations correspond to the same text) in TTS. However, FastSpeech has several disadvantages: 1) the teacher-student distillation pipeline is complicated and time-consuming, 2) the duration extracted from the teacher model is not accurate enough, and the target mel-spectrograms distilled from teacher model suffer from information loss due to data simplification, both of which limit the voice quality. 
In this paper, we propose FastSpeech 2, which addresses the issues in FastSpeech and better solves the one-to-many mapping problem in TTS by 1) directly training the model with ground-truth target instead of the simplified output from teacher, and 2) introducing more variation information of speech (e.g., pitch, energy and more accurate duration) as conditional inputs. Specifically, we extract duration, pitch and energy from speech waveform and directly take them as conditional inputs in training and use predicted values in inference. We further design FastSpeech 2s, which is the first attempt to directly generate speech waveform from text in parallel, enjoying the benefit of fully end-to-end inference. Experimental results show that 1) FastSpeech 2 achieves a 3x training speed-up over FastSpeech, and FastSpeech 2s enjoys even faster inference speed; 2) FastSpeech 2 and 2s outperform FastSpeech in voice quality, and FastSpeech 2 can even surpass autoregressive models. Audio samples are available at https://speechresearch.github.io/fastspeech2/.* + +This model was contributed by [Connor Henderson](https://huggingface.co/connor-henderson). The original code can be found [here](https://github.com/espnet/espnet/blob/master/espnet2/tts/fastspeech2/fastspeech2.py). + + +## 🤗 Model Architecture +FastSpeech2's general structure with a Mel-spectrogram decoder was implemented, and the traditional transformer blocks were replaced with conformer blocks as done in the ESPnet library. 
+ +#### FastSpeech2 Model Architecture +![FastSpeech2 Model Architecture](https://www.microsoft.com/en-us/research/uploads/prod/2021/04/fastspeech2-1.png) + +#### Conformer Blocks +![Conformer Blocks](https://www.researchgate.net/profile/Hirofumi-Inaguma-2/publication/344911155/figure/fig2/AS:951455406108673@1603856054097/An-overview-of-Conformer-block.png) + +#### Convolution Module +![Convolution Module](https://d3i71xaburhd42.cloudfront.net/8809d0732f6147d4ad9218c8f9b20227c837a746/2-Figure1-1.png) + +## 🤗 Transformers Usage + +You can run FastSpeech2Conformer locally with the 🤗 Transformers library. + +1. First install the 🤗 [Transformers library](https://github.com/huggingface/transformers) and g2p-en: + +``` +pip install --upgrade pip +pip install --upgrade transformers g2p-en +``` + +2. Run inference via the Transformers modelling code with the model and hifigan separately + +```python + +from transformers import FastSpeech2ConformerTokenizer, FastSpeech2ConformerModel, FastSpeech2ConformerHifiGan +import soundfile as sf + +tokenizer = FastSpeech2ConformerTokenizer.from_pretrained("espnet/fastspeech2_conformer") +inputs = tokenizer("Hello, my dog is cute.", return_tensors="pt") +input_ids = inputs["input_ids"] + +model = FastSpeech2ConformerModel.from_pretrained("espnet/fastspeech2_conformer") +output_dict = model(input_ids, return_dict=True) +spectrogram = output_dict["spectrogram"] + +hifigan = FastSpeech2ConformerHifiGan.from_pretrained("espnet/fastspeech2_conformer_hifigan") +waveform = hifigan(spectrogram) + +sf.write("speech.wav", waveform.squeeze().detach().numpy(), samplerate=22050) +``` + +3. 
Run inference via the Transformers modelling code with the model and hifigan combined + +```python +from transformers import FastSpeech2ConformerTokenizer, FastSpeech2ConformerWithHifiGan +import soundfile as sf + +tokenizer = FastSpeech2ConformerTokenizer.from_pretrained("espnet/fastspeech2_conformer") +inputs = tokenizer("Hello, my dog is cute.", return_tensors="pt") +input_ids = inputs["input_ids"] + +model = FastSpeech2ConformerWithHifiGan.from_pretrained("espnet/fastspeech2_conformer_with_hifigan") +output_dict = model(input_ids, return_dict=True) +waveform = output_dict["waveform"] + +sf.write("speech.wav", waveform.squeeze().detach().numpy(), samplerate=22050) +``` + +4. Run inference with a pipeline and specify which vocoder to use +```python +from transformers import pipeline, FastSpeech2ConformerHifiGan +import soundfile as sf + +vocoder = FastSpeech2ConformerHifiGan.from_pretrained("espnet/fastspeech2_conformer_hifigan") +synthesiser = pipeline(model="espnet/fastspeech2_conformer", vocoder=vocoder) + +speech = synthesiser("Hello, my dog is cooler than you!") + +sf.write("speech.wav", speech["audio"].squeeze(), samplerate=speech["sampling_rate"]) +``` + + +## FastSpeech2ConformerConfig + +[[autodoc]] FastSpeech2ConformerConfig + +## FastSpeech2ConformerHifiGanConfig + +[[autodoc]] FastSpeech2ConformerHifiGanConfig + +## FastSpeech2ConformerWithHifiGanConfig + +[[autodoc]] FastSpeech2ConformerWithHifiGanConfig + +## FastSpeech2ConformerTokenizer + +[[autodoc]] FastSpeech2ConformerTokenizer + - __call__ + - save_vocabulary + - decode + - batch_decode + +## FastSpeech2ConformerModel + +[[autodoc]] FastSpeech2ConformerModel + - forward + +## FastSpeech2ConformerHifiGan + +[[autodoc]] FastSpeech2ConformerHifiGan + - forward + +## FastSpeech2ConformerWithHifiGan + +[[autodoc]] FastSpeech2ConformerWithHifiGan + - forward diff --git a/docs/source/en/tasks/text-to-speech.md b/docs/source/en/tasks/text-to-speech.md index 216c3c1f1133f7..0b324904e9e263 100644 --- 
a/docs/source/en/tasks/text-to-speech.md +++ b/docs/source/en/tasks/text-to-speech.md @@ -44,10 +44,8 @@ Here's a code snippet you can use to listen to the resulting audio in a notebook For more examples on what Bark and other pretrained TTS models can do, refer to our [Audio course](https://huggingface.co/learn/audio-course/chapter6/pre-trained_models). -If you are looking to fine-tune a TTS model, you can currently fine-tune SpeechT5 only. SpeechT5 is pre-trained on a combination of -speech-to-text and text-to-speech data, allowing it to learn a unified space of hidden representations shared by both text -and speech. This means that the same pre-trained model can be fine-tuned for different tasks. Furthermore, SpeechT5 -supports multiple speakers through x-vector speaker embeddings. +If you are looking to fine-tune a TTS model, the only text-to-speech models currently available in 🤗 Transformers +are [SpeechT5](model_doc/speecht5) and [FastSpeech2Conformer](model_doc/fastspeech2_conformer), though more will be added in the future. SpeechT5 is pre-trained on a combination of speech-to-text and text-to-speech data, allowing it to learn a unified space of hidden representations shared by both text and speech. This means that the same pre-trained model can be fine-tuned for different tasks. Furthermore, SpeechT5 supports multiple speakers through x-vector speaker embeddings. The remainder of this guide illustrates how to: diff --git a/docs/source/fr/index.md b/docs/source/fr/index.md index 9e3e6eb5c2362c..4877d18b55f121 100644 --- a/docs/source/fr/index.md +++ b/docs/source/fr/index.md @@ -108,6 +108,7 @@ La documentation est organisée en 5 parties: 1. **[EncoderDecoder](model_doc/encoder-decoder)** (from Google Research) released with the paper [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461) by Sascha Rothe, Shashi Narayan, Aliaksei Severyn. 1. 
**[ERNIE](model_doc/ernie)** (from Baidu) released with the paper [ERNIE: Enhanced Representation through Knowledge Integration](https://arxiv.org/abs/1904.09223) by Yu Sun, Shuohuan Wang, Yukun Li, Shikun Feng, Xuyi Chen, Han Zhang, Xin Tian, Danxiang Zhu, Hao Tian, Hua Wu. 1. **[ESM](model_doc/esm)** (from Meta AI) are transformer protein language models. **ESM-1b** was released with the paper [Biological structure and function emerge from scaling unsupervised learning to 250 million protein sequences](https://www.pnas.org/content/118/15/e2016239118) by Alexander Rives, Joshua Meier, Tom Sercu, Siddharth Goyal, Zeming Lin, Jason Liu, Demi Guo, Myle Ott, C. Lawrence Zitnick, Jerry Ma, and Rob Fergus. **ESM-1v** was released with the paper [Language models enable zero-shot prediction of the effects of mutations on protein function](https://doi.org/10.1101/2021.07.09.450648) by Joshua Meier, Roshan Rao, Robert Verkuil, Jason Liu, Tom Sercu and Alexander Rives. **ESM-2 and ESMFold** were released with the paper [Language models of protein sequences at the scale of evolution enable accurate structure prediction](https://doi.org/10.1101/2022.07.20.500902) by Zeming Lin, Halil Akin, Roshan Rao, Brian Hie, Zhongkai Zhu, Wenting Lu, Allan dos Santos Costa, Maryam Fazel-Zarandi, Tom Sercu, Sal Candido, Alexander Rives. +1. **[FastSpeech2Conformer](model_doc/fastspeech2_conformer)** (from ESPnet) released with the paper [Recent Developments On Espnet Toolkit Boosted By Conformer](https://arxiv.org/abs/2010.13956) by Pengcheng Guo, Florian Boyer, Xuankai Chang, Tomoki Hayashi, Yosuke Higuchi, Hirofumi Inaguma, Naoyuki Kamo, Chenda Li, Daniel Garcia-Romero, Jiatong Shi, Jing Shi, Shinji Watanabe, Kun Wei, Wangyou Zhang, and Yuekai Zhang. 1. 
**[FLAN-T5](model_doc/flan-t5)** (from Google AI) released in the repository [google-research/t5x](https://github.com/google-research/t5x/blob/main/docs/models.md#flan-t5-checkpoints) by Hyung Won Chung, Le Hou, Shayne Longpre, Barret Zoph, Yi Tay, William Fedus, Eric Li, Xuezhi Wang, Mostafa Dehghani, Siddhartha Brahma, Albert Webson, Shixiang Shane Gu, Zhuyun Dai, Mirac Suzgun, Xinyun Chen, Aakanksha Chowdhery, Sharan Narang, Gaurav Mishra, Adams Yu, Vincent Zhao, Yanping Huang, Andrew Dai, Hongkun Yu, Slav Petrov, Ed H. Chi, Jeff Dean, Jacob Devlin, Adam Roberts, Denny Zhou, Quoc V. Le, and Jason Wei 1. **[FlauBERT](model_doc/flaubert)** (from CNRS) released with the paper [FlauBERT: Unsupervised Language Model Pre-training for French](https://arxiv.org/abs/1912.05372) by Hang Le, Loïc Vial, Jibril Frej, Vincent Segonne, Maximin Coavoux, Benjamin Lecouteux, Alexandre Allauzen, Benoît Crabbé, Laurent Besacier, Didier Schwab. 1. **[FLAVA](model_doc/flava)** (from Facebook AI) released with the paper [FLAVA: A Foundational Language And Vision Alignment Model](https://arxiv.org/abs/2112.04482) by Amanpreet Singh, Ronghang Hu, Vedanuj Goswami, Guillaume Couairon, Wojciech Galuba, Marcus Rohrbach, and Douwe Kiela. 
@@ -290,6 +291,7 @@ Le tableau ci-dessous représente la prise en charge actuelle dans la bibliothè | ERNIE | ❌ | ❌ | ✅ | ❌ | ❌ | | ESM | ✅ | ❌ | ✅ | ✅ | ❌ | | FairSeq Machine-Translation | ✅ | ❌ | ✅ | ❌ | ❌ | +| FastSpeech2Conformer | ✅ | ❌ | ✅ | ❌ | ❌ | | FlauBERT | ✅ | ❌ | ✅ | ✅ | ❌ | | FLAVA | ❌ | ❌ | ✅ | ❌ | ❌ | | FNet | ✅ | ✅ | ✅ | ❌ | ❌ | diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index e7d168154aaa56..b3512b32eca93b 100644 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -30,6 +30,7 @@ is_bitsandbytes_available, is_essentia_available, is_flax_available, + is_g2p_en_available, is_keras_nlp_available, is_librosa_available, is_pretty_midi_available, @@ -423,11 +424,16 @@ "models.ernie_m": ["ERNIE_M_PRETRAINED_CONFIG_ARCHIVE_MAP", "ErnieMConfig"], "models.esm": ["ESM_PRETRAINED_CONFIG_ARCHIVE_MAP", "EsmConfig", "EsmTokenizer"], "models.falcon": ["FALCON_PRETRAINED_CONFIG_ARCHIVE_MAP", "FalconConfig"], - "models.flaubert": [ - "FLAUBERT_PRETRAINED_CONFIG_ARCHIVE_MAP", - "FlaubertConfig", - "FlaubertTokenizer", - ], + "models.fastspeech2_conformer": [ + "FASTSPEECH2_CONFORMER_HIFIGAN_PRETRAINED_CONFIG_ARCHIVE_MAP", + "FASTSPEECH2_CONFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP", + "FASTSPEECH2_CONFORMER_WITH_HIFIGAN_PRETRAINED_CONFIG_ARCHIVE_MAP", + "FastSpeech2ConformerConfig", + "FastSpeech2ConformerHifiGanConfig", + "FastSpeech2ConformerTokenizer", + "FastSpeech2ConformerWithHifiGanConfig", + ], + "models.flaubert": ["FLAUBERT_PRETRAINED_CONFIG_ARCHIVE_MAP", "FlaubertConfig", "FlaubertTokenizer"], "models.flava": [ "FLAVA_PRETRAINED_CONFIG_ARCHIVE_MAP", "FlavaConfig", @@ -2126,6 +2132,15 @@ "FalconPreTrainedModel", ] ) + _import_structure["models.fastspeech2_conformer"].extend( + [ + "FASTSPEECH2_CONFORMER_PRETRAINED_MODEL_ARCHIVE_LIST", + "FastSpeech2ConformerHifiGan", + "FastSpeech2ConformerModel", + "FastSpeech2ConformerPreTrainedModel", + "FastSpeech2ConformerWithHifiGan", + ] + ) 
_import_structure["models.flaubert"].extend( [ "FLAUBERT_PRETRAINED_MODEL_ARCHIVE_LIST", @@ -5081,11 +5096,16 @@ from .models.ernie_m import ERNIE_M_PRETRAINED_CONFIG_ARCHIVE_MAP, ErnieMConfig from .models.esm import ESM_PRETRAINED_CONFIG_ARCHIVE_MAP, EsmConfig, EsmTokenizer from .models.falcon import FALCON_PRETRAINED_CONFIG_ARCHIVE_MAP, FalconConfig - from .models.flaubert import ( - FLAUBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, - FlaubertConfig, - FlaubertTokenizer, - ) + from .models.fastspeech2_conformer import ( + FASTSPEECH2_CONFORMER_HIFIGAN_PRETRAINED_CONFIG_ARCHIVE_MAP, + FASTSPEECH2_CONFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP, + FASTSPEECH2_CONFORMER_WITH_HIFIGAN_PRETRAINED_CONFIG_ARCHIVE_MAP, + FastSpeech2ConformerConfig, + FastSpeech2ConformerHifiGanConfig, + FastSpeech2ConformerTokenizer, + FastSpeech2ConformerWithHifiGanConfig, + ) + from .models.flaubert import FLAUBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, FlaubertConfig, FlaubertTokenizer from .models.flava import ( FLAVA_PRETRAINED_CONFIG_ARCHIVE_MAP, FlavaConfig, @@ -6652,6 +6672,13 @@ FalconModel, FalconPreTrainedModel, ) + from .models.fastspeech2_conformer import ( + FASTSPEECH2_CONFORMER_PRETRAINED_MODEL_ARCHIVE_LIST, + FastSpeech2ConformerHifiGan, + FastSpeech2ConformerModel, + FastSpeech2ConformerPreTrainedModel, + FastSpeech2ConformerWithHifiGan, + ) from .models.flaubert import ( FLAUBERT_PRETRAINED_MODEL_ARCHIVE_LIST, FlaubertForMultipleChoice, diff --git a/src/transformers/file_utils.py b/src/transformers/file_utils.py index 0dfcefd9c49cfa..7596e4cd231f0c 100644 --- a/src/transformers/file_utils.py +++ b/src/transformers/file_utils.py @@ -84,6 +84,7 @@ is_faiss_available, is_flax_available, is_ftfy_available, + is_g2p_en_available, is_in_notebook, is_ipex_available, is_librosa_available, diff --git a/src/transformers/models/__init__.py b/src/transformers/models/__init__.py index 319c8499319a3f..428eed37130ca6 100644 --- a/src/transformers/models/__init__.py +++ b/src/transformers/models/__init__.py @@ 
-83,6 +83,7 @@ ernie_m, esm, falcon, + fastspeech2_conformer, flaubert, flava, fnet, diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py index b91226ac877897..f1296b95484466 100755 --- a/src/transformers/models/auto/configuration_auto.py +++ b/src/transformers/models/auto/configuration_auto.py @@ -93,6 +93,7 @@ ("ernie_m", "ErnieMConfig"), ("esm", "EsmConfig"), ("falcon", "FalconConfig"), + ("fastspeech2_conformer", "FastSpeech2ConformerConfig"), ("flaubert", "FlaubertConfig"), ("flava", "FlavaConfig"), ("fnet", "FNetConfig"), @@ -319,6 +320,7 @@ ("ernie_m", "ERNIE_M_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("esm", "ESM_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("falcon", "FALCON_PRETRAINED_CONFIG_ARCHIVE_MAP"), + ("fastspeech2_conformer", "FASTSPEECH2_CONFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("flaubert", "FLAUBERT_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("flava", "FLAVA_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("fnet", "FNET_PRETRAINED_CONFIG_ARCHIVE_MAP"), @@ -542,6 +544,7 @@ ("ernie_m", "ErnieM"), ("esm", "ESM"), ("falcon", "Falcon"), + ("fastspeech2_conformer", "FastSpeech2Conformer"), ("flan-t5", "FLAN-T5"), ("flan-ul2", "FLAN-UL2"), ("flaubert", "FlauBERT"), diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index 9978b1353035e3..6657b8b1b6187e 100755 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -95,6 +95,7 @@ ("ernie_m", "ErnieMModel"), ("esm", "EsmModel"), ("falcon", "FalconModel"), + ("fastspeech2_conformer", "FastSpeech2ConformerModel"), ("flaubert", "FlaubertModel"), ("flava", "FlavaModel"), ("fnet", "FNetModel"), @@ -1075,6 +1076,7 @@ MODEL_FOR_TEXT_TO_SPECTROGRAM_MAPPING_NAMES = OrderedDict( [ # Model for Text-To-Spectrogram mapping + ("fastspeech2_conformer", "FastSpeech2ConformerModel"), ("speecht5", "SpeechT5ForTextToSpeech"), ] ) @@ -1083,6 +1085,7 @@ [ # Model for Text-To-Waveform mapping ("bark", 
"BarkModel"), + ("fastspeech2_conformer", "FastSpeech2ConformerWithHifiGan"), ("musicgen", "MusicgenForConditionalGeneration"), ("seamless_m4t", "SeamlessM4TForTextToSpeech"), ("seamless_m4t_v2", "SeamlessM4Tv2ForTextToSpeech"), diff --git a/src/transformers/models/auto/tokenization_auto.py b/src/transformers/models/auto/tokenization_auto.py index 9e4066de99a5f9..5ff79fd822b950 100644 --- a/src/transformers/models/auto/tokenization_auto.py +++ b/src/transformers/models/auto/tokenization_auto.py @@ -25,7 +25,14 @@ from ...dynamic_module_utils import get_class_from_dynamic_module, resolve_trust_remote_code from ...tokenization_utils import PreTrainedTokenizer from ...tokenization_utils_base import TOKENIZER_CONFIG_FILE -from ...utils import cached_file, extract_commit_hash, is_sentencepiece_available, is_tokenizers_available, logging +from ...utils import ( + cached_file, + extract_commit_hash, + is_g2p_en_available, + is_sentencepiece_available, + is_tokenizers_available, + logging, +) from ..encoder_decoder import EncoderDecoderConfig from .auto_factory import _LazyAutoMapping from .configuration_auto import ( @@ -163,6 +170,10 @@ ("ernie_m", ("ErnieMTokenizer" if is_sentencepiece_available() else None, None)), ("esm", ("EsmTokenizer", None)), ("falcon", (None, "PreTrainedTokenizerFast" if is_tokenizers_available() else None)), + ( + "fastspeech2_conformer", + ("FastSpeech2ConformerTokenizer" if is_g2p_en_available() else None, None), + ), ("flaubert", ("FlaubertTokenizer", None)), ("fnet", ("FNetTokenizer", "FNetTokenizerFast" if is_tokenizers_available() else None)), ("fsmt", ("FSMTTokenizer", None)), diff --git a/src/transformers/models/fastspeech2_conformer/__init__.py b/src/transformers/models/fastspeech2_conformer/__init__.py new file mode 100644 index 00000000000000..1fd5cbf1dc272e --- /dev/null +++ b/src/transformers/models/fastspeech2_conformer/__init__.py @@ -0,0 +1,77 @@ +# Copyright 2023 The HuggingFace Team. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import TYPE_CHECKING + +from ...utils import ( + OptionalDependencyNotAvailable, + _LazyModule, + is_torch_available, +) + + +_import_structure = { + "configuration_fastspeech2_conformer": [ + "FASTSPEECH2_CONFORMER_HIFIGAN_PRETRAINED_CONFIG_ARCHIVE_MAP", + "FASTSPEECH2_CONFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP", + "FASTSPEECH2_CONFORMER_WITH_HIFIGAN_PRETRAINED_CONFIG_ARCHIVE_MAP", + "FastSpeech2ConformerConfig", + "FastSpeech2ConformerHifiGanConfig", + "FastSpeech2ConformerWithHifiGanConfig", + ], + "tokenization_fastspeech2_conformer": ["FastSpeech2ConformerTokenizer"], +} + +try: + if not is_torch_available(): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + pass +else: + _import_structure["modeling_fastspeech2_conformer"] = [ + "FASTSPEECH2_CONFORMER_PRETRAINED_MODEL_ARCHIVE_LIST", + "FastSpeech2ConformerWithHifiGan", + "FastSpeech2ConformerHifiGan", + "FastSpeech2ConformerModel", + "FastSpeech2ConformerPreTrainedModel", + ] + +if TYPE_CHECKING: + from .configuration_fastspeech2_conformer import ( + FASTSPEECH2_CONFORMER_HIFIGAN_PRETRAINED_CONFIG_ARCHIVE_MAP, + FASTSPEECH2_CONFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP, + FASTSPEECH2_CONFORMER_WITH_HIFIGAN_PRETRAINED_CONFIG_ARCHIVE_MAP, + FastSpeech2ConformerConfig, + FastSpeech2ConformerHifiGanConfig, + FastSpeech2ConformerWithHifiGanConfig, + ) + from .tokenization_fastspeech2_conformer import FastSpeech2ConformerTokenizer + + 
try: + if not is_torch_available(): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + pass + else: + from .modeling_fastspeech2_conformer import ( + FASTSPEECH2_CONFORMER_PRETRAINED_MODEL_ARCHIVE_LIST, + FastSpeech2ConformerHifiGan, + FastSpeech2ConformerModel, + FastSpeech2ConformerPreTrainedModel, + FastSpeech2ConformerWithHifiGan, + ) + +else: + import sys + + sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__) diff --git a/src/transformers/models/fastspeech2_conformer/configuration_fastspeech2_conformer.py b/src/transformers/models/fastspeech2_conformer/configuration_fastspeech2_conformer.py new file mode 100644 index 00000000000000..46dc10adb2900e --- /dev/null +++ b/src/transformers/models/fastspeech2_conformer/configuration_fastspeech2_conformer.py @@ -0,0 +1,488 @@ +# coding=utf-8 +# Copyright 2023 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" FastSpeech2Conformer model configuration""" + +from typing import Dict + +from ...configuration_utils import PretrainedConfig +from ...utils import logging + + +logger = logging.get_logger(__name__) + + +FASTSPEECH2_CONFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP = { + "espnet/fastspeech2_conformer": "https://huggingface.co/espnet/fastspeech2_conformer/raw/main/config.json", +} + +FASTSPEECH2_CONFORMER_HIFIGAN_PRETRAINED_CONFIG_ARCHIVE_MAP = { + "espnet/fastspeech2_conformer_hifigan": "https://huggingface.co/espnet/fastspeech2_conformer_hifigan/raw/main/config.json", +} + +FASTSPEECH2_CONFORMER_WITH_HIFIGAN_PRETRAINED_CONFIG_ARCHIVE_MAP = { + "espnet/fastspeech2_conformer_with_hifigan": "https://huggingface.co/espnet/fastspeech2_conformer_with_hifigan/raw/main/config.json", +} + + +class FastSpeech2ConformerConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`FastSpeech2ConformerModel`]. It is used to + instantiate a FastSpeech2Conformer model according to the specified arguments, defining the model architecture. + Instantiating a configuration with the defaults will yield a similar configuration to that of the + FastSpeech2Conformer [espnet/fastspeech2_conformer](https://huggingface.co/espnet/fastspeech2_conformer) + architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + hidden_size (`int`, *optional*, defaults to 384): + The dimensionality of the hidden layers. + vocab_size (`int`, *optional*, defaults to 78): + The size of the vocabulary. + num_mel_bins (`int`, *optional*, defaults to 80): + The number of mel filters used in the filter bank. + encoder_num_attention_heads (`int`, *optional*, defaults to 2): + The number of attention heads in the encoder. + encoder_layers (`int`, *optional*, defaults to 4): + The number of layers in the encoder. 
+ encoder_linear_units (`int`, *optional*, defaults to 1536): + The number of units in the linear layer of the encoder. + decoder_layers (`int`, *optional*, defaults to 4): + The number of layers in the decoder. + decoder_num_attention_heads (`int`, *optional*, defaults to 2): + The number of attention heads in the decoder. + decoder_linear_units (`int`, *optional*, defaults to 1536): + The number of units in the linear layer of the decoder. + speech_decoder_postnet_layers (`int`, *optional*, defaults to 5): + The number of layers in the post-net of the speech decoder. + speech_decoder_postnet_units (`int`, *optional*, defaults to 256): + The number of units in the post-net layers of the speech decoder. + speech_decoder_postnet_kernel (`int`, *optional*, defaults to 5): + The kernel size in the post-net of the speech decoder. + positionwise_conv_kernel_size (`int`, *optional*, defaults to 3): + The size of the convolution kernel used in the position-wise layer. + encoder_normalize_before (`bool`, *optional*, defaults to `False`): + Specifies whether to normalize before encoder layers. + decoder_normalize_before (`bool`, *optional*, defaults to `False`): + Specifies whether to normalize before decoder layers. + encoder_concat_after (`bool`, *optional*, defaults to `False`): + Specifies whether to concatenate after encoder layers. + decoder_concat_after (`bool`, *optional*, defaults to `False`): + Specifies whether to concatenate after decoder layers. + reduction_factor (`int`, *optional*, defaults to 1): + The factor by which the speech frame rate is reduced. + speaking_speed (`float`, *optional*, defaults to 1.0): + The speed of the speech produced. + use_macaron_style_in_conformer (`bool`, *optional*, defaults to `True`): + Specifies whether to use macaron style in the conformer. + use_cnn_in_conformer (`bool`, *optional*, defaults to `True`): + Specifies whether to use convolutional neural networks in the conformer. 
+ encoder_kernel_size (`int`, *optional*, defaults to 7): + The kernel size used in the encoder. + decoder_kernel_size (`int`, *optional*, defaults to 31): + The kernel size used in the decoder. + duration_predictor_layers (`int`, *optional*, defaults to 2): + The number of layers in the duration predictor. + duration_predictor_channels (`int`, *optional*, defaults to 256): + The number of channels in the duration predictor. + duration_predictor_kernel_size (`int`, *optional*, defaults to 3): + The kernel size used in the duration predictor. + energy_predictor_layers (`int`, *optional*, defaults to 2): + The number of layers in the energy predictor. + energy_predictor_channels (`int`, *optional*, defaults to 256): + The number of channels in the energy predictor. + energy_predictor_kernel_size (`int`, *optional*, defaults to 3): + The kernel size used in the energy predictor. + energy_predictor_dropout (`float`, *optional*, defaults to 0.5): + The dropout rate in the energy predictor. + energy_embed_kernel_size (`int`, *optional*, defaults to 1): + The kernel size used in the energy embed layer. + energy_embed_dropout (`float`, *optional*, defaults to 0.0): + The dropout rate in the energy embed layer. + stop_gradient_from_energy_predictor (`bool`, *optional*, defaults to `False`): + Specifies whether to stop gradients from the energy predictor. + pitch_predictor_layers (`int`, *optional*, defaults to 5): + The number of layers in the pitch predictor. + pitch_predictor_channels (`int`, *optional*, defaults to 256): + The number of channels in the pitch predictor. + pitch_predictor_kernel_size (`int`, *optional*, defaults to 5): + The kernel size used in the pitch predictor. + pitch_predictor_dropout (`float`, *optional*, defaults to 0.5): + The dropout rate in the pitch predictor. + pitch_embed_kernel_size (`int`, *optional*, defaults to 1): + The kernel size used in the pitch embed layer. 
+ pitch_embed_dropout (`float`, *optional*, defaults to 0.0): + The dropout rate in the pitch embed layer. + stop_gradient_from_pitch_predictor (`bool`, *optional*, defaults to `True`): + Specifies whether to stop gradients from the pitch predictor. + encoder_dropout_rate (`float`, *optional*, defaults to 0.2): + The dropout rate in the encoder. + encoder_positional_dropout_rate (`float`, *optional*, defaults to 0.2): + The positional dropout rate in the encoder. + encoder_attention_dropout_rate (`float`, *optional*, defaults to 0.2): + The attention dropout rate in the encoder. + decoder_dropout_rate (`float`, *optional*, defaults to 0.2): + The dropout rate in the decoder. + decoder_positional_dropout_rate (`float`, *optional*, defaults to 0.2): + The positional dropout rate in the decoder. + decoder_attention_dropout_rate (`float`, *optional*, defaults to 0.2): + The attention dropout rate in the decoder. + duration_predictor_dropout_rate (`float`, *optional*, defaults to 0.2): + The dropout rate in the duration predictor. + speech_decoder_postnet_dropout (`float`, *optional*, defaults to 0.5): + The dropout rate in the speech decoder postnet. + max_source_positions (`int`, *optional*, defaults to 5000): + If `"relative"` position embeddings are used, defines the maximum source input positions. + use_masking (`bool`, *optional*, defaults to `True`): + Specifies whether to use masking in the model. + use_weighted_masking (`bool`, *optional*, defaults to `False`): + Specifies whether to use weighted masking in the model. + num_speakers (`int`, *optional*): + Number of speakers. If set to > 1, assume that the speaker ids will be provided as the input and use + the speaker id embedding layer. + num_languages (`int`, *optional*): + Number of languages. If set to > 1, assume that the language ids will be provided as the input and use the + language id embedding layer. + speaker_embed_dim (`int`, *optional*): + Speaker embedding dimension.
If set to > 0, assume that speaker_embedding will be provided as the input. + is_encoder_decoder (`bool`, *optional*, defaults to `True`): + Specifies whether the model is an encoder-decoder. + + Example: + + ```python + >>> from transformers import FastSpeech2ConformerModel, FastSpeech2ConformerConfig + + >>> # Initializing a FastSpeech2Conformer style configuration + >>> configuration = FastSpeech2ConformerConfig() + + >>> # Initializing a model from the FastSpeech2Conformer style configuration + >>> model = FastSpeech2ConformerModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "fastspeech2_conformer" + attribute_map = {"num_hidden_layers": "encoder_layers", "num_attention_heads": "encoder_num_attention_heads"} + + def __init__( + self, + hidden_size=384, + vocab_size=78, + num_mel_bins=80, + encoder_num_attention_heads=2, + encoder_layers=4, + encoder_linear_units=1536, + decoder_layers=4, + decoder_num_attention_heads=2, + decoder_linear_units=1536, + speech_decoder_postnet_layers=5, + speech_decoder_postnet_units=256, + speech_decoder_postnet_kernel=5, + positionwise_conv_kernel_size=3, + encoder_normalize_before=False, + decoder_normalize_before=False, + encoder_concat_after=False, + decoder_concat_after=False, + reduction_factor=1, + speaking_speed=1.0, + use_macaron_style_in_conformer=True, + use_cnn_in_conformer=True, + encoder_kernel_size=7, + decoder_kernel_size=31, + duration_predictor_layers=2, + duration_predictor_channels=256, + duration_predictor_kernel_size=3, + energy_predictor_layers=2, + energy_predictor_channels=256, + energy_predictor_kernel_size=3, + energy_predictor_dropout=0.5, + energy_embed_kernel_size=1, + energy_embed_dropout=0.0, + stop_gradient_from_energy_predictor=False, + pitch_predictor_layers=5, + pitch_predictor_channels=256, + pitch_predictor_kernel_size=5, + pitch_predictor_dropout=0.5, + pitch_embed_kernel_size=1, + pitch_embed_dropout=0.0, + 
stop_gradient_from_pitch_predictor=True, + encoder_dropout_rate=0.2, + encoder_positional_dropout_rate=0.2, + encoder_attention_dropout_rate=0.2, + decoder_dropout_rate=0.2, + decoder_positional_dropout_rate=0.2, + decoder_attention_dropout_rate=0.2, + duration_predictor_dropout_rate=0.2, + speech_decoder_postnet_dropout=0.5, + max_source_positions=5000, + use_masking=True, + use_weighted_masking=False, + num_speakers=None, + num_languages=None, + speaker_embed_dim=None, + is_encoder_decoder=True, + **kwargs, + ): + if positionwise_conv_kernel_size % 2 == 0: + raise ValueError( + f"positionwise_conv_kernel_size must be odd, but got {positionwise_conv_kernel_size} instead." + ) + if encoder_kernel_size % 2 == 0: + raise ValueError(f"encoder_kernel_size must be odd, but got {encoder_kernel_size} instead.") + if decoder_kernel_size % 2 == 0: + raise ValueError(f"decoder_kernel_size must be odd, but got {decoder_kernel_size} instead.") + if duration_predictor_kernel_size % 2 == 0: + raise ValueError( + f"duration_predictor_kernel_size must be odd, but got {duration_predictor_kernel_size} instead." + ) + if energy_predictor_kernel_size % 2 == 0: + raise ValueError( + f"energy_predictor_kernel_size must be odd, but got {energy_predictor_kernel_size} instead." + ) + if energy_embed_kernel_size % 2 == 0: + raise ValueError(f"energy_embed_kernel_size must be odd, but got {energy_embed_kernel_size} instead.") + if pitch_predictor_kernel_size % 2 == 0: + raise ValueError( + f"pitch_predictor_kernel_size must be odd, but got {pitch_predictor_kernel_size} instead." 
+ ) + if pitch_embed_kernel_size % 2 == 0: + raise ValueError(f"pitch_embed_kernel_size must be odd, but got {pitch_embed_kernel_size} instead.") + if hidden_size % encoder_num_attention_heads != 0: + raise ValueError("The hidden_size must be evenly divisible by encoder_num_attention_heads.") + if hidden_size % decoder_num_attention_heads != 0: + raise ValueError("The hidden_size must be evenly divisible by decoder_num_attention_heads.") + if use_masking and use_weighted_masking: + raise ValueError("Either use_masking or use_weighted_masking can be True, but not both.") + + self.hidden_size = hidden_size + self.vocab_size = vocab_size + self.num_mel_bins = num_mel_bins + self.encoder_config = { + "num_attention_heads": encoder_num_attention_heads, + "layers": encoder_layers, + "kernel_size": encoder_kernel_size, + "attention_dropout_rate": encoder_attention_dropout_rate, + "dropout_rate": encoder_dropout_rate, + "positional_dropout_rate": encoder_positional_dropout_rate, + "linear_units": encoder_linear_units, + "normalize_before": encoder_normalize_before, + "concat_after": encoder_concat_after, + } + self.decoder_config = { + "num_attention_heads": decoder_num_attention_heads, + "layers": decoder_layers, + "kernel_size": decoder_kernel_size, + "attention_dropout_rate": decoder_attention_dropout_rate, + "dropout_rate": decoder_dropout_rate, + "positional_dropout_rate": decoder_positional_dropout_rate, + "linear_units": decoder_linear_units, + "normalize_before": decoder_normalize_before, + "concat_after": decoder_concat_after, + } + self.encoder_num_attention_heads = encoder_num_attention_heads + self.encoder_layers = encoder_layers + self.duration_predictor_channels = duration_predictor_channels + self.duration_predictor_kernel_size = duration_predictor_kernel_size + self.duration_predictor_layers = duration_predictor_layers + self.energy_embed_dropout = energy_embed_dropout + self.energy_embed_kernel_size = energy_embed_kernel_size + 
self.energy_predictor_channels = energy_predictor_channels + self.energy_predictor_dropout = energy_predictor_dropout + self.energy_predictor_kernel_size = energy_predictor_kernel_size + self.energy_predictor_layers = energy_predictor_layers + self.pitch_embed_dropout = pitch_embed_dropout + self.pitch_embed_kernel_size = pitch_embed_kernel_size + self.pitch_predictor_channels = pitch_predictor_channels + self.pitch_predictor_dropout = pitch_predictor_dropout + self.pitch_predictor_kernel_size = pitch_predictor_kernel_size + self.pitch_predictor_layers = pitch_predictor_layers + self.positionwise_conv_kernel_size = positionwise_conv_kernel_size + self.speech_decoder_postnet_units = speech_decoder_postnet_units + self.speech_decoder_postnet_dropout = speech_decoder_postnet_dropout + self.speech_decoder_postnet_kernel = speech_decoder_postnet_kernel + self.speech_decoder_postnet_layers = speech_decoder_postnet_layers + self.reduction_factor = reduction_factor + self.speaking_speed = speaking_speed + self.stop_gradient_from_energy_predictor = stop_gradient_from_energy_predictor + self.stop_gradient_from_pitch_predictor = stop_gradient_from_pitch_predictor + self.max_source_positions = max_source_positions + self.use_cnn_in_conformer = use_cnn_in_conformer + self.use_macaron_style_in_conformer = use_macaron_style_in_conformer + self.use_masking = use_masking + self.use_weighted_masking = use_weighted_masking + self.num_speakers = num_speakers + self.num_languages = num_languages + self.speaker_embed_dim = speaker_embed_dim + self.duration_predictor_dropout_rate = duration_predictor_dropout_rate + self.is_encoder_decoder = is_encoder_decoder + + super().__init__( + is_encoder_decoder=is_encoder_decoder, + **kwargs, + ) + + +class FastSpeech2ConformerHifiGanConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`FastSpeech2ConformerHifiGan`].
It is used to + instantiate a FastSpeech2Conformer HiFi-GAN vocoder model according to the specified arguments, defining the model + architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of the + FastSpeech2Conformer + [espnet/fastspeech2_conformer_hifigan](https://huggingface.co/espnet/fastspeech2_conformer_hifigan) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + model_in_dim (`int`, *optional*, defaults to 80): + The number of frequency bins in the input log-mel spectrogram. + upsample_initial_channel (`int`, *optional*, defaults to 512): + The number of input channels into the upsampling network. + upsample_rates (`Tuple[int]` or `List[int]`, *optional*, defaults to `[8, 8, 2, 2]`): + A tuple of integers defining the stride of each 1D convolutional layer in the upsampling network. The + length of *upsample_rates* defines the number of convolutional layers and has to match the length of + *upsample_kernel_sizes*. + upsample_kernel_sizes (`Tuple[int]` or `List[int]`, *optional*, defaults to `[16, 16, 4, 4]`): + A tuple of integers defining the kernel size of each 1D convolutional layer in the upsampling network. The + length of *upsample_kernel_sizes* defines the number of convolutional layers and has to match the length of + *upsample_rates*. + resblock_kernel_sizes (`Tuple[int]` or `List[int]`, *optional*, defaults to `[3, 7, 11]`): + A tuple of integers defining the kernel sizes of the 1D convolutional layers in the multi-receptive field + fusion (MRF) module. + resblock_dilation_sizes (`Tuple[Tuple[int]]` or `List[List[int]]`, *optional*, defaults to `[[1, 3, 5], [1, 3, 5], [1, 3, 5]]`): + A nested tuple of integers defining the dilation rates of the dilated 1D convolutional layers in the + multi-receptive field fusion (MRF) module. 
+ initializer_range (`float`, *optional*, defaults to 0.01): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + leaky_relu_slope (`float`, *optional*, defaults to 0.1): + The angle of the negative slope used by the leaky ReLU activation. + normalize_before (`bool`, *optional*, defaults to `True`): + Whether or not to normalize the spectrogram before vocoding using the vocoder's learned mean and variance. + + Example: + + ```python + >>> from transformers import FastSpeech2ConformerHifiGan, FastSpeech2ConformerHifiGanConfig + + >>> # Initializing a FastSpeech2ConformerHifiGan configuration + >>> configuration = FastSpeech2ConformerHifiGanConfig() + + >>> # Initializing a model (with random weights) from the configuration + >>> model = FastSpeech2ConformerHifiGan(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "hifigan" + + def __init__( + self, + model_in_dim=80, + upsample_initial_channel=512, + upsample_rates=[8, 8, 2, 2], + upsample_kernel_sizes=[16, 16, 4, 4], + resblock_kernel_sizes=[3, 7, 11], + resblock_dilation_sizes=[[1, 3, 5], [1, 3, 5], [1, 3, 5]], + initializer_range=0.01, + leaky_relu_slope=0.1, + normalize_before=True, + **kwargs, + ): + self.model_in_dim = model_in_dim + self.upsample_initial_channel = upsample_initial_channel + self.upsample_rates = upsample_rates + self.upsample_kernel_sizes = upsample_kernel_sizes + self.resblock_kernel_sizes = resblock_kernel_sizes + self.resblock_dilation_sizes = resblock_dilation_sizes + self.initializer_range = initializer_range + self.leaky_relu_slope = leaky_relu_slope + self.normalize_before = normalize_before + super().__init__(**kwargs) + + +class FastSpeech2ConformerWithHifiGanConfig(PretrainedConfig): + """ + This is the configuration class to store the configuration of a [`FastSpeech2ConformerWithHifiGan`]. 
It is used to + instantiate a `FastSpeech2ConformerWithHifiGan` model according to the specified sub-models configurations, + defining the model architecture. + + Instantiating a configuration with the defaults will yield a similar configuration to that of the + FastSpeech2ConformerModel [espnet/fastspeech2_conformer](https://huggingface.co/espnet/fastspeech2_conformer) and + FastSpeech2ConformerHifiGan + [espnet/fastspeech2_conformer_hifigan](https://huggingface.co/espnet/fastspeech2_conformer_hifigan) architectures. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + model_config (`typing.Dict`, *optional*): + Configuration of the text-to-speech model. + vocoder_config (`typing.Dict`, *optional*): + Configuration of the vocoder model. + model_config ([`FastSpeech2ConformerConfig`], *optional*): + Configuration of the text-to-speech model. + vocoder_config ([`FastSpeech2ConformerHifiGanConfig`], *optional*): + Configuration of the vocoder model. + + Example: + + ```python + >>> from transformers import ( + ... FastSpeech2ConformerConfig, + ... FastSpeech2ConformerHifiGanConfig, + ... FastSpeech2ConformerWithHifiGanConfig, + ... FastSpeech2ConformerWithHifiGan, + ... ) + + >>> # Initializing FastSpeech2ConformerWithHifiGan sub-modules configurations.
+ >>> model_config = FastSpeech2ConformerConfig() + >>> vocoder_config = FastSpeech2ConformerHifiGanConfig() + + >>> # Initializing a FastSpeech2ConformerWithHifiGan module style configuration + >>> configuration = FastSpeech2ConformerWithHifiGanConfig(model_config.to_dict(), vocoder_config.to_dict()) + + >>> # Initializing a model (with random weights) + >>> model = FastSpeech2ConformerWithHifiGan(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ``` + """ + + model_type = "fastspeech2_conformer_with_hifigan" + is_composition = True + + def __init__( + self, + model_config: Dict = None, + vocoder_config: Dict = None, + **kwargs, + ): + if model_config is None: + model_config = {} + logger.info("model_config is None. initializing the model with default values.") + + if vocoder_config is None: + vocoder_config = {} + logger.info("vocoder_config is None. initializing the coarse model with default values.") + + self.model_config = FastSpeech2ConformerConfig(**model_config) + self.vocoder_config = FastSpeech2ConformerHifiGanConfig(**vocoder_config) + + super().__init__(**kwargs) diff --git a/src/transformers/models/fastspeech2_conformer/convert_fastspeech2_conformer_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/fastspeech2_conformer/convert_fastspeech2_conformer_original_pytorch_checkpoint_to_pytorch.py new file mode 100644 index 00000000000000..bb9c432f82292f --- /dev/null +++ b/src/transformers/models/fastspeech2_conformer/convert_fastspeech2_conformer_original_pytorch_checkpoint_to_pytorch.py @@ -0,0 +1,210 @@ +# coding=utf-8 +# Copyright 2023 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Convert FastSpeech2Conformer checkpoint.""" + +import argparse +import json +import re +from pathlib import Path +from tempfile import TemporaryDirectory + +import torch +import yaml + +from transformers import ( + FastSpeech2ConformerConfig, + FastSpeech2ConformerModel, + FastSpeech2ConformerTokenizer, + logging, +) + + +logging.set_verbosity_info() +logger = logging.get_logger("transformers.models.FastSpeech2Conformer") + +CONFIG_MAPPING = { + "adim": "hidden_size", + "aheads": "num_attention_heads", + "conformer_dec_kernel_size": "decoder_kernel_size", + "conformer_enc_kernel_size": "encoder_kernel_size", + "decoder_normalize_before": "decoder_normalize_before", + "dlayers": "decoder_layers", + "dunits": "decoder_linear_units", + "duration_predictor_chans": "duration_predictor_channels", + "duration_predictor_kernel_size": "duration_predictor_kernel_size", + "duration_predictor_layers": "duration_predictor_layers", + "elayers": "encoder_layers", + "encoder_normalize_before": "encoder_normalize_before", + "energy_embed_dropout": "energy_embed_dropout", + "energy_embed_kernel_size": "energy_embed_kernel_size", + "energy_predictor_chans": "energy_predictor_channels", + "energy_predictor_dropout": "energy_predictor_dropout", + "energy_predictor_kernel_size": "energy_predictor_kernel_size", + "energy_predictor_layers": "energy_predictor_layers", + "eunits": "encoder_linear_units", + "pitch_embed_dropout": "pitch_embed_dropout", + "pitch_embed_kernel_size": "pitch_embed_kernel_size", + "pitch_predictor_chans": "pitch_predictor_channels", + 
"pitch_predictor_dropout": "pitch_predictor_dropout", + "pitch_predictor_kernel_size": "pitch_predictor_kernel_size", + "pitch_predictor_layers": "pitch_predictor_layers", + "positionwise_conv_kernel_size": "positionwise_conv_kernel_size", + "postnet_chans": "speech_decoder_postnet_units", + "postnet_filts": "speech_decoder_postnet_kernel", + "postnet_layers": "speech_decoder_postnet_layers", + "reduction_factor": "reduction_factor", + "stop_gradient_from_energy_predictor": "stop_gradient_from_energy_predictor", + "stop_gradient_from_pitch_predictor": "stop_gradient_from_pitch_predictor", + "transformer_dec_attn_dropout_rate": "decoder_attention_dropout_rate", + "transformer_dec_dropout_rate": "decoder_dropout_rate", + "transformer_dec_positional_dropout_rate": "decoder_positional_dropout_rate", + "transformer_enc_attn_dropout_rate": "encoder_attention_dropout_rate", + "transformer_enc_dropout_rate": "encoder_dropout_rate", + "transformer_enc_positional_dropout_rate": "encoder_positional_dropout_rate", + "use_cnn_in_conformer": "use_cnn_in_conformer", + "use_macaron_style_in_conformer": "use_macaron_style_in_conformer", + "use_masking": "use_masking", + "use_weighted_masking": "use_weighted_masking", + "idim": "input_dim", + "odim": "num_mel_bins", + "spk_embed_dim": "speaker_embed_dim", + "langs": "num_languages", + "spks": "num_speakers", +} + + +def remap_model_yaml_config(yaml_config_path): + with Path(yaml_config_path).open("r", encoding="utf-8") as f: + args = yaml.safe_load(f) + args = argparse.Namespace(**args) + + remapped_config = {} + + model_params = args.tts_conf["text2mel_params"] + # espnet_config_key -> hf_config_key, any keys not included are ignored + for espnet_config_key, hf_config_key in CONFIG_MAPPING.items(): + if espnet_config_key in model_params: + remapped_config[hf_config_key] = model_params[espnet_config_key] + + return remapped_config, args.g2p, args.token_list + + +def convert_espnet_state_dict_to_hf(state_dict): + new_state_dict = {} 
+ for key in state_dict: + if "tts.generator.text2mel." in key: + new_key = key.replace("tts.generator.text2mel.", "") + if "postnet" in key: + new_key = new_key.replace("postnet.postnet", "speech_decoder_postnet.layers") + new_key = new_key.replace(".0.weight", ".conv.weight") + new_key = new_key.replace(".1.weight", ".batch_norm.weight") + new_key = new_key.replace(".1.bias", ".batch_norm.bias") + new_key = new_key.replace(".1.running_mean", ".batch_norm.running_mean") + new_key = new_key.replace(".1.running_var", ".batch_norm.running_var") + new_key = new_key.replace(".1.num_batches_tracked", ".batch_norm.num_batches_tracked") + if "feat_out" in key: + if "weight" in key: + new_key = "speech_decoder_postnet.feat_out.weight" + if "bias" in key: + new_key = "speech_decoder_postnet.feat_out.bias" + if "encoder.embed.0.weight" in key: + new_key = new_key.replace("0.", "") + if "w_1" in key: + new_key = new_key.replace("w_1", "conv1") + if "w_2" in key: + new_key = new_key.replace("w_2", "conv2") + if "predictor.conv" in key: + new_key = new_key.replace(".conv", ".conv_layers") + pattern = r"(\d)\.(\d)" + replacement = ( + r"\1.conv" if ("2.weight" not in new_key) and ("2.bias" not in new_key) else r"\1.layer_norm" + ) + new_key = re.sub(pattern, replacement, new_key) + if "pitch_embed" in key or "energy_embed" in key: + new_key = new_key.replace("0", "conv") + if "encoders" in key: + new_key = new_key.replace("encoders", "conformer_layers") + new_key = new_key.replace("norm_final", "final_layer_norm") + new_key = new_key.replace("norm_mha", "self_attn_layer_norm") + new_key = new_key.replace("norm_ff_macaron", "ff_macaron_layer_norm") + new_key = new_key.replace("norm_ff", "ff_layer_norm") + new_key = new_key.replace("norm_conv", "conv_layer_norm") + if "lid_emb" in key: + new_key = new_key.replace("lid_emb", "language_id_embedding") + if "sid_emb" in key: + new_key = new_key.replace("sid_emb", "speaker_id_embedding") + + new_state_dict[new_key] = state_dict[key] + 
+ return new_state_dict + + +@torch.no_grad() +def convert_FastSpeech2ConformerModel_checkpoint( + checkpoint_path, + yaml_config_path, + pytorch_dump_folder_path, + repo_id=None, +): + model_params, tokenizer_name, vocab = remap_model_yaml_config(yaml_config_path) + config = FastSpeech2ConformerConfig(**model_params) + + # Prepare the model + model = FastSpeech2ConformerModel(config) + + espnet_checkpoint = torch.load(checkpoint_path) + hf_compatible_state_dict = convert_espnet_state_dict_to_hf(espnet_checkpoint) + + model.load_state_dict(hf_compatible_state_dict) + + model.save_pretrained(pytorch_dump_folder_path) + + # Prepare the tokenizer + with TemporaryDirectory() as tempdir: + vocab = {token: id for id, token in enumerate(vocab)} + vocab_file = Path(tempdir) / "vocab.json" + with open(vocab_file, "w") as f: + json.dump(vocab, f) + should_strip_spaces = "no_space" in tokenizer_name + tokenizer = FastSpeech2ConformerTokenizer(str(vocab_file), should_strip_spaces=should_strip_spaces) + + tokenizer.save_pretrained(pytorch_dump_folder_path) + + if repo_id: + print("Pushing to the hub...") + model.push_to_hub(repo_id) + tokenizer.push_to_hub(repo_id) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--checkpoint_path", required=True, default=None, type=str, help="Path to original checkpoint") + parser.add_argument( + "--yaml_config_path", required=True, default=None, type=str, help="Path to config.yaml of model to convert" + ) + parser.add_argument( + "--pytorch_dump_folder_path", required=True, default=None, type=str, help="Path to the output PyTorch model." + ) + parser.add_argument( + "--push_to_hub", default=None, type=str, help="Where to upload the converted model on the 🤗 hub." 
+ ) + + args = parser.parse_args() + convert_FastSpeech2ConformerModel_checkpoint( + args.checkpoint_path, + args.yaml_config_path, + args.pytorch_dump_folder_path, + args.push_to_hub, + ) diff --git a/src/transformers/models/fastspeech2_conformer/convert_hifigan.py b/src/transformers/models/fastspeech2_conformer/convert_hifigan.py new file mode 100644 index 00000000000000..ec9f57ce7142d6 --- /dev/null +++ b/src/transformers/models/fastspeech2_conformer/convert_hifigan.py @@ -0,0 +1,134 @@ +# coding=utf-8 +# Copyright 2023 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Convert FastSpeech2Conformer HiFi-GAN checkpoint.""" + +import argparse +from pathlib import Path + +import torch +import yaml + +from transformers import FastSpeech2ConformerHifiGan, FastSpeech2ConformerHifiGanConfig, logging + + +logging.set_verbosity_info() +logger = logging.get_logger("transformers.models.FastSpeech2Conformer") + + +def load_weights(checkpoint, hf_model, config): + vocoder_key_prefix = "tts.generator.vocoder." 
+ checkpoint = {k.replace(vocoder_key_prefix, ""): v for k, v in checkpoint.items() if vocoder_key_prefix in k} + + hf_model.apply_weight_norm() + + hf_model.conv_pre.weight_g.data = checkpoint["input_conv.weight_g"] + hf_model.conv_pre.weight_v.data = checkpoint["input_conv.weight_v"] + hf_model.conv_pre.bias.data = checkpoint["input_conv.bias"] + + for i in range(len(config.upsample_rates)): + hf_model.upsampler[i].weight_g.data = checkpoint[f"upsamples.{i}.1.weight_g"] + hf_model.upsampler[i].weight_v.data = checkpoint[f"upsamples.{i}.1.weight_v"] + hf_model.upsampler[i].bias.data = checkpoint[f"upsamples.{i}.1.bias"] + + for i in range(len(config.upsample_rates) * len(config.resblock_kernel_sizes)): + for j in range(len(config.resblock_dilation_sizes)): + hf_model.resblocks[i].convs1[j].weight_g.data = checkpoint[f"blocks.{i}.convs1.{j}.1.weight_g"] + hf_model.resblocks[i].convs1[j].weight_v.data = checkpoint[f"blocks.{i}.convs1.{j}.1.weight_v"] + hf_model.resblocks[i].convs1[j].bias.data = checkpoint[f"blocks.{i}.convs1.{j}.1.bias"] + + hf_model.resblocks[i].convs2[j].weight_g.data = checkpoint[f"blocks.{i}.convs2.{j}.1.weight_g"] + hf_model.resblocks[i].convs2[j].weight_v.data = checkpoint[f"blocks.{i}.convs2.{j}.1.weight_v"] + hf_model.resblocks[i].convs2[j].bias.data = checkpoint[f"blocks.{i}.convs2.{j}.1.bias"] + + hf_model.conv_post.weight_g.data = checkpoint["output_conv.1.weight_g"] + hf_model.conv_post.weight_v.data = checkpoint["output_conv.1.weight_v"] + hf_model.conv_post.bias.data = checkpoint["output_conv.1.bias"] + + hf_model.remove_weight_norm() + + +def remap_hifigan_yaml_config(yaml_config_path): + with Path(yaml_config_path).open("r", encoding="utf-8") as f: + args = yaml.safe_load(f) + args = argparse.Namespace(**args) + + vocoder_type = args.tts_conf["vocoder_type"] + if vocoder_type != "hifigan_generator": + raise TypeError(f"Vocoder config must be for `hifigan_generator`, but got {vocoder_type}") + + remapped_dict = {} + vocoder_params = 
args.tts_conf["vocoder_params"] + + # espnet_config_key -> hf_config_key + key_mappings = { + "channels": "upsample_initial_channel", + "in_channels": "model_in_dim", + "resblock_dilations": "resblock_dilation_sizes", + "resblock_kernel_sizes": "resblock_kernel_sizes", + "upsample_kernel_sizes": "upsample_kernel_sizes", + "upsample_scales": "upsample_rates", + } + for espnet_config_key, hf_config_key in key_mappings.items(): + remapped_dict[hf_config_key] = vocoder_params[espnet_config_key] + remapped_dict["sampling_rate"] = args.tts_conf["sampling_rate"] + remapped_dict["normalize_before"] = False + remapped_dict["leaky_relu_slope"] = vocoder_params["nonlinear_activation_params"]["negative_slope"] + + return remapped_dict + + +@torch.no_grad() +def convert_hifigan_checkpoint( + checkpoint_path, + pytorch_dump_folder_path, + yaml_config_path=None, + repo_id=None, +): + if yaml_config_path is not None: + config_kwargs = remap_hifigan_yaml_config(yaml_config_path) + config = FastSpeech2ConformerHifiGanConfig(**config_kwargs) + else: + config = FastSpeech2ConformerHifiGanConfig() + + model = FastSpeech2ConformerHifiGan(config) + + orig_checkpoint = torch.load(checkpoint_path) + load_weights(orig_checkpoint, model, config) + + model.save_pretrained(pytorch_dump_folder_path) + + if repo_id: + print("Pushing to the hub...") + model.push_to_hub(repo_id) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--checkpoint_path", required=True, default=None, type=str, help="Path to original checkpoint") + parser.add_argument("--yaml_config_path", default=None, type=str, help="Path to config.yaml of model to convert") + parser.add_argument( + "--pytorch_dump_folder_path", required=True, default=None, type=str, help="Path to the output PyTorch model." + ) + parser.add_argument( + "--push_to_hub", default=None, type=str, help="Where to upload the converted model on the 🤗 hub." 
+ ) + + args = parser.parse_args() + convert_hifigan_checkpoint( + args.checkpoint_path, + args.pytorch_dump_folder_path, + args.yaml_config_path, + args.push_to_hub, + ) diff --git a/src/transformers/models/fastspeech2_conformer/convert_model_with_hifigan.py b/src/transformers/models/fastspeech2_conformer/convert_model_with_hifigan.py new file mode 100644 index 00000000000000..2a780d5cf0b8ea --- /dev/null +++ b/src/transformers/models/fastspeech2_conformer/convert_model_with_hifigan.py @@ -0,0 +1,102 @@ +# coding=utf-8 +# Copyright 2023 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Convert FastSpeech2Conformer checkpoint.""" + +import argparse + +import torch + +from transformers import ( + FastSpeech2ConformerConfig, + FastSpeech2ConformerHifiGan, + FastSpeech2ConformerHifiGanConfig, + FastSpeech2ConformerModel, + FastSpeech2ConformerWithHifiGan, + FastSpeech2ConformerWithHifiGanConfig, + logging, +) + +from .convert_fastspeech2_conformer_original_pytorch_checkpoint_to_pytorch import ( + convert_espnet_state_dict_to_hf, + remap_model_yaml_config, +) +from .convert_hifigan import load_weights, remap_hifigan_yaml_config + + +logging.set_verbosity_info() +logger = logging.get_logger("transformers.models.FastSpeech2Conformer") + + +def convert_FastSpeech2ConformerWithHifiGan_checkpoint( + checkpoint_path, + yaml_config_path, + pytorch_dump_folder_path, + repo_id=None, +): + # Prepare the model + model_params, *_ = remap_model_yaml_config(yaml_config_path) + model_config = FastSpeech2ConformerConfig(**model_params) + + model = FastSpeech2ConformerModel(model_config) + + espnet_checkpoint = torch.load(checkpoint_path) + hf_compatible_state_dict = convert_espnet_state_dict_to_hf(espnet_checkpoint) + model.load_state_dict(hf_compatible_state_dict) + + # Prepare the vocoder + config_kwargs = remap_hifigan_yaml_config(yaml_config_path) + vocoder_config = FastSpeech2ConformerHifiGanConfig(**config_kwargs) + + vocoder = FastSpeech2ConformerHifiGan(vocoder_config) + load_weights(espnet_checkpoint, vocoder, vocoder_config) + + # Prepare the model + vocoder; the composite config only accepts sub-config dicts, so pass `to_dict()` of each + config = FastSpeech2ConformerWithHifiGanConfig(model_config.to_dict(), vocoder_config.to_dict()) + with_hifigan_model = FastSpeech2ConformerWithHifiGan(config) + with_hifigan_model.model = model + with_hifigan_model.vocoder = vocoder + + with_hifigan_model.save_pretrained(pytorch_dump_folder_path) + + if repo_id: + print("Pushing to the hub...") + with_hifigan_model.push_to_hub(repo_id) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() +
parser.add_argument("--checkpoint_path", required=True, default=None, type=str, help="Path to original checkpoint") + parser.add_argument( + "--yaml_config_path", required=True, default=None, type=str, help="Path to config.yaml of model to convert" + ) + parser.add_argument( + "--pytorch_dump_folder_path", + required=True, + default=None, + type=str, + help="Path to the output `FastSpeech2ConformerModel` PyTorch model.", + ) + parser.add_argument( + "--push_to_hub", default=None, type=str, help="Where to upload the converted model on the 🤗 hub." + ) + + args = parser.parse_args() + + convert_FastSpeech2ConformerWithHifiGan_checkpoint( + args.checkpoint_path, + args.yaml_config_path, + args.pytorch_dump_folder_path, + args.push_to_hub, + ) diff --git a/src/transformers/models/fastspeech2_conformer/modeling_fastspeech2_conformer.py b/src/transformers/models/fastspeech2_conformer/modeling_fastspeech2_conformer.py new file mode 100644 index 00000000000000..89148ee09d6fc5 --- /dev/null +++ b/src/transformers/models/fastspeech2_conformer/modeling_fastspeech2_conformer.py @@ -0,0 +1,1686 @@ +# coding=utf-8 +# Copyright 2023 The Espnet authors, IMS Toucan authors, and the HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
@dataclass
class FastSpeech2ConformerModelOutput(ModelOutput):
    """
    Output type of [`FastSpeech2ConformerModel`].

    Every field defaults to `None`, so a field is only meaningful when the model actually populated it.

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            Spectrogram generation loss.
        spectrogram (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_bins)`):
            The predicted spectrogram.
        encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
            Sequence of hidden-states at the output of the last layer of the encoder of the model.
        encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the encoder at the output of each layer plus the initial embedding outputs.
        encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attention weights of the encoder, after the attention softmax, used to compute the weighted average in the
            self-attention heads.
        decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the decoder at the output of each layer plus the initial embedding outputs.
        decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attention weights of the decoder, after the attention softmax, used to compute the weighted average in the
            self-attention heads.
        duration_outputs (`torch.LongTensor` of shape `(batch_size, max_text_length + 1)`, *optional*):
            Outputs of the duration predictor.
        pitch_outputs (`torch.FloatTensor` of shape `(batch_size, max_text_length + 1, 1)`, *optional*):
            Outputs of the pitch predictor.
        energy_outputs (`torch.FloatTensor` of shape `(batch_size, max_text_length + 1, 1)`, *optional*):
            Outputs of the energy predictor.

    """

    loss: Optional[torch.FloatTensor] = None
    spectrogram: Optional[torch.FloatTensor] = None
    encoder_last_hidden_state: Optional[torch.FloatTensor] = None
    encoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    encoder_attentions: Optional[Tuple[torch.FloatTensor]] = None
    decoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    decoder_attentions: Optional[Tuple[torch.FloatTensor]] = None
    duration_outputs: Optional[torch.LongTensor] = None
    pitch_outputs: Optional[torch.FloatTensor] = None
    energy_outputs: Optional[torch.FloatTensor] = None
+ encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the + self-attention heads. + decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the decoder at the output of each layer plus the initial embedding outputs. + decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the + self-attention heads. + duration_outputs (`torch.LongTensor` of shape `(batch_size, max_text_length + 1)`, *optional*): + Outputs of the duration predictor. + pitch_outputs (`torch.FloatTensor` of shape `(batch_size, max_text_length + 1, 1)`, *optional*): + Outputs of the pitch predictor. + energy_outputs (`torch.FloatTensor` of shape `(batch_size, max_text_length + 1, 1)`, *optional*): + Outputs of the energy predictor. + """ + + waveform: torch.FloatTensor = None + + +_CONFIG_FOR_DOC = "FastSpeech2ConformerConfig" + +FASTSPEECH2_CONFORMER_START_DOCSTRING = r""" + This model inherits from [`PreTrainedModel`]. 
# Class-level docstring template for the HiFi-GAN vocoder. The vocoder is built from a
# `FastSpeech2ConformerHifiGanConfig`, so that is the configuration class the `config` entry must point to.
HIFIGAN_START_DOCSTRING = r"""
    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
    etc.)

    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
    and behavior.

    Parameters:
        config ([`FastSpeech2ConformerHifiGanConfig`]):
            Model configuration class with all the parameters of the model. Initializing with a config file does not
            load the weights associated with the model, only the configuration. Check out the
            [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
def length_regulator(encoded_embeddings, duration_labels, speaking_speed=1.0):
    """
    Length regulator for feed-forward Transformer.

    This is the length regulator module described in `FastSpeech: Fast, Robust and Controllable Text to Speech`
    https://arxiv.org/pdf/1905.09263.pdf. The length regulator expands char or phoneme-level embedding features to
    frame-level by repeating each feature based on the corresponding predicted durations.

    Args:
        encoded_embeddings (`torch.Tensor` of shape `(batch_size, max_text_length, embedding_dim)`):
            Batch of sequences of char or phoneme embeddings.
        duration_labels (`torch.LongTensor` of shape `(batch_size, max_text_length)`):
            Number of output frames for each embedding. The tensor is never modified in place.
        speaking_speed (`float`, *optional*, defaults to 1.0):
            Factor the durations are multiplied by (and then rounded) before expanding.

    Returns:
        `torch.Tensor`:
            Replicated input tensor based on durations `(batch_size, time*, embedding_dim)`, where `time*` is the
            largest per-sequence duration sum. Shorter sequences are right-padded with zeros. The result has the
            dtype and device of `encoded_embeddings`.

    Raises:
        ValueError: If `speaking_speed` is not strictly positive.
    """

    if speaking_speed <= 0:
        raise ValueError("`speaking_speed` must be greater than 0.")
    elif speaking_speed != 1.0:
        duration_labels = torch.round(duration_labels.float() * speaking_speed).long()

    if duration_labels.sum() == 0:
        # Every duration in the batch is zero, which would produce an empty output. Fall back to one frame per
        # embedding, on a copy so that the caller's tensor is left untouched.
        duration_labels = duration_labels.clone()
        duration_labels[duration_labels.sum(dim=1).eq(0)] = 1

    # Calculate the maximum length needed
    max_len = torch.sum(duration_labels, dim=1).max()

    # Create a padded tensor to hold the results. It follows the dtype of the embeddings so that half/double
    # precision inputs are not silently cast to float32.
    hidden_states = torch.zeros(
        (encoded_embeddings.size(0), max_len, encoded_embeddings.size(2)),
        dtype=encoded_embeddings.dtype,
        device=encoded_embeddings.device,
    )

    # Loop through the batch and fill in the data
    for i, (encoded_embedding, target_duration) in enumerate(zip(encoded_embeddings, duration_labels)):
        repeated = torch.repeat_interleave(encoded_embedding, target_duration, dim=0)
        hidden_states[i, : repeated.size(0)] = repeated

    return hidden_states
+ + """ + + def __init__(self, config: FastSpeech2ConformerConfig): + super().__init__() + + self.conv_layers = nn.ModuleList() + self.log_domain_offset = 1.0 + + for layer_idx in range(config.duration_predictor_layers): + num_chans = config.duration_predictor_channels + input_channels = config.hidden_size if layer_idx == 0 else num_chans + layer = FastSpeech2ConformerPredictorLayer( + input_channels, + num_chans, + config.duration_predictor_kernel_size, + config.duration_predictor_dropout_rate, + ) + self.conv_layers.append(layer) + self.linear = nn.Linear(config.duration_predictor_channels, 1) + + def forward(self, encoder_hidden_states): + """ + Args: + hidden_states (`torch.Tensor` of shape `(batch_size, max_text_length, input_dim)`): + Batch of input sequences. + padding_masks (`torch.ByteTensor` of shape `(batch_size, max_text_length)`, *optional*): + Batch of masks indicating padded part. + + Returns: + `torch.Tensor`: Batch of predicted durations in log domain `(batch_size, max_text_length)`. 
+ + """ + # (batch_size, input_dim, max_text_length) + hidden_states = encoder_hidden_states.transpose(1, -1) + for layer in self.conv_layers: + hidden_states = layer(hidden_states) + + # NOTE: calculate in log domain, (batch_size, max_text_length) + hidden_states = self.linear(hidden_states.transpose(1, -1)).squeeze(-1) + + if not self.training: + # NOTE: calculate in linear domain + hidden_states = torch.clamp(torch.round(hidden_states.exp() - self.log_domain_offset), min=0).long() + + return hidden_states + + +# Copied from transformers.models.speecht5.modeling_speecht5.SpeechT5BatchNormConvLayer +class FastSpeech2ConformerBatchNormConvLayer(nn.Module): + def __init__(self, config, layer_id=0): + super().__init__() + + if layer_id == 0: + in_conv_dim = config.num_mel_bins + else: + in_conv_dim = config.speech_decoder_postnet_units + + if layer_id == config.speech_decoder_postnet_layers - 1: + out_conv_dim = config.num_mel_bins + else: + out_conv_dim = config.speech_decoder_postnet_units + + self.conv = nn.Conv1d( + in_conv_dim, + out_conv_dim, + kernel_size=config.speech_decoder_postnet_kernel, + stride=1, + padding=(config.speech_decoder_postnet_kernel - 1) // 2, + bias=False, + ) + self.batch_norm = nn.BatchNorm1d(out_conv_dim) + + if layer_id < config.speech_decoder_postnet_layers - 1: + self.activation = nn.Tanh() + else: + self.activation = None + + self.dropout = nn.Dropout(config.speech_decoder_postnet_dropout) + + def forward(self, hidden_states): + hidden_states = self.conv(hidden_states) + hidden_states = self.batch_norm(hidden_states) + if self.activation is not None: + hidden_states = self.activation(hidden_states) + hidden_states = self.dropout(hidden_states) + return hidden_states + + +class FastSpeech2ConformerSpeechDecoderPostnet(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.feat_out = nn.Linear(config.hidden_size, config.num_mel_bins * config.reduction_factor) + self.layers = nn.ModuleList( + 
class FastSpeech2ConformerPredictorLayer(nn.Module):
    """
    Single predictor stage: `Conv1d -> ReLU -> LayerNorm over the channel dimension -> Dropout`.

    The convolution uses stride 1 and a padding of `(kernel_size - 1) // 2`.
    """

    def __init__(self, input_channels, num_chans, kernel_size, dropout_rate):
        super().__init__()
        same_padding = (kernel_size - 1) // 2
        self.conv = nn.Conv1d(input_channels, num_chans, kernel_size, stride=1, padding=same_padding)
        self.activation = nn.ReLU()
        self.layer_norm = nn.LayerNorm(num_chans)
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, hidden_states):
        """
        Args:
            hidden_states (`torch.Tensor`): Channels-first input, as consumed by `nn.Conv1d`.

        Returns:
            `torch.Tensor`: Channels-first output with `num_chans` channels.
        """
        activated = self.activation(self.conv(hidden_states))

        # LayerNorm normalizes the last dimension, so move the channels there and back again
        normalized = self.layer_norm(activated.transpose(1, -1)).transpose(1, -1)

        return self.dropout(normalized)
class FastSpeech2ConformerAttention(nn.Module):
    """
    Multi-Head attention layer with relative position encoding. Details can be found in
    https://github.com/espnet/espnet/pull/2816. Paper: https://arxiv.org/abs/1901.02860.
    """

    def __init__(self, config: FastSpeech2ConformerConfig, module_config):
        """
        Construct an FastSpeech2ConformerAttention object.

        Args:
            config (`FastSpeech2ConformerConfig`): Provides `hidden_size`.
            module_config (`dict`): Provides `num_attention_heads` and `attention_dropout_rate`.
        """
        super().__init__()
        # We assume d_v always equals dim_key
        self.num_heads = module_config["num_attention_heads"]
        self.hidden_size = config.hidden_size
        self.dim_key = self.hidden_size // self.num_heads
        self.head_dim = self.hidden_size // self.num_heads
        self.linear_q = nn.Linear(self.hidden_size, self.hidden_size)
        self.linear_k = nn.Linear(self.hidden_size, self.hidden_size)
        self.linear_v = nn.Linear(self.hidden_size, self.hidden_size)
        self.linear_out = nn.Linear(self.hidden_size, self.hidden_size)
        self.dropout = nn.Dropout(p=module_config["attention_dropout_rate"])

        # linear transformation for positional encoding
        self.linear_pos = nn.Linear(self.hidden_size, self.hidden_size, bias=False)
        # these two learnable bias are used in matrix c and matrix d
        # as described in https://arxiv.org/abs/1901.02860 Section 3.3
        # NOTE: allocated uninitialized here; they are not given values anywhere in this class
        self.pos_bias_u = nn.Parameter(torch.empty(self.num_heads, self.head_dim))
        self.pos_bias_v = nn.Parameter(torch.empty(self.num_heads, self.head_dim))

    def shift_relative_position_tensor(self, pos_tensor):
        """
        Re-align relative-position scores so that column `j` of row `i` refers to key position `j`.

        Args:
            pos_tensor (torch.Tensor of shape (batch_size, head, time1, 2*time1-1)): Input tensor.

        Returns:
            torch.Tensor: The shifted tensor, keeping only the first `(2*time1-1) // 2 + 1` columns.
        """
        zero_pad = torch.zeros((*pos_tensor.size()[:3], 1), device=pos_tensor.device, dtype=pos_tensor.dtype)
        pos_tensor_padded = torch.cat([zero_pad, pos_tensor], dim=-1)

        # Reinterpreting the padded tensor with the last two sizes swapped shifts every row by one more position
        pos_tensor_padded = pos_tensor_padded.view(*pos_tensor.size()[:2], pos_tensor.size(3) + 1, pos_tensor.size(2))
        # only keep the positions from 0 to time2
        pos_tensor = pos_tensor_padded[:, :, 1:].view_as(pos_tensor)[:, :, :, : pos_tensor.size(-1) // 2 + 1]

        return pos_tensor

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        pos_emb: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = False,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
        """
        Compute 'Scaled Dot Product Attention' with rel. positional encoding.

        Args:
            hidden_states (`torch.Tensor` of shape `(batch, time, size)`): Values of the hidden states
            attention_mask (`torch.Tensor` of shape `(batch, 1, time)`, *optional*):
                Mask tensor; positions equal to 0 are masked out.
            pos_emb (`torch.Tensor` of shape `(batch, 2*time-1, size)`): Positional embedding tensor. Required, even
                though the signature gives it a default.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
        Returns:
            `tuple`: The output tensor of shape `(batch, time, size)` and the attention weights (`None` unless
            `output_attentions` is truthy).

        Raises:
            ValueError: If `pos_emb` is missing or `attention_mask` does not have shape `(batch, 1, time)`.
        """
        if pos_emb is None:
            raise ValueError("`pos_emb` must be provided to compute relative positional attention.")

        bsz, q_len, _ = hidden_states.size()
        query_states = self.linear_q(hidden_states).view(bsz, -1, self.num_heads, self.head_dim)
        key_states = self.linear_k(hidden_states).view(bsz, -1, self.num_heads, self.head_dim)
        value_states = self.linear_v(hidden_states).view(bsz, -1, self.num_heads, self.head_dim)

        bsz_pos = pos_emb.size(0)
        pos_encoding = self.linear_pos(pos_emb).view(bsz_pos, -1, self.num_heads, self.head_dim)

        # (batch_size, head, time1, dim_key)
        query_with_bias_u = (query_states + self.pos_bias_u).transpose(1, 2)
        # (batch_size, head, time1, dim_key)
        query_with_bias_v = (query_states + self.pos_bias_v).transpose(1, 2)

        # compute attention score
        # first compute matrix a and matrix c
        # as described in https://arxiv.org/abs/1901.02860 Section 3.3
        # (batch_size, head, time1, time2)
        matrix_ac = torch.matmul(query_with_bias_u, key_states.permute(0, 2, 3, 1))

        # compute matrix b and matrix d
        # (batch_size, head, time1, 2*time1-1)
        matrix_bd = torch.matmul(query_with_bias_v, pos_encoding.permute(0, 2, 3, 1))
        matrix_bd = self.shift_relative_position_tensor(matrix_bd)

        # (batch_size, head, time1, time2)
        scores = (matrix_ac + matrix_bd) / math.sqrt(self.dim_key)

        # Forward attention
        if attention_mask is not None:
            expected_size = (bsz, 1, q_len)
            if attention_mask.size() != expected_size:
                raise ValueError(f"Attention mask should be of size {expected_size}, but is {attention_mask.size()}")
            attention_mask = attention_mask.unsqueeze(1).eq(0)
            min_value = float(torch.finfo(scores.dtype).min)
            scores = scores.masked_fill(attention_mask, min_value)
            # zero the masked positions again after the softmax so they contribute exactly nothing
            attn_weights = torch.softmax(scores, dim=-1).masked_fill(attention_mask, 0.0)
        else:
            attn_weights = torch.softmax(scores, dim=-1)

        attn_weights = self.dropout(attn_weights)
        attn_output = torch.matmul(attn_weights, value_states.transpose(1, 2))
        attn_output = attn_output.transpose(1, 2).contiguous().view(bsz, q_len, -1)

        attn_output = self.linear_out(attn_output)

        if not output_attentions:
            attn_weights = None

        return attn_output, attn_weights
class FastSpeech2ConformerEncoderLayer(nn.Module):
    """
    One conformer block: optional macaron feed-forward, relative-position self-attention, optional convolution
    module and a feed-forward module, each wrapped in a residual connection with pre- or post-layer-norm.
    """

    def __init__(self, config: FastSpeech2ConformerConfig, module_config):
        """
        Args:
            config (`FastSpeech2ConformerConfig`):
                Provides `hidden_size`, `use_macaron_style_in_conformer` and `use_cnn_in_conformer`.
            module_config (`dict`):
                Provides `dropout_rate`, `normalize_before` and `concat_after` (plus whatever the sub-modules read).
        """
        super().__init__()

        # self-attention module definition
        self.self_attn = FastSpeech2ConformerAttention(config, module_config)

        # feed-forward module definition
        self.feed_forward = FastSpeech2ConformerMultiLayeredConv1d(config, module_config)

        self.macaron_style = config.use_macaron_style_in_conformer
        if self.macaron_style:
            self.feed_forward_macaron = FastSpeech2ConformerMultiLayeredConv1d(config, module_config)
            self.ff_macaron_layer_norm = nn.LayerNorm(config.hidden_size)
            # with two feed-forward modules, each one contributes half of its output to the residual
            self.ff_scale = 0.5
        else:
            self.ff_scale = 1.0

        # convolution module definition
        self.use_cnn_module = config.use_cnn_in_conformer
        if self.use_cnn_module:
            self.conv_module = FastSpeech2ConformerConvolutionModule(config, module_config)
            self.conv_layer_norm = nn.LayerNorm(config.hidden_size)
            self.final_layer_norm = nn.LayerNorm(config.hidden_size)

        self.ff_layer_norm = nn.LayerNorm(config.hidden_size)

        self.self_attn_layer_norm = nn.LayerNorm(config.hidden_size)

        self.dropout = nn.Dropout(module_config["dropout_rate"])
        self.size = config.hidden_size
        self.normalize_before = module_config["normalize_before"]
        self.concat_after = module_config["concat_after"]
        if self.concat_after:
            self.concat_linear = nn.Linear(config.hidden_size + config.hidden_size, config.hidden_size)

    def forward(
        self,
        hidden_states: torch.Tensor,
        pos_emb: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = False,
    ):
        """
        Compute encoded features.

        Args:
            hidden_states (`torch.Tensor` of shape `(batch, time, size)`): Input tensor.
            pos_emb (`torch.Tensor`): Positional embeddings tensor, forwarded unchanged to the self-attention module.
            attention_mask (`torch.Tensor`, *optional*):
                Attention mask for the input, forwarded unchanged to the self-attention module.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
        Returns:
            `tuple`: `(hidden_states,)` with the output tensor of shape `(batch, time, size)`, followed by the
            attention weights when `output_attentions` is truthy.

        """
        # whether to use macaron style
        if self.macaron_style:
            residual = hidden_states
            if self.normalize_before:
                hidden_states = self.ff_macaron_layer_norm(hidden_states)
            hidden_states = residual + self.ff_scale * self.dropout(self.feed_forward_macaron(hidden_states))
            if not self.normalize_before:
                hidden_states = self.ff_macaron_layer_norm(hidden_states)

        # multi-headed self-attention module
        residual = hidden_states
        if self.normalize_before:
            hidden_states = self.self_attn_layer_norm(hidden_states)

        attention_output, attention_scores = self.self_attn(
            hidden_states, attention_mask=attention_mask, pos_emb=pos_emb, output_attentions=output_attentions
        )

        if self.concat_after:
            # project [attention input ; attention output] back to the hidden size instead of applying dropout
            x_concat = torch.cat((hidden_states, attention_output), dim=-1)
            hidden_states = self.concat_linear(x_concat)
            hidden_states = residual + hidden_states
        else:
            hidden_states = self.dropout(attention_output)
            hidden_states = residual + hidden_states
        if not self.normalize_before:
            hidden_states = self.self_attn_layer_norm(hidden_states)

        # convolution module
        if self.use_cnn_module:
            residual = hidden_states
            if self.normalize_before:
                hidden_states = self.conv_layer_norm(hidden_states)
            hidden_states = self.conv_module(hidden_states)
            hidden_states = self.dropout(hidden_states)
            hidden_states = residual + hidden_states
            if not self.normalize_before:
                hidden_states = self.conv_layer_norm(hidden_states)

        # feed forward module
        residual = hidden_states
        if self.normalize_before:
            hidden_states = self.ff_layer_norm(hidden_states)
        hidden_states = self.feed_forward(hidden_states)
        hidden_states = self.dropout(hidden_states)
        hidden_states = residual + self.ff_scale * hidden_states
        if not self.normalize_before:
            hidden_states = self.ff_layer_norm(hidden_states)

        # `conv_module` and `final_layer_norm` are only created when the convolution module is enabled, so test the
        # flag: looking up the missing attribute on an `nn.Module` raises `AttributeError`.
        if self.use_cnn_module:
            hidden_states = self.final_layer_norm(hidden_states)

        outputs = (hidden_states,)

        if output_attentions:
            outputs += (attention_scores,)

        return outputs
+ """ + super().__init__() + input_channels = config.hidden_size + hidden_channels = module_config["linear_units"] + kernel_size = config.positionwise_conv_kernel_size + self.conv1 = nn.Conv1d(input_channels, hidden_channels, kernel_size, stride=1, padding=(kernel_size - 1) // 2) + self.conv2 = nn.Conv1d(hidden_channels, input_channels, kernel_size, stride=1, padding=(kernel_size - 1) // 2) + self.dropout = nn.Dropout(module_config["dropout_rate"]) + + def forward(self, hidden_states): + """ + Calculate forward propagation. + + Args: + hidden_states (torch.Tensor): Batch of input tensors (batch_size, time, input_channels). + + Returns: + torch.Tensor: Batch of output tensors (batch_size, time, hidden_channels). + """ + hidden_states = hidden_states.transpose(-1, 1) + hidden_states = self.conv1(hidden_states) + hidden_states = torch.relu(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.conv2(hidden_states) + hidden_states = hidden_states.transpose(-1, 1) + return hidden_states + + +class FastSpeech2ConformerRelPositionalEncoding(nn.Module): + """ + Args: + Relative positional encoding module (new implementation). Details can be found in + https://github.com/espnet/espnet/pull/2816. See : Appendix Batch in https://arxiv.org/abs/1901.02860 + config (`FastSpeech2ConformerConfig`): + FastSpeech2ConformerConfig instance. + module_config (`dict`): + Dictionary containing the encoder or decoder module configuration from the `FastSpeech2ConformerConfig`. + """ + + def __init__(self, config: FastSpeech2ConformerConfig, module_config): + """ + Construct an PositionalEncoding object. 
+ """ + super().__init__() + self.embed_dim = config.hidden_size + self.input_scale = math.sqrt(self.embed_dim) + self.dropout = nn.Dropout(p=module_config["positional_dropout_rate"]) + self.pos_enc = None + self.max_len = 5000 + self.extend_pos_enc(torch.tensor(0.0).expand(1, self.max_len)) + + def extend_pos_enc(self, x): + """Reset the positional encodings.""" + if self.pos_enc is not None: + # self.pos_enc contains both positive and negative parts + # the length of self.pos_enc is 2 * input_len - 1 + if self.pos_enc.size(1) >= x.size(1) * 2 - 1: + if self.pos_enc.dtype != x.dtype or self.pos_enc.device != x.device: + self.pos_enc = self.pos_enc.to(dtype=x.dtype, device=x.device) + return + # Suppose `i` means to the position of query vector and `j` means the + # position of key vector. We use position relative positions when keys + # are to the left (i>j) and negative relative positions otherwise (i