From cb3846d52ed375a5a166b906046cac5c15d08818 Mon Sep 17 00:00:00 2001 From: luoliyan Date: Mon, 29 Mar 2021 11:43:17 +0930 Subject: [PATCH] Update gTTS; fix failing unit tests --- Makefile | 2 +- chinese/_version.py | 2 +- chinese/about.py | 1 + chinese/config.json | 4 +- chinese/gui.py | 4 +- chinese/lib/gtts/lang.py | 151 ++++------ chinese/lib/gtts/langs.py | 64 +++++ chinese/lib/gtts/tests/__init__.py | 0 .../tests/input_files/test_cli_test_ascii.txt | 2 + .../tests/input_files/test_cli_test_utf8.txt | 5 + chinese/lib/gtts/tests/test_cli.py | 264 ++++++++++++++++++ chinese/lib/gtts/tests/test_lang.py | 23 ++ chinese/lib/gtts/tests/test_tts.py | 181 ++++++++++++ chinese/lib/gtts/tests/test_utils.py | 62 ++++ chinese/lib/gtts/tokenizer/tests/test_core.py | 73 +++++ .../tokenizer/tests/test_pre_processors.py | 30 ++ .../tokenizer/tests/test_tokenizer_cases.py | 44 +++ chinese/lib/gtts/tts.py | 55 ++-- chinese/lib/gtts/version.py | 2 +- chinese/tts.py | 4 +- tests/__init__.py | 1 + 21 files changed, 840 insertions(+), 134 deletions(-) create mode 100644 chinese/lib/gtts/langs.py create mode 100644 chinese/lib/gtts/tests/__init__.py create mode 100644 chinese/lib/gtts/tests/input_files/test_cli_test_ascii.txt create mode 100644 chinese/lib/gtts/tests/input_files/test_cli_test_utf8.txt create mode 100644 chinese/lib/gtts/tests/test_cli.py create mode 100644 chinese/lib/gtts/tests/test_lang.py create mode 100644 chinese/lib/gtts/tests/test_tts.py create mode 100644 chinese/lib/gtts/tests/test_utils.py create mode 100644 chinese/lib/gtts/tokenizer/tests/test_core.py create mode 100644 chinese/lib/gtts/tokenizer/tests/test_pre_processors.py create mode 100644 chinese/lib/gtts/tokenizer/tests/test_tokenizer_cases.py diff --git a/Makefile b/Makefile index 7a3d8a1..a8f8271 100644 --- a/Makefile +++ b/Makefile @@ -15,7 +15,7 @@ PROJECT_SHORT = chinese PROJECT_LONG = chinese-support-redux -VERSION = 0.14.0 +VERSION = 0.14.2 XDG_DATA_HOME ?= $(HOME)/.local/share ADDON_PATH = "$(XDG_DATA_HOME)/Anki2/addons21/$(PROJECT_LONG)" ZIP_NAME = $(PROJECT_LONG)-v$(VERSION).zip diff --git a/chinese/_version.py b/chinese/_version.py index ef91994..c41af0b 100644 --- a/chinese/_version.py +++ b/chinese/_version.py @@ -1 +1 @@ -__version__ = '0.14.0' +__version__ = '0.14.2' diff --git a/chinese/about.py b/chinese/about.py index 2520994..aa198b3 100644 --- a/chinese/about.py +++ b/chinese/about.py @@ -33,6 +33,7 @@ def showAbout(): contributors = [ 'Alex Griffin', 'Chris Hatch', + 'Joe Minicucci', 'Roland Sieker', 'Thomas TEMPÉ', ] diff --git a/chinese/config.json b/chinese/config.json index 726ea93..5147a67 100644 --- a/chinese/config.json +++ b/chinese/config.json @@ -1,8 +1,8 @@ { "firstRun": true, - "version": "0.14.0", + "version": "0.14.2", "enabledModels": [], - "speech": "google|zh-cn", + "speech": "google|zh-CN", "target": "pinyin", "max_examples": -1, "fields": { diff --git a/chinese/gui.py b/chinese/gui.py index 775c4a0..64faea3 100644 --- a/chinese/gui.py +++ b/chinese/gui.py @@ -41,8 +41,8 @@ SPEECH_ENGINES = { 'Baidu Translate': 'baidu|zh', - 'Google Mandarin (PRC)': 'google|zh-cn', - 'Google Mandarin (Taiwan)': 'google|zh-tw', + 'Google Mandarin (PRC)': 'google|zh-CN', + 'Google Mandarin (Taiwan)': 'google|zh-TW', 'Amazon Polly' : 'aws|Zhiyu', 'Disabled': None, } diff --git a/chinese/lib/gtts/lang.py b/chinese/lib/gtts/lang.py index 089c84c..fbb5e1f 100644 --- a/chinese/lib/gtts/lang.py +++ b/chinese/lib/gtts/lang.py @@ -1,4 +1,6 @@ # -*- coding: utf-8 -*- +from gtts.langs import _main_langs +from 
warnings import warn import logging __all__ = ['tts_langs'] @@ -14,13 +16,13 @@ def tts_langs(): Returns: dict: A dictionary of the type `{ '': ''}` - Where `` is an IETF language tag such as `en` or `pt-br`, + Where `` is an IETF language tag such as `en` or `zh-TW`, and `` is the full English name of the language, such as - `English` or `Portuguese (Brazil)`. + `English` or `Chinese (Mandarin/Taiwan)`. The dictionary returned combines languages from two origins: - - Languages fetched from Google Translate + - Languages fetched from Google Translate (pre-generated in :mod:`gtts.langs`) - Languages that are undocumented variations that were observed to work and present different dialects or accents. @@ -32,112 +34,65 @@ def tts_langs(): return langs -def _main_langs(): - """Define the main languages. +def _extra_langs(): + """Define extra languages. Returns: - dict: A dictionnary of the main languages extracted from - Google Translate. + dict: A dictionnary of extra languages manually defined. + + Variations of the ones generated in `_main_langs`, + observed to provide different dialects or accents or + just simply accepted by the Google Translate Text-to-Speech API. """ return { - 'af': 'Afrikaans', - 'ar': 'Arabic', - 'bn': 'Bengali', - 'bs': 'Bosnian', - 'ca': 'Catalan', - 'cs': 'Czech', - 'cy': 'Welsh', - 'da': 'Danish', - 'de': 'German', - 'el': 'Greek', - 'en': 'English', - 'eo': 'Esperanto', - 'es': 'Spanish', - 'et': 'Estonian', - 'fi': 'Finnish', - 'fr': 'French', - 'gu': 'Gujarati', - 'hi': 'Hindi', - 'hr': 'Croatian', - 'hu': 'Hungarian', - 'hy': 'Armenian', - 'id': 'Indonesian', - 'is': 'Icelandic', - 'it': 'Italian', - 'ja': 'Japanese', - 'jw': 'Javanese', - 'km': 'Khmer', - 'kn': 'Kannada', - 'ko': 'Korean', - 'la': 'Latin', - 'lv': 'Latvian', - 'mk': 'Macedonian', - 'ml': 'Malayalam', - 'mr': 'Marathi', - 'my': 'Myanmar (Burmese)', - 'ne': 'Nepali', - 'nl': 'Dutch', - 'no': 'Norwegian', - 'pl': 'Polish', - 'pt': 'Portuguese', - 'ro': 'Romanian', - 'ru': 'Russian', - 'si': 'Sinhala', - 'sk': 'Slovak', - 'sq': 'Albanian', - 'sr': 'Serbian', - 'su': 'Sundanese', - 'sv': 'Swedish', - 'sw': 'Swahili', - 'ta': 'Tamil', - 'te': 'Telugu', - 'th': 'Thai', - 'tl': 'Filipino', - 'tr': 'Turkish', - 'uk': 'Ukrainian', - 'ur': 'Urdu', - 'vi': 'Vietnamese', - 'zh-CN': 'Chinese' + # Chinese + 'zh-TW': 'Chinese (Mandarin/Taiwan)', + 'zh': 'Chinese (Mandarin)' } -def _extra_langs(): - """Define extra languages. +def _fallback_deprecated_lang(lang): + """Languages Google Text-to-Speech used to support. + + Language tags that don't work anymore, but that can + fallback to a more general language code to maintain + compatibility. + + Args: + lang (string): The language tag. Returns: - dict: A dictionnary of extra languages manually defined. + string: The language tag, as-is if not deprecated, + or a fallack if it exits. - Variations of the ones fetched by `_main_langs`, - observed to provide different dialects or accents or - just simply accepted by the Google Translate Text-to-Speech API. + Example: + ``en-GB`` returns ``en``. + ``en-gb`` returns ``en``. 
""" - return { - # Chinese - 'zh-cn': 'Chinese (Mandarin/China)', - 'zh-tw': 'Chinese (Mandarin/Taiwan)', - # English - 'en-us': 'English (US)', - 'en-ca': 'English (Canada)', - 'en-uk': 'English (UK)', - 'en-gb': 'English (UK)', - 'en-au': 'English (Australia)', - 'en-gh': 'English (Ghana)', - 'en-in': 'English (India)', - 'en-ie': 'English (Ireland)', - 'en-nz': 'English (New Zealand)', - 'en-ng': 'English (Nigeria)', - 'en-ph': 'English (Philippines)', - 'en-za': 'English (South Africa)', - 'en-tz': 'English (Tanzania)', - # French - 'fr-ca': 'French (Canada)', - 'fr-fr': 'French (France)', - # Portuguese - 'pt-br': 'Portuguese (Brazil)', - 'pt-pt': 'Portuguese (Portugal)', - # Spanish - 'es-es': 'Spanish (Spain)', - 'es-us': 'Spanish (United States)' + + deprecated = { + # '': [] + 'en': ['en-us', 'en-ca', 'en-uk', 'en-gb', 'en-au', 'en-gh', 'en-in', + 'en-ie', 'en-nz', 'en-ng', 'en-ph', 'en-za', 'en-tz'], + 'fr': ['fr-ca', 'fr-fr'], + 'pt': ['pt-br', 'pt-pt'], + 'es': ['es-es', 'es-us'], + 'zh-CN': ['zh-cn'], + 'zh-TW': ['zh-tw'], } + + for fallback_lang, deprecated_langs in deprecated.items(): + if lang.lower() in deprecated_langs: + msg = ( + "'{}' has been deprecated, falling back to '{}'. " + "This fallback will be removed in a future version." + ).format(lang, fallback_lang) + + warn(msg, DeprecationWarning) + log.warning(msg) + + return fallback_lang + + return lang \ No newline at end of file diff --git a/chinese/lib/gtts/langs.py b/chinese/lib/gtts/langs.py new file mode 100644 index 0000000..449f039 --- /dev/null +++ b/chinese/lib/gtts/langs.py @@ -0,0 +1,64 @@ +# Note: this file is generated +_langs = { + "af": "Afrikaans", + "ar": "Arabic", + "bn": "Bengali", + "bs": "Bosnian", + "ca": "Catalan", + "cs": "Czech", + "cy": "Welsh", + "da": "Danish", + "de": "German", + "el": "Greek", + "en": "English", + "eo": "Esperanto", + "es": "Spanish", + "et": "Estonian", + "fi": "Finnish", + "fr": "French", + "gu": "Gujarati", + "hi": "Hindi", + "hr": "Croatian", + "hu": "Hungarian", + "hy": "Armenian", + "id": "Indonesian", + "is": "Icelandic", + "it": "Italian", + "ja": "Japanese", + "jw": "Javanese", + "km": "Khmer", + "kn": "Kannada", + "ko": "Korean", + "la": "Latin", + "lv": "Latvian", + "mk": "Macedonian", + "ml": "Malayalam", + "mr": "Marathi", + "my": "Myanmar (Burmese)", + "ne": "Nepali", + "nl": "Dutch", + "no": "Norwegian", + "pl": "Polish", + "pt": "Portuguese", + "ro": "Romanian", + "ru": "Russian", + "si": "Sinhala", + "sk": "Slovak", + "sq": "Albanian", + "sr": "Serbian", + "su": "Sundanese", + "sv": "Swedish", + "sw": "Swahili", + "ta": "Tamil", + "te": "Telugu", + "th": "Thai", + "tl": "Filipino", + "tr": "Turkish", + "uk": "Ukrainian", + "ur": "Urdu", + "vi": "Vietnamese", + "zh-CN": "Chinese" +} + +def _main_langs(): + return _langs diff --git a/chinese/lib/gtts/tests/__init__.py b/chinese/lib/gtts/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/chinese/lib/gtts/tests/input_files/test_cli_test_ascii.txt b/chinese/lib/gtts/tests/input_files/test_cli_test_ascii.txt new file mode 100644 index 0000000..ce0019b --- /dev/null +++ b/chinese/lib/gtts/tests/input_files/test_cli_test_ascii.txt @@ -0,0 +1,2 @@ +Can you make pink a little more pinkish can you make pink a little more pinkish, nor can you make the font bigger? +How much will it cost the website doesn't have the theme i was going for. 
\ No newline at end of file diff --git a/chinese/lib/gtts/tests/input_files/test_cli_test_utf8.txt b/chinese/lib/gtts/tests/input_files/test_cli_test_utf8.txt new file mode 100644 index 0000000..5bde1bc --- /dev/null +++ b/chinese/lib/gtts/tests/input_files/test_cli_test_utf8.txt @@ -0,0 +1,5 @@ +这是一个三岁的小孩 +在讲述她从一系列照片里看到的东西。 +对这个世界, 她也许还有很多要学的东西, +但在一个重要的任务上, 她已经是专家了: +去理解她所看到的东西。 diff --git a/chinese/lib/gtts/tests/test_cli.py b/chinese/lib/gtts/tests/test_cli.py new file mode 100644 index 0000000..b801bda --- /dev/null +++ b/chinese/lib/gtts/tests/test_cli.py @@ -0,0 +1,264 @@ +# -*- coding: utf-8 -*- +import pytest +import re +import os +from click.testing import CliRunner +from gtts.cli import tts_cli + +# Need to look into gTTS' log output to test proper instantiation +# - Use testfixtures.LogCapture() b/c TestCase.assertLogs() needs py3.4+ +# - Clear 'gtts' logger handlers (set in gtts.cli) to reduce test noise +import logging +from testfixtures import LogCapture +logger = logging.getLogger('gtts') +logger.handlers = [] + + +"""Test options and arguments""" + + +def runner(args, input=None): + return CliRunner().invoke(tts_cli, args, input) + + +def runner_debug(args, input=None): + return CliRunner().invoke(tts_cli, args + ['--debug'], input) + + +# tests +def test_text_no_text_or_file(): + """One of (arg) and should be set""" + result = runner_debug([]) + + assert " required" in result.output + assert result.exit_code != 0 + + +def test_text_text_and_file(tmp_path): + """ (arg) and should not be set together""" + filename = tmp_path / 'test_and_file.txt' + filename.touch() + + result = runner_debug(['--file', str(filename), 'test']) + + assert " can't be used together" in result.output + assert result.exit_code != 0 + + +def test_text_empty(tmp_path): + """Exit on no text to speak (via )""" + filename = tmp_path / 'text_empty.txt' + filename.touch() + + result = runner_debug(['--file', str(filename)]) + + assert "No text to speak" in result.output + assert result.exit_code != 0 + + +# tests +def test_file_not_exists(): + """ should exist""" + result = runner_debug(['--file', 'notexist.txt', 'test']) + + assert "No such file or directory" in result.output + assert result.exit_code != 0 + + +# tests +@pytest.mark.net +def test_all(): + """Option should return a list of languages""" + result = runner(['--all']) + + # One or more of " xy: name" (\n optional to match the last) + # Ex. 
" xx: xxxxx\n xx-yy: xxxxx\n xx: xxxxx" + + assert re.match(r"^(?:\s{2}(\w{2}|\w{2}-\w{2}): .+\n?)+$", result.output) + assert result.exit_code == 0 + + +# tests +@pytest.mark.net +def test_lang_not_valid(): + """Invalid should display an error""" + result = runner(['--lang', 'xx', 'test']) + + assert "xx' not in list of supported languages" in result.output + assert result.exit_code != 0 + + +@pytest.mark.net +def test_lang_nocheck(): + """Invalid (with ) should display an error message from gtts""" + with LogCapture() as lc: + result = runner_debug(['--lang', 'xx', '--nocheck', 'test']) + + log = str(lc) + + assert 'lang: xx' in log + assert 'lang_check: False' in log + assert "Unsupported language 'xx'" in result.output + assert result.exit_code != 0 + +# Param set tests +@pytest.mark.net +def test_params_set(): + """Options should set gTTS instance arguments (read from debug log)""" + with LogCapture() as lc: + result = runner_debug(['--lang', 'fr', '--tld', 'es', '--slow', '--nocheck', 'test']) + + log = str(lc) + + assert 'lang: fr' in log + assert 'tld: es' in log + assert 'lang_check: False' in log + assert 'slow: True' in log + assert 'text: test' in log + assert result.exit_code == 0 + + +# Test all input methods +pwd = os.path.dirname(__file__) + +# Text for stdin ('-' for or ) +textstdin = """stdin +test +123""" + +# Text for stdin ('-' for or ) (Unicode) +textstdin_unicode = u"""你吃饭了吗? +你最喜欢哪部电影? +我饿了,我要去做饭了。""" + +# Text for and +text = """Can you make pink a little more pinkish can you make pink a little more pinkish, nor can you make the font bigger? +How much will it cost the website doesn't have the theme i was going for.""" + +textfile_ascii = os.path.join(pwd, 'input_files', 'test_cli_test_ascii.txt') + +# Text for and (Unicode) +text_unicode = u"""这是一个三岁的小孩 +在讲述她从一系列照片里看到的东西。 +对这个世界, 她也许还有很多要学的东西, +但在一个重要的任务上, 她已经是专家了: +去理解她所看到的东西。""" + +textfile_utf8 = os.path.join(pwd, 'input_files', 'test_cli_test_utf8.txt') + +""" +Method that mimics's LogCapture's __str__ method to make +the string in the comprehension a unicode literal for P2.7 +https://github.com/Simplistix/testfixtures/blob/32c87902cb111b7ede5a6abca9b597db551c88ef/testfixtures/logcapture.py#L149 +""" + + +def logcapture_str(lc): + if not lc.records: + return 'No logging captured' + + return '\n'.join([u"%s %s\n %s" % r for r in lc.actual()]) + + +@pytest.mark.net +def test_stdin_text(): + with LogCapture() as lc: + result = runner_debug(['-'], textstdin) + log = logcapture_str(lc) + + assert 'text: %s' % textstdin in log + assert result.exit_code == 0 + + +@pytest.mark.net +def test_stdin_text_unicode(): + with LogCapture() as lc: + result = runner_debug(['-'], textstdin_unicode) + log = logcapture_str(lc) + + assert u'text: %s' % textstdin_unicode in log + assert result.exit_code == 0 + + +@pytest.mark.net +def test_stdin_file(): + with LogCapture() as lc: + result = runner_debug(['--file', '-'], textstdin) + log = logcapture_str(lc) + + assert 'text: %s' % textstdin in log + assert result.exit_code == 0 + + +@pytest.mark.net +def test_stdin_file_unicode(): + with LogCapture() as lc: + result = runner_debug(['--file', '-'], textstdin_unicode) + log = logcapture_str(lc) + + assert 'text: %s' % textstdin_unicode in log + assert result.exit_code == 0 + + +@pytest.mark.net +def test_text(): + with LogCapture() as lc: + result = runner_debug([text]) + log = logcapture_str(lc) + + assert "text: %s" % text in log + assert result.exit_code == 0 + + +@pytest.mark.net +def test_text_unicode(): + with LogCapture() as lc: 
+ result = runner_debug([text_unicode]) + log = logcapture_str(lc) + + assert "text: %s" % text_unicode in log + assert result.exit_code == 0 + + +@pytest.mark.net +def test_file_ascii(): + with LogCapture() as lc: + result = runner_debug(['--file', textfile_ascii]) + log = logcapture_str(lc) + + assert "text: %s" % text in log + assert result.exit_code == 0 + + +@pytest.mark.net +def test_file_utf8(): + with LogCapture() as lc: + result = runner_debug(['--file', textfile_utf8]) + log = logcapture_str(lc) + + assert "text: %s" % text_unicode in log + assert result.exit_code == 0 + + +@pytest.mark.net +def test_stdout(): + result = runner(['test']) + + # The MP3 encoding (LAME 3.99.5) used to leave a signature in the raw output + # This no longer appears to be the case + assert result.exit_code == 0 + + +@pytest.mark.net +def test_file(tmp_path): + filename = tmp_path / 'out.mp3' + + result = runner(['test', '--output', str(filename)]) + + # Check if files created is > 2k + assert filename.stat().st_size > 2000 + assert result.exit_code == 0 + + +if __name__ == '__main__': + pytest.main(['-x', __file__]) diff --git a/chinese/lib/gtts/tests/test_lang.py b/chinese/lib/gtts/tests/test_lang.py new file mode 100644 index 0000000..bedc2d6 --- /dev/null +++ b/chinese/lib/gtts/tests/test_lang.py @@ -0,0 +1,23 @@ +# -*- coding: utf-8 -*- +import pytest +from gtts.lang import tts_langs, _extra_langs, _fallback_deprecated_lang +from gtts.langs import _main_langs + +"""Test language list""" + + +def test_main_langs(): + """Fetch languages successfully""" + # Safe to assume 'en' (English) will always be there + scraped_langs = _main_langs() + assert 'en' in scraped_langs + + +def test_deprecated_lang(): + """Test language deprecation fallback""" + with pytest.deprecated_call(): + assert _fallback_deprecated_lang('en-gb') == 'en' + + +if __name__ == '__main__': + pytest.main(['-x', __file__]) diff --git a/chinese/lib/gtts/tests/test_tts.py b/chinese/lib/gtts/tests/test_tts.py new file mode 100644 index 0000000..d2d2849 --- /dev/null +++ b/chinese/lib/gtts/tests/test_tts.py @@ -0,0 +1,181 @@ +# -*- coding: utf-8 -*- +import os +import pytest +from mock import Mock +from six.moves import urllib + +from gtts.tts import gTTS, gTTSError +from gtts.langs import _main_langs +from gtts.lang import _extra_langs + +# Testing all languages takes some time. +# Set TEST_LANGS envvar to choose languages to test. +# * 'main': Languages extracted from the Web +# * 'extra': Languagee set in Languages.EXTRA_LANGS +# * 'all': All of the above +# * : Languages tags list to test +# Unset TEST_LANGS to test everything ('all') +# See: langs_dict() + + +"""Construct a dict of suites of languages to test. 
+{ '' : } + +ex.: { 'fetch' : {'en': 'English', 'fr': 'French'}, + 'extra' : {'en': 'English', 'fr': 'French'} } +ex.: { 'environ' : ['en', 'fr'] } +""" +env = os.environ.get('TEST_LANGS') +if not env or env == 'all': + langs = _main_langs() + langs.update(_extra_langs()) +elif env == 'main': + langs = _main_langs() +elif env == 'extra': + langs = _extra_langs() +else: + env_langs = {l: l for l in env.split(',') if l} + langs = env_langs + + +@pytest.mark.net +@pytest.mark.parametrize('lang', langs.keys(), ids=list(langs.values())) +def test_TTS(tmp_path, lang): + """Test all supported languages and file save""" + + text = "This is a test" + """Create output .mp3 file successfully""" + for slow in (False, True): + filename = tmp_path / 'test_{}_.mp3'.format(lang) + # Create gTTS and save + tts = gTTS(text=text, lang=lang, slow=slow, lang_check=False) + tts.save(filename) + + # Check if files created is > 1.5 + assert filename.stat().st_size > 1500 + + +@pytest.mark.net +def test_unsupported_language_check(): + """Raise ValueError on unsupported language (with language check)""" + lang = 'xx' + text = "Lorem ipsum" + check = True + with pytest.raises(ValueError): + gTTS(text=text, lang=lang, lang_check=check) + + +def test_empty_string(): + """Raise AssertionError on empty string""" + text = "" + with pytest.raises(AssertionError): + gTTS(text=text) + + +def test_no_text_parts(tmp_path): + """Raises AssertionError on no content to send to API (no text_parts)""" + text = " ..,\n" + with pytest.raises(AssertionError): + filename = tmp_path / 'no_content.txt' + tts = gTTS(text=text) + tts.save(filename) + + +# Test write_to_fp()/save() cases not covered elsewhere in this file + +def test_bad_fp_type(): + """Raise TypeError if fp is not a file-like object (no .write())""" + # Create gTTS and save + tts = gTTS(text='test') + with pytest.raises(TypeError): + tts.write_to_fp(5) + + +@pytest.mark.net +def test_save(tmp_path): + """Save .mp3 file successfully""" + filename = tmp_path / 'save.mp3' + # Create gTTS and save + tts = gTTS(text='test') + tts.save(filename) + + # Check if file created is > 2k + assert filename.stat().st_size > 2000 + + +@pytest.mark.net +def test_get_bodies(): + """get request bodies list""" + tts = gTTS(text='test', tld='com', lang='en') + body = tts.get_bodies()[0] + assert 'test' in body + # \"en\" url-encoded + assert '%5C%22en%5C%22' in body + + +def test_msg(): + """Test gTTsError internal exception handling + Set exception message successfully""" + error1 = gTTSError('test') + assert 'test' == error1.msg + + error2 = gTTSError() + assert error2.msg is None + + +def test_infer_msg(): + """Infer message sucessfully based on context""" + + # Without response: + + # Bad TLD + ttsTLD = Mock(tld='invalid') + errorTLD = gTTSError(tts=ttsTLD) + assert errorTLD.msg == "Failed to connect. Probable cause: Host 'https://translate.google.invalid/' is not reachable" + + # With response: + + # 403 + tts403 = Mock() + response403 = Mock(status_code=403, reason='aaa') + error403 = gTTSError(tts=tts403, response=response403) + assert error403.msg == "403 (aaa) from TTS API. Probable cause: Bad token or upstream API changes" + + # 200 (and not lang_check) + tts200 = Mock(lang='xx', lang_check=False) + response404 = Mock(status_code=200, reason='bbb') + error200 = gTTSError(tts=tts200, response=response404) + assert error200.msg == "200 (bbb) from TTS API. Probable cause: No audio stream in response. 
Unsupported language 'xx'" + + # >= 500 + tts500 = Mock() + response500 = Mock(status_code=500, reason='ccc') + error500 = gTTSError(tts=tts500, response=response500) + assert error500.msg == "500 (ccc) from TTS API. Probable cause: Uptream API error. Try again later." + + # Unknown (ex. 100) + tts100 = Mock() + response100 = Mock(status_code=100, reason='ddd') + error100 = gTTSError(tts=tts100, response=response100) + assert error100.msg == "100 (ddd) from TTS API. Probable cause: Unknown" + + +@pytest.mark.net +def test_WebRequest(tmp_path): + """Test Web Requests""" + + text = "Lorem ipsum" + + """Raise gTTSError on unsupported language (without language check)""" + lang = 'xx' + check = False + + with pytest.raises(gTTSError): + filename = tmp_path / 'xx.txt' + # Create gTTS + tts = gTTS(text=text, lang=lang, lang_check=check) + tts.save(filename) + + +if __name__ == '__main__': + pytest.main(['-x', __file__]) diff --git a/chinese/lib/gtts/tests/test_utils.py b/chinese/lib/gtts/tests/test_utils.py new file mode 100644 index 0000000..e41c1c5 --- /dev/null +++ b/chinese/lib/gtts/tests/test_utils.py @@ -0,0 +1,62 @@ +# -*- coding: utf-8 -*- +import pytest +from gtts.utils import _minimize, _len, _clean_tokens, _translate_url + +delim = ' ' +Lmax = 10 + + +def test_ascii(): + _in = "Bacon ipsum dolor sit amet" + _out = ["Bacon", "ipsum", "dolor sit", "amet"] + assert _minimize(_in, delim, Lmax) == _out + + +def test_ascii_no_delim(): + _in = "Baconipsumdolorsitametflankcornedbee" + _out = ["Baconipsum", "dolorsitam", "etflankcor", "nedbee"] + assert _minimize(_in, delim, Lmax) == _out + + +def test_unicode(): + _in = u"这是一个三岁的小孩在讲述他从一系列照片里看到的东西。" + _out = [u"这是一个三岁的小孩在", u"讲述他从一系列照片里", u"看到的东西。"] + assert _minimize(_in, delim, Lmax) == _out + + +def test_startwith_delim(): + _in = delim + "test" + _out = ["test"] + assert _minimize(_in, delim, Lmax) == _out + + +def test_len_ascii(): + text = "Bacon ipsum dolor sit amet flank corned beef." 
+ assert _len(text) == 45 + + +def test_len_unicode(): + text = u"但在一个重要的任务上" + assert _len(text) == 10 + + +def test_only_space_and_punc(): + _in = [",(:)?", "\t ", "\n"] + _out = [] + assert _clean_tokens(_in) == _out + + +def test_strip(): + _in = [" Bacon ", "& ", "ipsum\r", "."] + _out = ["Bacon", "&", "ipsum"] + assert _clean_tokens(_in) == _out + + +def test_translate_url(): + _in = {"tld": "qwerty", "path": "asdf"} + _out = "https://translate.google.qwerty/asdf" + assert _translate_url(**_in) == _out + + +if __name__ == '__main__': + pytest.main(['-x', __file__]) diff --git a/chinese/lib/gtts/tokenizer/tests/test_core.py b/chinese/lib/gtts/tokenizer/tests/test_core.py new file mode 100644 index 0000000..8c89ecd --- /dev/null +++ b/chinese/lib/gtts/tokenizer/tests/test_core.py @@ -0,0 +1,73 @@ +# -*- coding: utf-8 -*- +import unittest +import re +from gtts.tokenizer.core import RegexBuilder, PreProcessorRegex, PreProcessorSub, Tokenizer + +# Tests based on classes usage examples +# See class documentation for details + + +class TestRegexBuilder(unittest.TestCase): + def test_regexbuilder(self): + rb = RegexBuilder('abc', lambda x: "{}".format(x)) + self.assertEqual(rb.regex, re.compile('a|b|c')) + + +class TestPreProcessorRegex(unittest.TestCase): + def test_preprocessorregex(self): + pp = PreProcessorRegex('ab', lambda x: "{}".format(x), 'c') + self.assertEqual(len(pp.regexes), 2) + self.assertEqual(pp.regexes[0].pattern, 'a') + self.assertEqual(pp.regexes[1].pattern, 'b') + + +class TestPreProcessorSub(unittest.TestCase): + def test_proprocessorsub(self): + sub_pairs = [('Mac', 'PC'), ('Firefox', 'Chrome')] + pp = PreProcessorSub(sub_pairs) + _in = "I use firefox on my mac" + _out = "I use Chrome on my PC" + self.assertEqual(pp.run(_in), _out) + + +class TestTokenizer(unittest.TestCase): + # tokenizer case 1 + def case1(self): + return re.compile(r"\,") + + # tokenizer case 2 + def case2(self): + return RegexBuilder('abc', lambda x: r"{}\.".format(x)).regex + + def test_tokenizer(self): + t = Tokenizer([self.case1, self.case2]) + _in = "Hello, my name is Linda a. Call me Lin, b. I'm your friend" + _out = [ + 'Hello', + ' my name is Linda ', + ' Call me Lin', + ' ', + " I'm your friend"] + self.assertEqual(t.run(_in), _out) + + def test_bad_params_not_list(self): + # original exception: TypeError + with self.assertRaises(TypeError): + Tokenizer(self.case1) + + def test_bad_params_not_callable(self): + # original exception: TypeError + with self.assertRaises(TypeError): + Tokenizer([100]) + + def test_bad_params_not_callable_returning_regex(self): + # original exception: AttributeError + def not_regex(): + return 1 + + with self.assertRaises(TypeError): + Tokenizer([not_regex]) + + +if __name__ == '__main__': + unittest.main() diff --git a/chinese/lib/gtts/tokenizer/tests/test_pre_processors.py b/chinese/lib/gtts/tokenizer/tests/test_pre_processors.py new file mode 100644 index 0000000..8c6a428 --- /dev/null +++ b/chinese/lib/gtts/tokenizer/tests/test_pre_processors.py @@ -0,0 +1,30 @@ +# -*- coding: utf-8 -*- +import unittest +from gtts.tokenizer.pre_processors import tone_marks, end_of_line, abbreviations, word_sub + + +class TestPreProcessors(unittest.TestCase): + def test_tone_marks(self): + _in = "lorem!ipsum?" + _out = "lorem! ipsum? " + self.assertEqual(tone_marks(_in), _out) + + def test_end_of_line(self): + _in = """test- +ing""" + _out = "testing" + self.assertEqual(end_of_line(_in), _out) + + def test_abbreviations(self): + _in = "jr. sr. dr." 
+ _out = "jr sr dr" + self.assertEqual(abbreviations(_in), _out) + + def test_word_sub(self): + _in = "Esq. Bacon" + _out = "Esquire Bacon" + self.assertEqual(word_sub(_in), _out) + + +if __name__ == '__main__': + unittest.main() diff --git a/chinese/lib/gtts/tokenizer/tests/test_tokenizer_cases.py b/chinese/lib/gtts/tokenizer/tests/test_tokenizer_cases.py new file mode 100644 index 0000000..13e63f2 --- /dev/null +++ b/chinese/lib/gtts/tokenizer/tests/test_tokenizer_cases.py @@ -0,0 +1,44 @@ +# -*- coding: utf-8 -*- +import unittest +from gtts.tokenizer.tokenizer_cases import tone_marks, period_comma, colon, other_punctuation, legacy_all_punctuation +from gtts.tokenizer import Tokenizer, symbols + + +class TestPreTokenizerCases(unittest.TestCase): + def test_tone_marks(self): + t = Tokenizer([tone_marks]) + _in = "Lorem? Ipsum!" + _out = ['Lorem?', 'Ipsum!'] + self.assertEqual(t.run(_in), _out) + + def test_period_comma(self): + t = Tokenizer([period_comma]) + _in = "Hello, it's 24.5 degrees in the U.K. today. $20,000,000." + _out = ['Hello', "it's 24.5 degrees in the U.K. today", '$20,000,000.'] + self.assertEqual(t.run(_in), _out) + + def test_colon(self): + t = Tokenizer([colon]) + _in = "It's now 6:30 which means: morning missing:space" + _out = ["It's now 6:30 which means", ' morning missing', 'space'] + self.assertEqual(t.run(_in), _out) + + def test_other_punctuation(self): + # String of the unique 'other punctuations' + other_punc_str = ''.join( + set(symbols.ALL_PUNC) - + set(symbols.TONE_MARKS) - + set(symbols.PERIOD_COMMA) - + set(symbols.COLON)) + + t = Tokenizer([other_punctuation]) + self.assertEqual(len(t.run(other_punc_str)) - 1, len(other_punc_str)) + + def test_legacy_all_punctuation(self): + t = Tokenizer([legacy_all_punctuation]) + self.assertEqual(len(t.run(symbols.ALL_PUNC)) - + 1, len(symbols.ALL_PUNC)) + + +if __name__ == '__main__': + unittest.main() diff --git a/chinese/lib/gtts/tts.py b/chinese/lib/gtts/tts.py index 3945dc6..43e3678 100644 --- a/chinese/lib/gtts/tts.py +++ b/chinese/lib/gtts/tts.py @@ -1,11 +1,15 @@ # -*- coding: utf-8 -*- from gtts.tokenizer import pre_processors, Tokenizer, tokenizer_cases from gtts.utils import _minimize, _len, _clean_tokens, _translate_url -from gtts.lang import tts_langs +from gtts.lang import tts_langs, _fallback_deprecated_lang from six.moves import urllib -from urllib.parse import quote -import urllib3 +try: + from urllib.parse import quote + import urllib3 +except ImportError: + from urllib import quote + import urllib2 import requests import logging import json @@ -38,10 +42,11 @@ class gTTS: Args: text (string): The text to be read. tld (string): Top-level domain for the Google Translate host, - i.e `https://translate.google.`. This is useful - when ``google.com`` might be blocked within a network but - a local or different Google host (e.g. ``google.cn``) is not. - Default is ``com``. + i.e `https://translate.google.`. Different Google domains + can produce different localized 'accents' for a given + language. This is also useful when ``google.com`` might be blocked + within a network but a local or different Google host + (e.g. ``google.cn``) is not. Default is ``com``. lang (string, optional): The language (IETF language tag) to read the text in. Default is ``en``. slow (bool, optional): Reads text more slowly. Defaults to ``False``. 
@@ -130,18 +135,21 @@ def __init__( self.tld = tld # Language - if lang_check: + self.lang_check = lang_check + self.lang = lang + + if self.lang_check: + # Fallback lang in case it is deprecated + self.lang = _fallback_deprecated_lang(lang) + try: langs = tts_langs() - if lang.lower() not in langs: - raise ValueError("Language not supported: %s" % lang) + if self.lang not in langs: + raise ValueError("Language not supported: %s" % lang) except RuntimeError as e: log.debug(str(e), exc_info=True) log.warning(str(e)) - self.lang_check = lang_check - self.lang = lang.lower() - # Read speed if slow: self.speed = Speed.SLOW @@ -220,18 +228,6 @@ def _package_rpc(self, text): espaced_rpc = json.dumps(rpc, separators=(',', ':')) return "f.req={}&".format(quote(espaced_rpc)) - def get_urls(self): - """Get TTS API request URL(s) that would be sent to the TTS API. - - Returns: - list: A list of TTS API request URLs to make. - - This is particularly useful to get the list of URLs generated - by ``gTTS`` but not yet fullfilled, - for example to be used by an external program. - """ - return [pr.url for pr in self._prepare_requests()] - def get_bodies(self): """Get TTS API request bodies(s) that would be sent to the TTS API. @@ -253,7 +249,12 @@ def write_to_fp(self, fp): """ # When disabling ssl verify in requests (for proxies and firewalls), # urllib3 prints an insecure warning on stdout. We disable that. - urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) + try: + urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) + except: + pass + + prepared_requests = self._prepare_requests() for idx, pr in enumerate(prepared_requests): @@ -356,4 +357,4 @@ def infer_msg(self, tts, rsp=None): elif status >= 500: cause = "Uptream API error. Try again later." - return "{}. Probable cause: {}".format(premise, cause) \ No newline at end of file + return "{}. Probable cause: {}".format(premise, cause) diff --git a/chinese/lib/gtts/version.py b/chinese/lib/gtts/version.py index 36a511e..f1edb19 100644 --- a/chinese/lib/gtts/version.py +++ b/chinese/lib/gtts/version.py @@ -1 +1 @@ -__version__ = '2.2.1' +__version__ = '2.2.2' diff --git a/chinese/tts.py b/chinese/tts.py index d8cf290..5517cef 100644 --- a/chinese/tts.py +++ b/chinese/tts.py @@ -22,7 +22,7 @@ class AudioDownloader: - def __init__(self, text, source='google|zh-cn'): + def __init__(self, text, source='google|zh-CN'): self.text = text self.service, self.lang = source.split('|') self.path = self.get_path() @@ -53,7 +53,7 @@ def download(self): return basename(self.path) def get_google(self): - tts = gTTS(self.text, lang=self.lang) + tts = gTTS(self.text, lang=self.lang, tld='cn') try: tts.save(self.path) except gTTSError as e: diff --git a/tests/__init__.py b/tests/__init__.py index 62e6012..35367fb 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -48,6 +48,7 @@ else: media_dir = 'collection.media' modules['gtts'] = MagicMock() + modules['gtts.tts'] = MagicMock() modules['requests'] = MagicMock() patch.dict('sys.modules', modules).start()
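
The add-on side of this change is the switch from the lowercase 'zh-cn'/'zh-tw' tags to the canonical 'zh-CN'/'zh-TW' forms used by gTTS 2.2.2; the old tags now go through _fallback_deprecated_lang, which warns and falls back to the canonical code. Below is a minimal sketch (not part of the patch) of the new behaviour, assuming the vendored gtts package under chinese/lib is on sys.path and that save() can reach the Translate endpoint; the lang='zh-CN', tld='cn' call shape mirrors AudioDownloader.get_google in chinese/tts.py.

    # Sketch: confirm the zh-cn -> zh-CN deprecation fallback and the new
    # gTTS call shape used by chinese/tts.py. save() needs network access.
    import warnings

    from gtts.lang import _fallback_deprecated_lang
    from gtts.tts import gTTS

    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter('always')
        # Old lowercase tag still resolves, but with a DeprecationWarning
        assert _fallback_deprecated_lang('zh-cn') == 'zh-CN'
        assert any(issubclass(w.category, DeprecationWarning) for w in caught)

    tts = gTTS('你好', lang='zh-CN', tld='cn')  # translate.google.cn host
    tts.save('nihao.mp3')

The bundled test files mark every case that talks to the Translate API with @pytest.mark.net, so the offline subset can be selected with a "not net" marker expression. A sketch, assuming pytest plus the test dependencies seen in the imports above (click, mock, six, testfixtures) are installed and the working directory is the repository root:

    # Run only the offline subset of the vendored gTTS tests; cases that hit
    # the Translate API are tagged @pytest.mark.net and are deselected here.
    import sys

    import pytest

    sys.path.insert(0, 'chinese/lib')  # make `import gtts` resolve to the vendored copy
    pytest.main(['-m', 'not net',
                 'chinese/lib/gtts/tests',
                 'chinese/lib/gtts/tokenizer/tests'])

Registering the net marker (for example in a pytest.ini markers section) avoids pytest's unknown-mark warning; the -m expression above works either way.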