From d9a07e5a09e50a9371f8b3a952fee0aa841e5ab3 Mon Sep 17 00:00:00 2001 From: David Huggins-Daines Date: Tue, 2 Apr 2024 22:51:50 -0400 Subject: [PATCH 1/2] fix: do not copy the input mapping filename when generating --- g2p/mappings/create_ipa_mapping.py | 4 +-- g2p/tests/test_cli.py | 41 ++++++++++++++++++++++++++---- 2 files changed, 38 insertions(+), 7 deletions(-) diff --git a/g2p/mappings/create_ipa_mapping.py b/g2p/mappings/create_ipa_mapping.py index fa11fb69..237ce232 100644 --- a/g2p/mappings/create_ipa_mapping.py +++ b/g2p/mappings/create_ipa_mapping.py @@ -199,8 +199,8 @@ def create_mapping( ) # Initialize mapping with input language parameters (as_is, - # case_sensitive, prevent_feeding, etc) - config = mapping_1.model_copy().model_dump() + # case_sensitive, prevent_feeding, etc) but *not* the same filename! + config = mapping_1.model_copy().model_dump(exclude={"rules_path"}) config = { **config, "authors": [f"Generated {dt.datetime.now()}"], diff --git a/g2p/tests/test_cli.py b/g2p/tests/test_cli.py index 180ede2f..221d9d02 100755 --- a/g2p/tests/test_cli.py +++ b/g2p/tests/test_cli.py @@ -24,6 +24,7 @@ update_schema, ) from g2p.log import LOGGER +from g2p.mappings import MappingConfig from g2p.mappings.langs import ( LANGS_DIR, LANGS_FILE_NAME, @@ -281,14 +282,44 @@ def test_convert_option_tl(self): result = self.runner.invoke(convert, "--tok-lang fra e\\'i oji oji-ipa") self.assertIn("eː'i", result.stdout) + def test_generate_mapping_config(self): + """Ensure that generate-mapping creates valid configuration.""" + # The underlying create_mapping() function is tested in + # test_create_mapping.py, and align_to_dummy_fallback() in + # test_fallback.py, with less expensive inputs than our real + # g2p mappings, and with predictable results. However, we do + # need to ensure that it creates/updates a correct + # configuration, so we test that here. + with tempfile.TemporaryDirectory() as tmpdir: + results = self.runner.invoke( + generate_mapping, ["--ipa", "atj", "--out-dir", tmpdir] + ) + self.assertEqual(results.exit_code, 0) + rulespath = os.path.join(tmpdir, "atj-ipa_to_eng-ipa.json") + self.assertTrue(os.path.exists(rulespath)) + confpath = os.path.join(tmpdir, "config-g2p.yaml") + config = MappingConfig.load_mapping_config_from_path(confpath) + self.assertEqual(len(config.mappings), 1) + self.assertEqual(config.mappings[0].rules_path, Path(rulespath)) + # Run it again, should get the same result + results = self.runner.invoke( + generate_mapping, ["--ipa", "atj", "--out-dir", tmpdir] + ) + self.assertEqual(results.exit_code, 0) + config = MappingConfig.load_mapping_config_from_path(confpath) + self.assertEqual(len(config.mappings), 1) + self.assertEqual(config.mappings[0].rules_path, Path(rulespath)) + # Run it with a different language, should get more config + results = self.runner.invoke( + generate_mapping, ["--ipa", "alq", "--out-dir", tmpdir] + ) + self.assertEqual(results.exit_code, 0) + config = MappingConfig.load_mapping_config_from_path(confpath) + self.assertEqual(len(config.mappings), 2) + def test_generate_mapping_errors(self): """Exercise various error situations with the g2p generate-mapping CLI command""" - # We don't exercise valid calls to generate_mapping here. The underlying - # create_mapping() function is tested in test_create_mapping.py, and - # align_to_dummy_fallback() in test_fallback.py, with less expensive - # inputs than our real g2p mappings, and with predictable results. - results = self.runner.invoke(generate_mapping) self.assertNotEqual(results.exit_code, 0) self.assertIn("Nothing to do", results.output) From ea042628b9419eb3844b67b63d68cc8875723542 Mon Sep 17 00:00:00 2001 From: David Huggins-Daines Date: Tue, 2 Apr 2024 23:06:22 -0400 Subject: [PATCH 2/2] fix: do not try to generate mappings for empty outputs --- g2p/mappings/__init__.py | 24 ++++++++++++++++-------- g2p/mappings/create_ipa_mapping.py | 4 ++-- g2p/tests/test_create_mapping.py | 20 ++++++++++++++++++++ 3 files changed, 38 insertions(+), 10 deletions(-) diff --git a/g2p/mappings/__init__.py b/g2p/mappings/__init__.py index 068c4e25..6006c611 100644 --- a/g2p/mappings/__init__.py +++ b/g2p/mappings/__init__.py @@ -44,7 +44,9 @@ class Mapping(_MappingModelDefinition): """Class for lookup tables""" def model_post_init(self, *_args, **_kwargs) -> None: - """After the model is constructed, we process the model specs by applying all the configuration to the rules (ie prevent feeding, unicode normalization etc..)""" + """After the model is constructed, we process the model specs by + applying all the configuration to the rules (ie prevent feeding, + unicode normalization etc..)""" if self.type == MAPPING_TYPE.mapping or self.type is None: # load abbreviations from path if self.abbreviations_path is not None and not self.abbreviations: @@ -81,7 +83,8 @@ def find_mapping( @staticmethod def find_mapping_by_id(map_id: str) -> "Mapping": - """Find the mapping with a given ID, i.e., the "id" found in the mapping, like in the "panphon_preprocessor" mapping.""" + """Find the mapping with a given ID, i.e., the "id" found in the + mapping, like in the "panphon_preprocessor" mapping.""" for mapping in MAPPINGS_AVAILABLE: if mapping.id == map_id: return deepcopy(mapping) @@ -89,9 +92,9 @@ def find_mapping_by_id(map_id: str) -> "Mapping": @staticmethod def load_mapping_from_path(path_to_mapping_config: Union[str, Path], index=0): - """Loads a mapping from a path, if there is more than one mapping, then it loads based on the int - provided to the 'index' argument. Default is 0. - """ + """Loads a mapping from a path, if there is more than one mapping, + then it loads based on the int provided to the 'index' + argument. Default is 0.""" mapping_config = MappingConfig.load_mapping_config_from_path( path_to_mapping_config ) @@ -122,13 +125,17 @@ def index(self, item): """Find the location of an item in self""" return self.rules.index(item) - def inventory(self, in_or_out: str = "in"): + def inventory(self, in_or_out: str = "in", non_empty: bool = False): """Return just inputs or outputs as inventory of mapping""" if in_or_out == "in": in_or_out = "rule_input" if in_or_out == "out": in_or_out = "rule_output" - return [getattr(x, in_or_out) for x in self.rules] + inv = [getattr(x, in_or_out) for x in self.rules] + if non_empty: + return [sym for sym in inv if sym != ""] + else: + return inv def plain_mapping(self): """Return the plain mapping for displaying or saving to disk. @@ -274,7 +281,8 @@ def rule_to_regex(self, rule: Union[Rule, dict]) -> Union[Pattern, None]: ) raise exceptions.MalformedMapping( f"Your regex in mapping between {in_lang} and {out_lang} is malformed. " - f"Do you have un-escaped regex characters in your input {inp}, contexts {rule.context_before}, {rule.context_after}?" + f"Do you have un-escaped regex characters in your input {inp}, " + f"contexts {rule.context_before}, {rule.context_after}?" ) from e return rule_regex diff --git a/g2p/mappings/create_ipa_mapping.py b/g2p/mappings/create_ipa_mapping.py index 237ce232..6e8213cb 100644 --- a/g2p/mappings/create_ipa_mapping.py +++ b/g2p/mappings/create_ipa_mapping.py @@ -190,8 +190,8 @@ def create_mapping( ) l1_is_xsampa, l2_is_xsampa = is_xsampa(map_1_name), is_xsampa(map_2_name) rules = align_inventories( - mapping_1.inventory(mapping_1_io), - mapping_2.inventory(mapping_2_io), + mapping_1.inventory(mapping_1_io, non_empty=True), + mapping_2.inventory(mapping_2_io, non_empty=True), l1_is_xsampa, l2_is_xsampa, distance=distance, diff --git a/g2p/tests/test_create_mapping.py b/g2p/tests/test_create_mapping.py index daae0a48..4e9ad740 100755 --- a/g2p/tests/test_create_mapping.py +++ b/g2p/tests/test_create_mapping.py @@ -4,6 +4,8 @@ Test all Mappings """ +import io +from contextlib import redirect_stderr from unittest import TestCase, main from g2p.log import LOGGER @@ -155,6 +157,24 @@ def test_distances(self): set_of_mappings.add(tuple(rule.rule_output for rule in mapping.rules)) self.assertGreater(len(set_of_mappings), 3) + def test_deletion_mapping(self): + """Ensure that deletion rules do not lead to spurious warnings.""" + src_mappings = [ + {"in": "foo", "out": ""}, + {"in": "ᐃ", "out": "i"}, + {"in": "ᐅ", "out": "u"}, + {"in": "ᐊ", "out": "a"}, + ] + src_mapping = Mapping(rules=src_mappings, in_lang="crj", out_lang="crj-ipa") + log_output = io.StringIO() + with redirect_stderr(log_output): + mapping = create_mapping(src_mapping, self.target_mapping) + self.assertFalse("WARNING" in log_output.getvalue()) + transducer = Transducer(mapping) + self.assertEqual(transducer("a").output_string, "ɑ") + self.assertEqual(transducer("i").output_string, "i") + self.assertEqual(transducer("u").output_string, "u") + if __name__ == "__main__": main()