-
Notifications
You must be signed in to change notification settings - Fork 27
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Dev.casing #274
Dev.casing #274
Changes from 4 commits
1c7792f
996a060
a04aeff
3811a9e
d768d74
c31c66b
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -187,6 +187,10 @@ def test_case_sensitive(self): | |
self.assertEqual(transducer_case_sensitive("a").output_string, "a") | ||
self.assertEqual(transducer("A").output_string, "b") | ||
|
||
def test_case_equivalencies(self): | ||
with self.assertRaises(exceptions.MalformedMapping): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. this is a good compromise |
||
Mapping(rules=[{"in": "a", "out": "b"}], case_equivalencies={"a": "AA"}) | ||
|
||
def test_escape_special(self): | ||
mapping = Mapping(rules=[{"in": r"\d", "out": "digit"}]) | ||
mapping_escaped = Mapping( | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -3,6 +3,7 @@ | |
import os | ||
from unittest import TestCase, main | ||
|
||
from g2p.exceptions import MalformedMapping | ||
from g2p.mappings import Mapping | ||
from g2p.tests.public import PUBLIC_DIR | ||
from g2p.transducer import CompositeTransducer, Transducer, normalize_edges | ||
|
@@ -218,6 +219,42 @@ def test_deletion(self): | |
self.assertEqual(self.test_deletion_transducer_csv("a").output_string, "") | ||
self.assertEqual(self.test_deletion_transducer_json("a").output_string, "") | ||
|
||
def test_case_preservation(self): | ||
mapping = Mapping( | ||
rules=[ | ||
{"in": "'a", "out": "b"}, | ||
{"in": "e\u0301", "out": "f"}, | ||
{"in": "tl", "out": "λ"}, | ||
], | ||
case_sensitive=False, | ||
preserve_case=True, | ||
norm_form="NFC", | ||
case_equivalencies={"λ": "\u2144"}, | ||
) | ||
transducer = Transducer(mapping) | ||
self.assertEqual(transducer("'a").output_string, "b") | ||
self.assertEqual(transducer("'A").output_string, "B") | ||
self.assertEqual(transducer("E\u0301").output_string, "F") | ||
roedoejet marked this conversation as resolved.
Show resolved
Hide resolved
|
||
self.assertEqual(transducer("e\u0301").output_string, "f") | ||
# Test what happens in Heiltsuk. \u03BB (λ) should be capitalized as \u2144 (⅄) | ||
self.assertEqual(transducer("TLaba").output_string, "\u2144aba") | ||
self.assertEqual(transducer("tlaba").output_string, "λaba") | ||
# I guess it's arguable what should happen here, but I'll just change case if any of the characters are differently cased | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think these tests are pretty clear with what our implementation is |
||
self.assertEqual(transducer("Tlaba").output_string, "\u2144aba") | ||
# case equivalencies that are not the same length cause indexing errors in the current implementation | ||
with self.assertRaises(MalformedMapping): | ||
Mapping( | ||
rules=[ | ||
{"in": "'a", "out": "b"}, | ||
{"in": "e\u0301", "out": "f"}, | ||
{"in": "tl", "out": "λ"}, | ||
], | ||
case_sensitive=False, | ||
preserve_case=True, | ||
norm_form="NFC", | ||
case_equivalencies={"λ": "\u2144\u2144\u2144"}, | ||
) | ||
|
||
def test_normalize_edges(self): | ||
# Remove non-deletion edges with the same index as deletions | ||
bad_edges = [ | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -420,6 +420,7 @@ | |
def __init__(self, mapping: Mapping): | ||
self.mapping = mapping | ||
self.case_sensitive = mapping.case_sensitive | ||
self.preserve_case = mapping.preserve_case | ||
self.norm_form = mapping.norm_form | ||
self.out_delimiter = mapping.out_delimiter | ||
self._index_match_pattern = re.compile(r"(?<={)\d+(?=})") | ||
|
@@ -428,7 +429,7 @@ | |
def __repr__(self): | ||
return f"{self.__class__} between {self.mapping.in_lang} and {self.mapping.out_lang}" | ||
|
||
def __call__(self, to_convert: str, index: bool = False, debugger: bool = False): | ||
def __call__(self, to_convert: str): | ||
"""The basic method to transduce an input. A proxy for self.apply_rules. | ||
|
||
Args: | ||
|
@@ -439,7 +440,11 @@ | |
and output characters and their corresponding edges representing the indices | ||
of the transformation. | ||
""" | ||
return self.apply_rules(to_convert) | ||
tg = self.apply_rules(to_convert) | ||
if self.preserve_case: | ||
return preserve_case(tg, self.mapping.case_equivalencies) | ||
else: | ||
return tg | ||
|
||
@staticmethod | ||
def _pua_to_index(string: str) -> int: | ||
|
@@ -1257,3 +1262,53 @@ | |
else: | ||
return False | ||
return result | ||
|
||
|
||
def preserve_case( | ||
tg: TransductionGraph, case_equivalencies: Dict[str, str] = None | ||
) -> TransductionGraph: | ||
if case_equivalencies is None: | ||
case_equivalencies = {} | ||
reverse_case_equivalencies = {v: k for k, v in case_equivalencies.items()} | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I guess it's OK to assume this is a 1-1 mapping, as you do here. |
||
all_lower_case_equivalencies = case_equivalencies.keys() | ||
all_upper_case_equivalencies = case_equivalencies.values() | ||
new_string = "" | ||
for item in tg.substring_alignments(): | ||
in_sub = item[0] | ||
out_sub = item[1] | ||
any_in_upper = any(x.isupper() for x in in_sub) | ||
any_in_lower = any(x.islower() for x in in_sub) | ||
any_out_upper = any(x.isupper() for x in out_sub) | ||
any_out_lower = any(x.islower() for x in out_sub) | ||
# continue if character is un-caseable | ||
if ( | ||
out_sub not in case_equivalencies | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. So you're assuming case_equivalencies only lists complete rule outputs, rather than characters? What about There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. yes because we don't really have a notion of characters anywhere (re: the tokenization issue from before). here you would have to do |
||
and not any_out_upper | ||
and not any_out_lower | ||
): | ||
new_string += out_sub | ||
continue | ||
# lower case using case equivalencies if they exist | ||
if ( | ||
any_in_lower or in_sub in all_lower_case_equivalencies | ||
) and out_sub in all_upper_case_equivalencies: | ||
new_string += reverse_case_equivalencies[out_sub] | ||
continue | ||
# upper case using case equivalencies if they exist | ||
elif ( | ||
any_in_upper or in_sub in all_upper_case_equivalencies | ||
) and out_sub in all_lower_case_equivalencies: | ||
new_string += case_equivalencies[out_sub] | ||
continue | ||
# change to upper if required | ||
if any_in_upper and any_out_lower: | ||
new_string += out_sub.upper() | ||
continue | ||
# change to lower if required | ||
if any_in_lower and any_out_upper: | ||
new_string += out_sub.lower() | ||
continue | ||
# just in case, append the out_sub | ||
new_string += out_sub | ||
tg.output_string = new_string | ||
return tg | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Boy, it's hard to follow all this logic! We need more extensive unit testing just to make sure it works. E.g., exercise these:
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Yes, this works.
I don't fundamentally really understand what is going on here, and I can't reproduce the examples you have above in the rules even without changing anything so we need to talk more about this.
I disagree that for a rule a -> bb Axyz should get turned to Bbxyz. We will handle this as BBxyz, otherwise someone would need to add a case equivalency for bb and Bb, in which case it should work. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I guess we might need to ask Daisy what she thinks is right for kwk. And I guess we should also test with kwk and see what comes out with your current implementation. So for kwk my assumption has been that you can infer whether a small cap L or a small cap E should be interpreted as upper case by the case of the next letter. I suppose we should validate that assumption with Daisy before investing too much development effort making it work... |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I would like to see some tests about interactions between
case_sensitive
andpreserve_case
. Doescase_sensitive=False
andpreserve_case=True
work for example?There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Hum, good question. It actually only makes sense to have
case_sensitive=False
when you setpreserve_case=True
. Having both be true should raise an error.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The semantics of preserve case is that case does affect the meaning of the characters (i.e., the mapping is case insensitive) but should be preserved because it has non-phonetic meaning (e.g., proper name, beginning of the sentence).
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
So I just added a model validator that raises when the two are set incompatibly.