diff --git a/ebl/atf_importer/domain/legacy_atf_transformers.py b/ebl/atf_importer/domain/legacy_atf_transformers.py index c457f8d48..c11bd1fba 100644 --- a/ebl/atf_importer/domain/legacy_atf_transformers.py +++ b/ebl/atf_importer/domain/legacy_atf_transformers.py @@ -29,23 +29,24 @@ def transform(self, tree: Tree) -> Tree: result = super().transform(tree) return result if result else tree - def _transform_children(self, children): + def _transform_children(self, children: Sequence[Tree]): + index_correction = 0 for index, child in enumerate(children): - self._enter_node(index) - if self.is_classes_break_at(self.get_ancestors()): - result = child - elif isinstance(child, Tree): - result = self._transform_tree(child) - elif self.__visit_tokens__ and isinstance(child, Token): - result = self._call_userfunc_token(child) - else: - result = child - + self._enter_node(index - index_correction) + result = self._get_child_result(child) + self._exit_node() if result is not Discard: - self._exit_node() yield result - else: - self._exit_node() + + def _get_child_result(self, child: Tree) -> Tree: + if self.is_classes_break_at(self.get_ancestors()): + return child + elif isinstance(child, Tree): + return self._transform_tree(child) + elif self.__visit_tokens__ and isinstance(child, Token): + return self._call_userfunc_token(child) + else: + return child def _enter_node(self, index: int = 0) -> None: self.current_path.append(index) @@ -60,7 +61,8 @@ def get_ancestors(self) -> Sequence: tree = self.current_tree ancestors = [tree.data] for parent_index in self.current_path[:-1]: - ancestors.append(tree.children[parent_index].data) + ancestor = tree.children[parent_index] + ancestors.append(ancestor.data) tree = tree.children[parent_index] return ancestors @@ -69,8 +71,6 @@ def is_classes_break_at(self, node_classes: Sequence[str]) -> bool: class HalfBracketsTransformer(LegacyTransformer): - # ToDo: Check if works - def __init__(self, **kwargs): super().__init__(**kwargs) self.open = False @@ -80,40 +80,36 @@ def clear(self): self.open = False @v_args(inline=True) - def ebl_atf_text_line__LEGACY_OPEN_HALF_BRACKET(self, bracket: str) -> str: - print("! bbbbbb", bracket) - input() + def ebl_atf_text_line__open_legacy_damage(self, bracket: str) -> Discard: self.legacy_found = True self.open = True - return "" + return Discard @v_args(inline=True) - def ebl_atf_text_line__LEGACY_CLOSE_HALF_BRACKET(self, bracket: str) -> str: - print("! bbbbbb", bracket) - input() + def ebl_atf_text_line__close_legacy_damage(self, bracket: str) -> Discard: self.legacy_found = True self.open = False - return "" + return Discard @v_args(inline=True) - def ebl_atf_text_line__flags(self, flags: str): - print("! bbbbbb", flags) - input() - return flags + "#" if self.open else flags + def ebl_atf_text_line__flags(self, *flags) -> Tree: + damage_flag = Token("ebl_atf_text_line__DAMAGE", "#") if self.open else None + _flags = [flag for flag in [*flags, damage_flag] if flag] + return Tree("ebl_atf_text_line__flags", _flags if _flags else []) class OraccJoinerTransformer(LegacyTransformer): @v_args(inline=True) - def ebl_atf_text_line__LEGACY_ORACC_JOINER(self, bracket: str) -> str: - print("!!!!!!!!!!!!!!!!!!!! LEGACY_ORACC_JOINER") - self.legacy_found = True - return "-" + def ebl_atf_text_line__joiner(self, joiner: Token) -> Tree: + if joiner.type == "ebl_atf_text_line__LEGACY_ORACC_JOINER": + self.legacy_found = True + return Tree("ebl_atf_text_line__joiner", [Token("MINUS", "-")]) + return Tree("ebl_atf_text_line__joiner", [joiner]) class OraccSpecialTransformer(LegacyTransformer): @v_args(inline=True) def ebl_atf_text_line__LEGACY_ORACC_DISH_DIVIDER(self, child: str) -> Tree: - print("!!!!!!!!!!!!!!!!!!!! LEGACY_ORACC_DISH_DIVIDER") self.legacy_found = True return Tree( "ebl_atf_text_line__logogram_name_part", @@ -194,3 +190,12 @@ def _set_sub_index(self, sub_index: str) -> None: "ebl_atf_text_line__sub_index", [sub_index_value], ) + + +class UncertainSignTransformer(LegacyTransformer): + @v_args(inline=True) + def ebl_atf_text_line__legacy_uncertain_sign( + self, prefix: Tree, sign: Tree + ) -> Tree: + self.legacy_found = True + return sign diff --git a/ebl/atf_importer/domain/legacy_atf_visitor.py b/ebl/atf_importer/domain/legacy_atf_visitor.py index c68bc860a..24ddac49c 100644 --- a/ebl/atf_importer/domain/legacy_atf_visitor.py +++ b/ebl/atf_importer/domain/legacy_atf_visitor.py @@ -5,6 +5,7 @@ HalfBracketsTransformer, OraccJoinerTransformer, OraccSpecialTransformer, + UncertainSignTransformer, ) from ebl.atf_importer.domain.legacy_atf_transformers import LegacyTransformer @@ -18,8 +19,9 @@ index_and_accented_transformer = (AccentedIndexTransformer(), "all_children") -half_brackets_transformer = (HalfBracketsTransformer(), "first_child") -oracc_joiner_transformer = (OraccJoinerTransformer(), "first_child") +uncertain_sign_transformer = (UncertainSignTransformer(), "all_children") +half_brackets_transformer = (HalfBracketsTransformer(), "all_children") +oracc_joiner_transformer = (OraccJoinerTransformer(), "all_children") oracc_special_transformer = (OraccSpecialTransformer(), "first_child") @@ -34,9 +36,11 @@ class LegacyAtfVisitor(Visitor): ], "surrogate": [index_and_accented_transformer], "grapheme": [index_and_accented_transformer], - "_parts_pattern": [half_brackets_transformer], - "_parts_pattern_gloss": [half_brackets_transformer], - "LEGACY_ORACC_JOINER": [oracc_joiner_transformer], + "word": [ + uncertain_sign_transformer, + oracc_joiner_transformer, + ], + "text": [half_brackets_transformer], } def __init__(self): diff --git a/ebl/tests/atf_importer/test_atf_preprocessor.py b/ebl/tests/atf_importer/test_atf_preprocessor.py index 64f217357..8b3a372cc 100644 --- a/ebl/tests/atf_importer/test_atf_preprocessor.py +++ b/ebl/tests/atf_importer/test_atf_preprocessor.py @@ -2,18 +2,14 @@ import json from ebl.atf_importer.domain.atf_preprocessor import AtfPreprocessor -PROBLEMATIC_TEXT_LINES = [ - ( - "8. KAR <:> e-ṭe-ri :* KAR : e-ke-mu : LUGAL ina di-bi-ri : LUGAL ina " - "ud-da-a-ta", - "8. KAR < : > e-ṭe-ri :* KAR : e-ke-mu : LUGAL ina di-bi-ri : LUGAL ina " - "ud-da-a-ta", - ), +# ToDo: All transformers should be tested + +PARSE_AND_TRANSFORM_LEGACY = [ + ("1. ⸢16?! 15! 12⸣ 17", "1. 16?!# 15!# 12# 17"), + ("1. $BAD.$É $ME", "1. BAD.E₂ ME"), ( - "14. [...] x (x) še-e-hu $BAD $É $ME : ina GAŠAN-ia₅ {d}SUEN {" - "d}INANA--<É>.AN.NA", - "14. [...] x (x) še-e-hu BAD E₂ ME : ina GAŠAN-ia₅ {d}SUEN {" - "d}INANA-.AN.NA", + "1. {d}INANA--<É>.AN.NA", + "1. {d}INANA-.AN.NA", ), ( "1. ŠÚ ù ŠÚ<(šumma)> |ŠÚ+ŠÚ|", @@ -27,19 +23,40 @@ "1. [*] * *-*", "1. [DIŠ] DIŠ DIŠ-DIŠ", ), + ( + "1. <:>", + "1. < : >", + ), +] + +PROBLEMATIC_TEXT_LINES = [ ( "1. [*] AN#.GE₆ GAR-ma U₄ ŠÚ{+up} * AN.GE₆ GAR-ma {d}IŠKUR KA-šú ŠUB{" "+di} * AN.GE₆", "1. [DIŠ] AN#.GE₆ GAR-ma U₄ ŠU₂{+up} DIŠ AN.GE₆ GAR-ma {d}IŠKUR KA-šu₂ " "ŠUB{+di} DIŠ AN.GE₆", ), + ( + "8. KAR <:> e-ṭe-ri :* KAR : e-ke-mu : LUGAL ina di-bi-ri : LUGAL ina " + "ud-da-a-ta", + "8. KAR < : > e-ṭe-ri :* KAR : e-ke-mu : LUGAL ina di-bi-ri : LUGAL ina " + "ud-da-a-ta", + ), + ( + "14. [...] x (x) še-e-hu $BAD $É $ME : ina GAŠAN-ia₅ {d}SUEN {" + "d}INANA--<É>.AN.NA", + "14. [...] x (x) še-e-hu BAD E₂ ME : ina GAŠAN-ia₅ {d}SUEN {" + "d}INANA-.AN.NA", + ), ] -FOLLOWING_SIGN_IS_NOT_A_LOGOGRAM = ( - "5'. [...] x [...] x-šu₂? : kal : nap-ha-ri : $WA-wa-ru : ia-ar₂-ru", - "5'. [...] x [...] x-šu₂? : kal : nap-ha-ri : WA-wa-ru : ia-ar₂-ru", -) +FOLLOWING_SIGN_IS_NOT_A_LOGOGRAM = [ + ( + "5'. [...] x [...] x-šu₂? : kal : nap-ha-ri : $WA-wa-ru : ia-ar₂-ru", + "5'. [...] x [...] x-šu₂? : kal : nap-ha-ri : WA-wa-ru : ia-ar₂-ru", + ) +] LEGACY_GRAMMAR_SIGNS = [ ( @@ -53,19 +70,23 @@ @pytest.mark.parametrize( "legacy_line,ebl_line", - [*PROBLEMATIC_TEXT_LINES, FOLLOWING_SIGN_IS_NOT_A_LOGOGRAM, *LEGACY_GRAMMAR_SIGNS], + [ + *PARSE_AND_TRANSFORM_LEGACY, + *PROBLEMATIC_TEXT_LINES, + *FOLLOWING_SIGN_IS_NOT_A_LOGOGRAM, + *LEGACY_GRAMMAR_SIGNS, + ], ) def test_text_lines(legacy_line, ebl_line): - # ToDo: fix atf_preprocessor = AtfPreprocessor("../logs", 0) legacy_tree = atf_preprocessor.ebl_parser.parse(legacy_line) legacy_tree = atf_preprocessor.transform_legacy_atf(legacy_tree) expected_tree = atf_preprocessor.ebl_parser.parse(ebl_line) - print("RESULT:\n", legacy_tree.pretty()) - print("EXPECTED:\n", expected_tree.pretty()) + # print("RESULT:\n", legacy_tree) # .pretty()) + # print("EXPECTED:\n", expected_tree) # .pretty()) + # input() <- With `task test`: "OSError: pytest: reading from stdin while output is captured!" assert legacy_tree == expected_tree - input() lemma_lines = [] diff --git a/ebl/transliteration/domain/atf_parsers/lark_parser/ebl_atf_text_line.lark b/ebl/transliteration/domain/atf_parsers/lark_parser/ebl_atf_text_line.lark index fae62bf3f..df8bf41c0 100644 --- a/ebl/transliteration/domain/atf_parsers/lark_parser/ebl_atf_text_line.lark +++ b/ebl/transliteration/domain/atf_parsers/lark_parser/ebl_atf_text_line.lark @@ -64,11 +64,14 @@ _head: _have_both _tail: _WORD_SEPARATOR _omit_right* _have_both? | _WORD_SEPARATOR? _omit_left+ | _WORD_SEPARATOR _require_both + | _WORD_SEPARATOR? _require_both_optional_ws _require_both: commentary_protocol - | divider_variant - | divider - | line_break + | line_break + +_require_both_optional_ws: divider_variant + | divider + _have_both: tabulation | column_token | erasure