From 69aa11f338279a5b16245fa233f8b4b5194ffccc Mon Sep 17 00:00:00 2001 From: Ilya Khait Date: Fri, 21 Feb 2025 19:52:02 +0000 Subject: [PATCH] Implement translation injection --- .../domain/atf_indexing_visitor.py | 57 +++++++++++++++ ebl/atf_importer/domain/atf_preprocessor.py | 69 ++++++++++++++++-- .../domain/legacy_atf_transformers.py | 73 ++++++------------- ebl/atf_importer/domain/legacy_atf_visitor.py | 12 ++- .../atf_importer/test_atf_preprocessor.py | 32 ++++---- .../atf_parsers/lark_parser/ebl_atf.lark | 3 +- .../lark_parser/ebl_atf_at_line.lark | 5 +- .../lark_parser/ebl_atf_dollar_line.lark | 4 +- 8 files changed, 173 insertions(+), 82 deletions(-) create mode 100644 ebl/atf_importer/domain/atf_indexing_visitor.py diff --git a/ebl/atf_importer/domain/atf_indexing_visitor.py b/ebl/atf_importer/domain/atf_indexing_visitor.py new file mode 100644 index 000000000..cddc7fd2c --- /dev/null +++ b/ebl/atf_importer/domain/atf_indexing_visitor.py @@ -0,0 +1,57 @@ +import roman +from lark.visitors import Visitor, Tree + +surface_mapping = { + "obverse": "o", + "reverse": "r", + "bottom": "b.e.", + "edge": "e.", + "left": "l.e.", + "right": "r.e.", + "top": "t.e.", +} + + +class IndexingVisitor(Visitor): + def __init__(self): + super().__init__() + self._reset() + + def _reset(self) -> None: + self.column_counter = 1 + self.cursor = {"surface": None, "column": None, "line": None} + + def ebl_atf_at_line__surface_with_status(self, tree: Tree) -> Tree: + surface = surface_mapping[str(tree.children[0])] + "".join( + str(child) for child in tree.children[1].children + ) + self.cursor["surface"] = surface + return tree + + def ebl_atf_at_line__legacy_column(self, tree: Tree) -> Tree: + self.cursor["column"] = roman.toRoman(self.column_counter).lower() + self.column_counter += 1 + return tree + + def ebl_atf_at_line__column(self, tree: Tree) -> Tree: + self.cursor["column"] = roman.toRoman(int(tree.children[0])).lower() + "".join( # toRoman only accepts int + str(child) for child in
tree.children[1].children + ) + return tree + + def text_line(self, tree: Tree) -> Tree: + line_number = "".join( + str(child) for child in tree.children[0].children if child + ) + self.cursor["line"] = line_number + return tree + + @property + def cursor_index(self) -> str: + return " ".join( + [ + str(self.cursor[key]) + for key in ["surface", "column", "line"] + if self.cursor[key] + ] + ) diff --git a/ebl/atf_importer/domain/atf_preprocessor.py b/ebl/atf_importer/domain/atf_preprocessor.py index a1fbfc4d7..21d7c5a4a 100644 --- a/ebl/atf_importer/domain/atf_preprocessor.py +++ b/ebl/atf_importer/domain/atf_preprocessor.py @@ -3,13 +3,20 @@ from typing import Tuple, Optional, List, Dict, Any from ebl.atf_importer.domain.atf_preprocessor_base import AtfPreprocessorBase from ebl.atf_importer.domain.atf_preprocessor_util import Util -from ebl.atf_importer.domain.legacy_atf_visitor import LegacyAtfVisitor -#from ebl.transliteration.domain.line_transformer import LineTransformer +from ebl.atf_importer.domain.legacy_atf_visitor import ( + LegacyAtfVisitor, + translation_block_transformer, +) + +# from ebl.transliteration.domain.line_transformer import LineTransformer +from ebl.atf_importer.domain.atf_indexing_visitor import IndexingVisitor class AtfPreprocessor(AtfPreprocessorBase): + indexing_visitor = IndexingVisitor() legacy_visitor = LegacyAtfVisitor() - #line_transformer = LineTransformer() + translation_block_transformer = translation_block_transformer[0] + # line_transformer = LineTransformer() def convert_lines_from_string(self, text: str) -> List[Dict[str, Any]]: return self._convert_lines(text.split("\n")) @@ -19,11 +26,64 @@ def convert_lines_from_path(self, path: str, filename: str) -> List[Dict[str, An lines = self.read_lines_from_path(path) return self._convert_lines(lines) + def _parse_and_index_lines(self, lines: List[str]) -> None: + line_trees = [] + for line in lines: + # ToDo: Parsing should happen here and ONLY here + # Continue from here. 
Translation injection seems to work. + # Now it should be properly tested. + # For that, modify the logic and output a proper result. + # Also, add column logic to detect the end of the text + # and reset the legacy column transformer. + line_tree = self.ebl_parser.parse(line) + self.indexing_visitor.visit(line_tree) + self.legacy_visitor.visit(line_tree) + cursor = ( + self.indexing_visitor.cursor_index + if line_tree.data == "text_line" + else None + ) + if "translation_line" in line_tree.data: + line_trees = self._handle_legacy_translation(line_tree, line_trees) + else: + line_trees.append({"line": line_tree, "cursor": cursor}) + + def _handle_legacy_translation( + self, translation_line: Tree, line_trees: List[Dict[str, Any]] + ) -> List[Dict[str, Any]]: + if translation_line.data == "translation_line": + insert_at = self.translation_block_transformer.start + line_trees = self._insert_translation_line( + translation_line, insert_at, line_trees + ) + return line_trees + + def _insert_translation_line( + self, translation_line: Tree, insert_at: str, line_trees: List[Dict[str, Any]] + ) -> List[Dict[str, Any]]: + for index, tree_line in enumerate(line_trees): + if insert_at == tree_line["cursor"]: + if ( + index + 1 < len(line_trees) # the match may be the last entry + and line_trees[index + 1]["line"].data == "translation_line" + ): + line_trees[index + 1] = { + "cursor": None, + "line": translation_line, + } + else: + line_trees.insert( + index + 1, {"cursor": None, "line": translation_line} + ) + break + return line_trees + def _convert_lines(self, lines: List[str]) -> List[Dict[str, Any]]: + self._parse_and_index_lines(lines) # ToDo: Implement further logic processed_lines = [] for line in lines: result = self.process_line(line) - #c_line = self.line_transformer.transform(result[0]) + # c_line = self.line_transformer.transform(result[0]) processed_lines.append( { "c_line": result[0], @@ -74,7 +134,6 @@ def check_original_line( def transform_legacy_atf(self, tree: Tree) -> Tree:
self.legacy_visitor.visit(tree) - # print('!!!! visitor.legacy_found', visitor.legacy_found) if self.legacy_visitor.legacy_found: self.logger.info("Legacy line successfully parsed") return tree diff --git a/ebl/atf_importer/domain/legacy_atf_transformers.py b/ebl/atf_importer/domain/legacy_atf_transformers.py index 644be03b5..6139b7003 100644 --- a/ebl/atf_importer/domain/legacy_atf_transformers.py +++ b/ebl/atf_importer/domain/legacy_atf_transformers.py @@ -2,6 +2,7 @@ from typing import Optional, List, Sequence, Union, Type from lark.visitors import Transformer, Tree, Token, v_args, Discard from ebl.transliteration.domain.atf import _SUB_SCRIPT +from ebl.transliteration.domain.common_transformer import CommonTransformer # ToDo: Continue from here # Make sure every transformer is implemented and works properly. @@ -31,46 +32,6 @@ def transform(self, tree: Tree) -> Tree: result = super().transform(tree) return result if result else tree - def _transform_children(self, children: Sequence[Tree]): - index_correction = 0 - for index, child in enumerate(children): - self._enter_node(index - index_correction) - result = self._get_child_result(child) - self._exit_node() - if result is not Discard: - yield result - - def _get_child_result(self, child: Tree) -> Tree: - if self.is_classes_break_at(self.get_ancestors()): - return child - elif isinstance(child, Tree): - return self._transform_tree(child) - elif self.__visit_tokens__ and isinstance(child, Token): - return self._call_userfunc_token(child) - else: - return child - - def _enter_node(self, index: int = 0) -> None: - self.current_path.append(index) - - def _exit_node(self) -> None: - if self.current_path: - self.current_path.pop() - - def get_ancestors(self) -> Sequence: - if not self.current_tree: - return [] - tree = self.current_tree - ancestors = [tree.data] - for parent_index in self.current_path[:-1]: - ancestor = tree.children[parent_index] - ancestors.append(ancestor.data) - tree = 
tree.children[parent_index] - return ancestors - - def is_classes_break_at(self, node_classes: Sequence[str]) -> bool: - return not set(node_classes).isdisjoint(self.break_at) - def to_token(self, name: str, string: Optional[str]) -> Token: return ( Token(f"{self.prefix}__{name}", string) @@ -245,6 +206,10 @@ def ebl_atf_text_line__VALUE_CHARACTER(self, token: Token) -> Token: class LegacyColumnTransformer(LegacyTransformer): prefix = "" + # ToDo: + # Add indexing to detect the beginning of the text. + # Then reset the column number when a new text begins. + def __init__(self, **kwargs) -> None: super().__init__(**kwargs) self.column_number = 1 @@ -285,17 +250,20 @@ def __init__(self, **kwargs) -> None: def _reset(self) -> None: self.language: Optional[Token] = None - self.start: Optional[Sequence[Tree]] = None + self.start: Optional[str] = None self.extent: Optional[Sequence[Tree]] = None self.translation: Sequence[str] = [] @property def translation_c_line(self) -> Sequence[Union[Tree, Token]]: - return [ - self.language, - self._translation_extent, - self._translation_string_part, - ] + return self.to_tree( + "translation_line", + [ + self.language, + self._translation_extent, + self._translation_string_part, + ], + ) @property def _translation_extent(self) -> Tree: @@ -325,7 +293,7 @@ def ebl_atf_translation_line__legacy_translation_block_at_line( @v_args(inline=True) def ebl_atf_translation_line__labels_start(self, labels: Tree) -> None: self.legacy_found = True - self.start = labels.children + self.start = self._labels_to_string(labels) return @v_args(inline=True) @@ -334,8 +302,13 @@ def ebl_atf_translation_line__labels_extent(self, labels: Tree) -> None: self.extent = labels.children return - # def _extract_labels(self, labels: Tree) -> Sequence[Tree]: - # return labels.children[0].children + [labels.children[1]] + def _labels_to_string(self, labels: Tree) -> str: + labels, line_number = CommonTransformer().transform(labels).children + return ( + "
".join(label.to_value() for label in labels) + + " " + + str(line_number.number) + ) @v_args(inline=True) def ebl_atf_translation_line__legacy_translation_block_line( @@ -343,4 +316,4 @@ def ebl_atf_translation_line__legacy_translation_block_line( ) -> None: self.legacy_found = True self.translation.append(str(text.children[0])) - return + return self.translation_c_line diff --git a/ebl/atf_importer/domain/legacy_atf_visitor.py b/ebl/atf_importer/domain/legacy_atf_visitor.py index a5a539499..47104d0b1 100644 --- a/ebl/atf_importer/domain/legacy_atf_visitor.py +++ b/ebl/atf_importer/domain/legacy_atf_visitor.py @@ -32,7 +32,7 @@ prime_transformer = (LegacyPrimeTransformer(), "children") aleph_transformer = (LegacyAlephTransformer(), "children") column_transformer = (LegacyColumnTransformer(), "tree") -translation_block_transformer = (LegacyTranslationBlockTransformer(), "tree") +translation_block_transformer = (LegacyTranslationBlockTransformer(), "first_child") class LegacyAtfVisitor(Visitor): @@ -62,6 +62,7 @@ class LegacyAtfVisitor(Visitor): "value_name_part": [aleph_transformer], "at_line_value": [column_transformer], "legacy_column": [column_transformer], + "text_line": [translation_block_transformer], "legacy_translation_line": [translation_block_transformer], } @@ -72,8 +73,8 @@ def __init__(self): prefix = self.text_line_prefix if suffix in ["legacy_column"]: prefix = self.at_line_prefix - elif "legacy_translation" in suffix: - prefix = "" #self.translation_line_prefix + elif suffix in ["legacy_translation_line", "text_line"]: + prefix = "" self._set_rules(suffix, transformers, prefix) def _set_rules( @@ -108,7 +109,10 @@ def _transform( transformed_tree = transformer.transform(tree) if transformer.legacy_found: self.legacy_found = True - if replace == "tree": + if replace == "first_child" and transformed_tree.children[0]: + tree.data = transformed_tree.children[0].data + tree.children = transformed_tree.children[0].children + elif replace == "tree": 
tree.data = transformed_tree.data tree.children = transformed_tree.children elif replace == "children": diff --git a/ebl/tests/atf_importer/test_atf_preprocessor.py b/ebl/tests/atf_importer/test_atf_preprocessor.py index c16fb0114..3acf5046c 100644 --- a/ebl/tests/atf_importer/test_atf_preprocessor.py +++ b/ebl/tests/atf_importer/test_atf_preprocessor.py @@ -5,19 +5,22 @@ # ToDo: All transformers should be tested TRANSLATION_LEGACY = """ -@obverse +@right? +@column 1. a-na -2. a-bi-ya +2. a-bí-ya @translation en labelled -@label(o 1-o 2) -To my father +@label(r.e.? i 1-r.e.? i 2) +To my +father """ TRANSLATION_EXPECTED = """ -@obverse +@right? +@column 1 1. a-na -#tr.en.(o 2): To my father -2. a-bi-ya +#tr.en.(r.e.? i 2): To my father +2. a-bi2-ya """ PARSE_AND_TRANSFORM_LEGACY = [ @@ -76,8 +79,7 @@ ( "14. [...] x (x) še-e-hu $BAD $É $ME : ina GAŠAN-ia₅ {d}SUEN {" "d}INANA--<É>.AN.NA", - "14. [...] x (x) še-e-hu BAD E₂ ME : ina GAŠAN-ia₅ {d}SUEN {" - "d}INANA-.AN.NA", + "14. [...] x (x) še-e-hu BAD E₂ ME : ina GAŠAN-ia₅ {d}SUEN {d}INANA-.AN.NA", ), ] @@ -104,9 +106,9 @@ def test_legacy_translation(): legacy_tree = atf_preprocessor.convert_lines_from_string(TRANSLATION_LEGACY) expected_tree = atf_preprocessor.convert_lines_from_string(TRANSLATION_EXPECTED) # ToDo: Clean up - #print("RESULT:\n", legacy_tree) # .pretty()) - #print("EXPECTED:\n", expected_tree) # .pretty()) - #input() # <- With `task test`: "OSError: pytest: reading from stdin while output is captured!" + # print("RESULT:\n", legacy_tree) # .pretty()) + # print("EXPECTED:\n", expected_tree) # .pretty()) + # input() # <- With `task test`: "OSError: pytest: reading from stdin while output is captured!" 
assert legacy_tree == expected_tree @@ -126,9 +128,9 @@ def test_text_lines(legacy_line, ebl_line): legacy_tree = atf_preprocessor.transform_legacy_atf(legacy_tree) expected_tree = atf_preprocessor.ebl_parser.parse(ebl_line) # ToDo: Clean up - #print("RESULT:\n", legacy_tree) # .pretty()) - #print("EXPECTED:\n", expected_tree) # .pretty()) - #input() # <- With `task test`: "OSError: pytest: reading from stdin while output is captured!" + # print("RESULT:\n", legacy_tree) # .pretty()) + # print("EXPECTED:\n", expected_tree) # .pretty()) + # input() # <- With `task test`: "OSError: pytest: reading from stdin while output is captured!" assert legacy_tree == expected_tree diff --git a/ebl/transliteration/domain/atf_parsers/lark_parser/ebl_atf.lark b/ebl/transliteration/domain/atf_parsers/lark_parser/ebl_atf.lark index 5a029d8d3..d5a706400 100644 --- a/ebl/transliteration/domain/atf_parsers/lark_parser/ebl_atf.lark +++ b/ebl/transliteration/domain/atf_parsers/lark_parser/ebl_atf.lark @@ -1,8 +1,7 @@ %import .ebl_atf_text_line (text_line, any_word) %import .ebl_atf_parallel_line (parallel_line) %import .ebl_atf_manuscript_line (manuscript_line, paratext, siglum) -%import .ebl_atf_translation_line (translation_line) -%import .ebl_atf_translation_line (legacy_translation_line) +%import .ebl_atf_translation_line (translation_line, legacy_translation_line) %import .ebl_atf_note_line (note_line, markup) %import .ebl_atf_dollar_line (dollar_line) %import .ebl_atf_at_line (at_line) diff --git a/ebl/transliteration/domain/atf_parsers/lark_parser/ebl_atf_at_line.lark b/ebl/transliteration/domain/atf_parsers/lark_parser/ebl_atf_at_line.lark index a41f65467..444e03650 100644 --- a/ebl/transliteration/domain/atf_parsers/lark_parser/ebl_atf_at_line.lark +++ b/ebl/transliteration/domain/atf_parsers/lark_parser/ebl_atf_at_line.lark @@ -1,8 +1,5 @@ %import common.INT -%import .ebl_atf_common (status) -%import .ebl_atf_common (object, OBJECT, generic_object, fragment) -%import 
.ebl_atf_common (surface, SURFACE, generic_surface, face, edge) -%import .ebl_atf_common (seal, free_text) +%import .ebl_atf_common (status, object, surface, seal, free_text) %import .ebl_atf_note_line(_markup) ?at_line: "@" at_line_value diff --git a/ebl/transliteration/domain/atf_parsers/lark_parser/ebl_atf_dollar_line.lark b/ebl/transliteration/domain/atf_parsers/lark_parser/ebl_atf_dollar_line.lark index d21ae7a53..04c245437 100644 --- a/ebl/transliteration/domain/atf_parsers/lark_parser/ebl_atf_dollar_line.lark +++ b/ebl/transliteration/domain/atf_parsers/lark_parser/ebl_atf_dollar_line.lark @@ -2,8 +2,8 @@ %import common.INT -> INT %import common.LCASE_LETTER -> LCASE_LETTER %import .ebl_atf_common (free_text) -%import .ebl_atf_common (object, OBJECT, generic_object, fragment) -%import .ebl_atf_common (surface, SURFACE, generic_surface, face, edge) +%import .ebl_atf_common (object, fragment) +%import .ebl_atf_common (surface, edge) %import .ebl_atf_common (seal) %import .legacy_atf (legacy_single_ruling)