Skip to content

Commit

Permalink
Implement translation injection
Browse files Browse the repository at this point in the history
  • Loading branch information
khoidt committed Feb 21, 2025
1 parent cedfb72 commit 69aa11f
Show file tree
Hide file tree
Showing 8 changed files with 173 additions and 82 deletions.
57 changes: 57 additions & 0 deletions ebl/atf_importer/domain/atf_indexing_visitor.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
import roman
from lark.visitors import Visitor, Tree

# Maps a full surface name to the abbreviated surface label that
# IndexingVisitor stores in its cursor (looked up in
# IndexingVisitor.ebl_atf_at_line__surface_with_status).
surface_mapping = {
    "obverse": "o",
    "reverse": "r",
    "bottom": "b.e.",
    "edge": "e.",
    "left": "l.e.",
    "right": "r.e.",
    "top": "t.e.",
}


class IndexingVisitor(Visitor):
    """Keep track of the surface, column and line number of visited lines.

    Each handler updates one entry of ``self.cursor``; ``cursor_index``
    renders the current position as a single string such as ``"o i 1"``.
    """

    def __init__(self):
        super().__init__()
        self._reset()

    def _reset(self) -> None:
        """Restore the initial state: counter at 1 and an empty cursor."""
        # Running number handed out to legacy columns, which are counted
        # instead of read from the tree (see ebl_atf_at_line__legacy_column).
        self.column_counter = 1
        self.cursor = {"surface": None, "column": None, "line": None}

    def ebl_atf_at_line__surface_with_status(self, tree: Tree) -> Tree:
        """Store the abbreviated surface followed by the second child's tokens.

        The first child is looked up in ``surface_mapping`` (an unknown name
        raises ``KeyError``); the children of the second child - presumably
        the status flags such as ``?`` - are appended verbatim.
        """
        surface = surface_mapping[str(tree.children[0])] + "".join(
            str(child) for child in tree.children[1].children
        )
        self.cursor["surface"] = surface
        return tree

    def ebl_atf_at_line__legacy_column(self, tree: Tree) -> Tree:
        """Store the next running column number as a lowercase Roman numeral."""
        self.cursor["column"] = roman.toRoman(self.column_counter).lower()
        self.column_counter += 1
        return tree

    def ebl_atf_at_line__column(self, tree: Tree) -> Tree:
        """Store the explicit column number as a lowercase Roman numeral.

        The children of the second child are appended verbatim, exactly as in
        ``ebl_atf_at_line__surface_with_status``.
        """
        # roman.toRoman only accepts integers, while the child is a
        # string-like token, so it has to be converted first.
        column_number = int(str(tree.children[0]))
        self.cursor["column"] = roman.toRoman(column_number).lower() + "".join(
            str(child) for child in tree.children[1].children
        )
        return tree

    def text_line(self, tree: Tree) -> Tree:
        """Store the line number built from the first child's non-empty children."""
        line_number = "".join(
            str(child) for child in tree.children[0].children if child
        )
        self.cursor["line"] = line_number
        return tree

    @property
    def cursor_index(self) -> str:
        """Return the set cursor parts (surface, column, line) joined by spaces."""
        return " ".join(
            [
                str(self.cursor[key])
                for key in ["surface", "column", "line"]
                if self.cursor[key]
            ]
        )
69 changes: 64 additions & 5 deletions ebl/atf_importer/domain/atf_preprocessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,20 @@
from typing import Tuple, Optional, List, Dict, Any
from ebl.atf_importer.domain.atf_preprocessor_base import AtfPreprocessorBase
from ebl.atf_importer.domain.atf_preprocessor_util import Util
from ebl.atf_importer.domain.legacy_atf_visitor import LegacyAtfVisitor
#from ebl.transliteration.domain.line_transformer import LineTransformer
from ebl.atf_importer.domain.legacy_atf_visitor import (
LegacyAtfVisitor,
translation_block_transformer,
)

# from ebl.transliteration.domain.line_transformer import LineTransformer
from ebl.atf_importer.domain.atf_indexing_visitor import IndexingVisitor


class AtfPreprocessor(AtfPreprocessorBase):
indexing_visitor = IndexingVisitor()
legacy_visitor = LegacyAtfVisitor()
#line_transformer = LineTransformer()
translation_block_transformer = translation_block_transformer[0]
# line_transformer = LineTransformer()

def convert_lines_from_string(self, text: str) -> List[Dict[str, Any]]:
    """Convert ATF text that is given as one string.

    The text is split at every line feed character and the resulting list
    of lines is passed to ``_convert_lines``, whose result is returned.
    """
    raw_lines = text.split("\n")
    return self._convert_lines(raw_lines)
Expand All @@ -19,11 +26,64 @@ def convert_lines_from_path(self, path: str, filename: str) -> List[Dict[str, An
lines = self.read_lines_from_path(path)
return self._convert_lines(lines)

def _parse_and_index_lines(self, lines: List[str]) -> List[Dict[str, Any]]:
    """Parse all lines, index them and put legacy translations in place.

    Every line is parsed with ``self.ebl_parser`` and visited by the
    indexing visitor and the legacy visitor. Lines whose tree name contains
    ``"translation_line"`` are handed to ``_handle_legacy_translation``;
    every other line is appended as ``{"line": tree, "cursor": cursor}``,
    where ``cursor`` is the current cursor index for text lines and ``None``
    otherwise.

    Returns:
        The collected entries in their final order. Earlier the list was
        built and then dropped; returning it makes the result usable.
    """
    # ToDo: Parsing should happen here and ONLY here.
    # ToDo: Translation injection seems to work but is not properly tested
    # yet; the surrounding logic still has to consume this result.
    # ToDo: Add column logic to detect the end of a text and reset the
    # legacy column transformer.
    line_trees: List[Dict[str, Any]] = []
    for line in lines:
        line_tree = self.ebl_parser.parse(line)
        self.indexing_visitor.visit(line_tree)
        self.legacy_visitor.visit(line_tree)
        # Only text lines carry a position that translations can refer to.
        cursor = (
            self.indexing_visitor.cursor_index
            if line_tree.data == "text_line"
            else None
        )
        if "translation_line" in line_tree.data:
            line_trees = self._handle_legacy_translation(line_tree, line_trees)
        else:
            line_trees.append({"line": line_tree, "cursor": cursor})
    return line_trees

def _handle_legacy_translation(
    self, translation_line: Tree, line_trees: List[Dict[str, Any]]
) -> List[Dict[str, Any]]:
    """Insert a finished translation line into the collected lines.

    Only trees named exactly ``"translation_line"`` are inserted, at the
    position stored in ``self.translation_block_transformer.start``. For any
    other tree ``line_trees`` is returned as it was passed in.
    """
    if translation_line.data != "translation_line":
        return line_trees
    start_cursor = self.translation_block_transformer.start
    return self._insert_translation_line(translation_line, start_cursor, line_trees)

def _insert_translation_line(
    self, translation_line: Tree, insert_at: str, line_trees: List[Dict[str, Any]]
) -> List[Dict[str, Any]]:
    """Place ``translation_line`` directly after the line at ``insert_at``.

    The first entry whose ``"cursor"`` equals ``insert_at`` is used. If the
    entry after it already holds a ``translation_line`` tree, that entry is
    replaced; otherwise a new entry is inserted. Without a matching cursor
    the list is left unchanged.

    Returns:
        The same ``line_trees`` list, modified in place.
    """
    entry = {"cursor": None, "line": translation_line}
    for index, tree_line in enumerate(line_trees):
        if insert_at != tree_line["cursor"]:
            continue
        following = index + 1
        # Strictly less than: when the match is the last entry there is no
        # following entry to inspect, so the translation is appended.
        if (
            following < len(line_trees)
            and line_trees[following]["line"].data == "translation_line"
        ):
            line_trees[following] = entry
        else:
            line_trees.insert(following, entry)
        break
    return line_trees

def _convert_lines(self, lines: List[str]) -> List[Dict[str, Any]]:
self._parse_and_index_lines(lines) # ToDo: Implement further logic
processed_lines = []
for line in lines:
result = self.process_line(line)
#c_line = self.line_transformer.transform(result[0])
# c_line = self.line_transformer.transform(result[0])
processed_lines.append(
{
"c_line": result[0],
Expand Down Expand Up @@ -74,7 +134,6 @@ def check_original_line(

def transform_legacy_atf(self, tree: Tree) -> Tree:
    """Run the legacy visitor over ``tree`` and return the same tree.

    An info message is logged when the visitor reports that legacy
    syntax was found.
    """
    visitor = self.legacy_visitor
    visitor.visit(tree)
    if not visitor.legacy_found:
        return tree
    self.logger.info("Legacy line successfully parsed")
    return tree
Expand Down
73 changes: 23 additions & 50 deletions ebl/atf_importer/domain/legacy_atf_transformers.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from typing import Optional, List, Sequence, Union, Type
from lark.visitors import Transformer, Tree, Token, v_args, Discard
from ebl.transliteration.domain.atf import _SUB_SCRIPT
from ebl.transliteration.domain.common_transformer import CommonTransformer

# ToDo: Continue from here
# Make sure every transformer is implemented and works properly.
Expand Down Expand Up @@ -31,46 +32,6 @@ def transform(self, tree: Tree) -> Tree:
result = super().transform(tree)
return result if result else tree

def _transform_children(self, children: Sequence[Tree]):
    """Yield the result for every child, skipping discarded ones.

    The child's index is pushed onto the current path while the child is
    processed and popped again afterwards.
    """
    for position, node in enumerate(children):
        self._enter_node(position)
        outcome = self._get_child_result(node)
        self._exit_node()
        if outcome is Discard:
            continue
        yield outcome

def _get_child_result(self, child: Tree) -> Tree:
    """Return the transformed child, or the child itself if left untouched.

    A child below one of the ``break_at`` classes is returned unchanged.
    Otherwise trees are transformed recursively, tokens go through the
    token callback when token visiting is enabled, and anything else is
    returned as is.
    """
    if self.is_classes_break_at(self.get_ancestors()):
        return child
    if isinstance(child, Tree):
        return self._transform_tree(child)
    visit_as_token = self.__visit_tokens__ and isinstance(child, Token)
    return self._call_userfunc_token(child) if visit_as_token else child

def _enter_node(self, index: int = 0) -> None:
    """Push the index of the child that is about to be processed."""
    path = self.current_path
    path.append(index)

def _exit_node(self) -> None:
    """Pop the most recent index from the path; do nothing if it is empty."""
    if not self.current_path:
        return
    self.current_path.pop()

def get_ancestors(self) -> Sequence:
    """Return the node names from the current tree down to the current parent.

    The walk follows ``self.current_path`` without its last index, so the
    node currently being processed is not included. An empty list is
    returned when no current tree is set.
    """
    node = self.current_tree
    if not node:
        return []
    lineage = [node.data]
    for child_index in self.current_path[:-1]:
        node = node.children[child_index]
        lineage.append(node.data)
    return lineage

def is_classes_break_at(self, node_classes: Sequence[str]) -> bool:
    """Tell whether any of ``node_classes`` is listed in ``self.break_at``."""
    shared = set(node_classes).intersection(self.break_at)
    return bool(shared)

def to_token(self, name: str, string: Optional[str]) -> Token:
return (
Token(f"{self.prefix}__{name}", string)
Expand Down Expand Up @@ -245,6 +206,10 @@ def ebl_atf_text_line__VALUE_CHARACTER(self, token: Token) -> Token:
class LegacyColumnTransformer(LegacyTransformer):
prefix = ""

# ToDo:
# Add indexing to detect the beginning of the text.
# Then reset the column number when a new text begins.

def __init__(self, **kwargs) -> None:
super().__init__(**kwargs)
self.column_number = 1
Expand Down Expand Up @@ -285,17 +250,20 @@ def __init__(self, **kwargs) -> None:

def _reset(self) -> None:
    """Clear everything collected for the current translation block."""
    self.language: Optional[Token] = None
    # Start position as a label string (set from _labels_to_string); it was
    # previously also declared as a sequence of trees, which was immediately
    # overwritten and contradicted this annotation.
    self.start: Optional[str] = None
    self.extent: Optional[Sequence[Tree]] = None
    # A list, because collected translation text is appended to it.
    self.translation: List[str] = []

@property
def translation_c_line(self) -> Sequence[Union[Tree, Token]]:
return [
self.language,
self._translation_extent,
self._translation_string_part,
]
return self.to_tree(
"translation_line",
[
self.language,
self._translation_extent,
self._translation_string_part,
],
)

@property
def _translation_extent(self) -> Tree:
Expand Down Expand Up @@ -325,7 +293,7 @@ def ebl_atf_translation_line__legacy_translation_block_at_line(
@v_args(inline=True)
def ebl_atf_translation_line__labels_start(self, labels: Tree) -> None:
    """Remember where the translation starts, as a label string.

    Marks the transformer as having found legacy syntax and stores the
    string produced by ``_labels_to_string`` in ``self.start``.
    """
    self.legacy_found = True
    # Only the string form is stored; an earlier assignment of the raw
    # label children was a dead store that this line overwrote.
    self.start = self._labels_to_string(labels)
    return

@v_args(inline=True)
Expand All @@ -334,13 +302,18 @@ def ebl_atf_translation_line__labels_extent(self, labels: Tree) -> None:
self.extent = labels.children
return

# def _extract_labels(self, labels: Tree) -> Sequence[Tree]:
# return labels.children[0].children + [labels.children[1]]
def _labels_to_string(self, labels: Tree) -> str:
    """Render a labels tree as a space-separated position string.

    The tree is transformed with ``CommonTransformer``; the result is
    expected to have two children: the label objects and the line number.
    All label values and the line number are joined with single spaces.
    Joining them in one step avoids a leading space when there are no
    labels, so the result can be compared with a cursor index that
    consists of a line number only.
    """
    parsed_labels, line_number = CommonTransformer().transform(labels).children
    parts = [label.to_value() for label in parsed_labels]
    parts.append(str(line_number.number))
    return " ".join(parts)

@v_args(inline=True)
def ebl_atf_translation_line__legacy_translation_block_line(
self, text: Tree
) -> None:
self.legacy_found = True
self.translation.append(str(text.children[0]))
return
return self.translation_c_line
12 changes: 8 additions & 4 deletions ebl/atf_importer/domain/legacy_atf_visitor.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@
prime_transformer = (LegacyPrimeTransformer(), "children")
aleph_transformer = (LegacyAlephTransformer(), "children")
column_transformer = (LegacyColumnTransformer(), "tree")
translation_block_transformer = (LegacyTranslationBlockTransformer(), "tree")
translation_block_transformer = (LegacyTranslationBlockTransformer(), "first_child")


class LegacyAtfVisitor(Visitor):
Expand Down Expand Up @@ -62,6 +62,7 @@ class LegacyAtfVisitor(Visitor):
"value_name_part": [aleph_transformer],
"at_line_value": [column_transformer],
"legacy_column": [column_transformer],
"text_line": [translation_block_transformer],
"legacy_translation_line": [translation_block_transformer],
}

Expand All @@ -72,8 +73,8 @@ def __init__(self):
prefix = self.text_line_prefix
if suffix in ["legacy_column"]:
prefix = self.at_line_prefix
elif "legacy_translation" in suffix:
prefix = "" #self.translation_line_prefix
elif suffix in ["legacy_translation_line", "text_line"]:
prefix = ""
self._set_rules(suffix, transformers, prefix)

def _set_rules(
Expand Down Expand Up @@ -108,7 +109,10 @@ def _transform(
transformed_tree = transformer.transform(tree)
if transformer.legacy_found:
self.legacy_found = True
if replace == "tree":
if replace == "first_child" and transformed_tree.children[0]:
tree.data = transformed_tree.children[0].data
tree.children = transformed_tree.children[0].children
elif replace == "tree":
tree.data = transformed_tree.data
tree.children = transformed_tree.children
elif replace == "children":
Expand Down
32 changes: 17 additions & 15 deletions ebl/tests/atf_importer/test_atf_preprocessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,19 +5,22 @@
# ToDo: All transformers should be tested

TRANSLATION_LEGACY = """
@obverse
@right?
@column
1. a-na
2. a-bi-ya
2. a--ya
@translation en labelled
@label(o 1-o 2)
To my father
@label(r.e.? i 1-r.e.? i 2)
To my
father
"""

TRANSLATION_EXPECTED = """
@obverse
@right?
@column 1
1. a-na
#tr.en.(o 2): To my father
2. a-bi-ya
#tr.en.(r.e.? i 2): To my father
2. a-bi2-ya
"""

PARSE_AND_TRANSFORM_LEGACY = [
Expand Down Expand Up @@ -76,8 +79,7 @@
(
"14. [...] x (x) še-e-hu $BAD $É $ME : ina GAŠAN-ia₅ {d}SUEN {"
"d}INANA--<É>.AN.NA",
"14. [...] x (x) še-e-hu BAD E₂ ME : ina GAŠAN-ia₅ {d}SUEN {"
"d}INANA-<E₂>.AN.NA",
"14. [...] x (x) še-e-hu BAD E₂ ME : ina GAŠAN-ia₅ {d}SUEN {d}INANA-<E₂>.AN.NA",
),
]

Expand All @@ -104,9 +106,9 @@ def test_legacy_translation():
legacy_tree = atf_preprocessor.convert_lines_from_string(TRANSLATION_LEGACY)
expected_tree = atf_preprocessor.convert_lines_from_string(TRANSLATION_EXPECTED)
# ToDo: Clean up
#print("RESULT:\n", legacy_tree) # .pretty())
#print("EXPECTED:\n", expected_tree) # .pretty())
#input() # <- With `task test`: "OSError: pytest: reading from stdin while output is captured!"
# print("RESULT:\n", legacy_tree) # .pretty())
# print("EXPECTED:\n", expected_tree) # .pretty())
# input() # <- With `task test`: "OSError: pytest: reading from stdin while output is captured!"

assert legacy_tree == expected_tree

Expand All @@ -126,9 +128,9 @@ def test_text_lines(legacy_line, ebl_line):
legacy_tree = atf_preprocessor.transform_legacy_atf(legacy_tree)
expected_tree = atf_preprocessor.ebl_parser.parse(ebl_line)
# ToDo: Clean up
#print("RESULT:\n", legacy_tree) # .pretty())
#print("EXPECTED:\n", expected_tree) # .pretty())
#input() # <- With `task test`: "OSError: pytest: reading from stdin while output is captured!"
# print("RESULT:\n", legacy_tree) # .pretty())
# print("EXPECTED:\n", expected_tree) # .pretty())
# input() # <- With `task test`: "OSError: pytest: reading from stdin while output is captured!"

assert legacy_tree == expected_tree

Expand Down
Original file line number Diff line number Diff line change
@@ -1,8 +1,7 @@
%import .ebl_atf_text_line (text_line, any_word)
%import .ebl_atf_parallel_line (parallel_line)
%import .ebl_atf_manuscript_line (manuscript_line, paratext, siglum)
%import .ebl_atf_translation_line (translation_line)
%import .ebl_atf_translation_line (legacy_translation_line)
%import .ebl_atf_translation_line (translation_line, legacy_translation_line)
%import .ebl_atf_note_line (note_line, markup)
%import .ebl_atf_dollar_line (dollar_line)
%import .ebl_atf_at_line (at_line)
Expand Down
Loading

0 comments on commit 69aa11f

Please sign in to comment.