Skip to content

Commit

Permalink
Clean up, update, add transformers & tests
Browse files Browse the repository at this point in the history
  • Loading branch information
khoidt committed Feb 26, 2025
1 parent f2d1030 commit c432363
Show file tree
Hide file tree
Showing 9 changed files with 103 additions and 99 deletions.
23 changes: 14 additions & 9 deletions ebl/atf_importer/domain/atf_preprocessor.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import traceback
import codecs
from lark.visitors import Tree
from typing import Tuple, Optional, List, Dict, Any
from ebl.atf_importer.domain.atf_preprocessor_base import AtfPreprocessorBase
Expand All @@ -8,8 +9,7 @@
translation_block_transformer,
column_transformer,
)

# from ebl.transliteration.domain.line_transformer import LineTransformer
from ebl.transliteration.domain.line_transformer import LineTransformer
from ebl.atf_importer.domain.atf_indexing_visitor import IndexingVisitor


Expand All @@ -18,14 +18,15 @@ class AtfPreprocessor(AtfPreprocessorBase):
legacy_visitor = LegacyAtfVisitor()
translation_block_transformer = translation_block_transformer[0]
column_transformer = column_transformer[0]
# line_transformer = LineTransformer()
line_transformer = LineTransformer()

def convert_lines_from_string(self, text: str) -> List[Dict[str, Any]]:
    """Convert ATF content supplied as a single string.

    The text is split on the newline character and the resulting list of
    lines is handed to ``_convert_lines``, whose result is returned
    unchanged.
    """
    atf_lines = text.split("\n")
    return self._convert_lines(atf_lines)

def convert_lines_from_path(self, path: str, filename: str) -> List[Dict[str, Any]]:
self.logger.info(Util.print_frame(f'Converting: "{filename}.atf"'))
lines = self.read_lines_from_path(path)
with codecs.open(path, "r", encoding="utf8") as f:
lines = f.read().split("\n")
return self._convert_lines(lines)

def _parse_lines(self, lines: List[str]) -> List[Dict[str, Any]]:
Expand Down Expand Up @@ -87,10 +88,12 @@ def _insert_translation_line(
return line_trees

def _convert_lines(self, lines: List[str]) -> List[Dict[str, Any]]:
self.translation_block_transformer.reset()
self.column_transformer.reset()
line_trees = self._parse_lines(lines) # ToDo: Implement further logic
processed_lines = [line["line"] for line in line_trees]
# processed_lines = []
processed_lines = [
self.line_transformer.transform(line["line"]) for line in line_trees
]
"""
for line in lines:
result = self.process_line(line)
Expand Down Expand Up @@ -136,31 +139,33 @@ def check_original_line(
atf = atf.replace("#", "#note:")
atf = atf.replace("# note:", "#note:")
tree = self.ebl_parser.parse(atf)
tree = self.transform_legacy_atf(tree)
# tree = self.transform_legacy_atf(tree)
self.logger.info("Line successfully parsed")
self.logger.debug(f"Parsed line as {tree.data}")
self.logger.info(
"----------------------------------------------------------------------"
)
return self.get_line_tree_data(tree)

"""
def transform_legacy_atf(self, tree: Tree) -> Tree:
self.legacy_visitor.visit(tree)
if self.legacy_visitor.legacy_found:
self.logger.info("Legacy line successfully parsed")
return tree
"""

def parse_and_convert_line(
self, atf: str
) -> Tuple[Optional[str], Optional[List[Any]], Optional[str], Optional[List[Any]]]:
result = (None, None, None, None)
try:
tree = self.ebl_parser.parse(atf)
#if tree.data in self.unused_lines:
# if tree.data in self.unused_lines:
# result = self.get_empty_conversion(tree)
# ToDo: Check original
# return tree
if tree.data == "lem_line": # elif ...
if tree.data == "lem_line": # elif ...
result = self.convert_lem_line(atf, tree)
else:
result = self.get_line_tree_data(tree)
Expand Down
61 changes: 1 addition & 60 deletions ebl/atf_importer/domain/atf_preprocessor_base.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
import codecs
import logging
import re
from typing import Tuple, Optional, List, Any
Expand All @@ -8,31 +7,12 @@
GetWords,
)

opening_half_bracket = {"⌈", "⸢"}
closing_half_bracket = {"⌉", "⸣"}


# ToDo:
# Functionality should be mainly transferred
# to `transformers`.
# Extract oracc_atf_lem_line parser,
# use within ebl_atf parser or separately.

"""
unused_lines = {
"oracc_atf_at_line__object_with_status",
"oracc_atf_at_line__surface_with_status",
"oracc_atf_at_line__discourse",
"oracc_atf_at_line__column",
"oracc_atf_at_line__seal",
"dollar_line",
"note_line",
"control_line",
"empty_line",
"translation_line",
}
"""

special_chars = {
"sz": "š",
"c": "š",
Expand Down Expand Up @@ -85,7 +65,7 @@
"5/6": "⅚",
"\t": " ",
"$ rest broken": "$ rest of side broken",
"$ ruling": "$ single ruling",
#"$ ruling": "$ single ruling",
"]x": "] x",
"x[": "x [",
"]⸢x": "] ⸢x",
Expand All @@ -111,7 +91,6 @@ def __init__(self, logdir: str, style: int) -> None:
self.logger = logging.getLogger("Atf-Preprocessor")
self.logger.setLevel(logging.DEBUG)
self.skip_next_lem_line = False
#self.unused_lines = unused_lines
self.logdir = logdir
self.style = style
self.open_found = False
Expand Down Expand Up @@ -177,17 +156,12 @@ def serizalize_lemmas_and_guidewords(self, tree: Tree) -> List[Any]:
lemmas_and_guidewords_serializer.visit(tree)
return lemmas_and_guidewords_serializer.result

def read_lines_from_path(self, file: str) -> List[str]:
with codecs.open(file, "r", encoding="utf8") as f:
return f.read().split("\n")

def _handle_text_line(self, atf: str) -> str:
atf_text_line_methods = [
"_replace_dashes",
"replace_special_characters",
"_normalize_patterns",
"_replace_primed_digits",
"_process_bracketed_parts",
"_uppercase_underscore",
"_lowercase_braces",
"_replace_dollar_signs",
Expand Down Expand Up @@ -234,40 +208,7 @@ def _replace_tabs_and_excessive_whitespaces(self, atf: str) -> str:
return atf.replace("[\t ]*", " ")

def _handle_dollar_line(self, atf: str) -> str:
special_marks = {
"$ rest broken": "$ rest of side broken",
"$ ruling": "$ single ruling",
}
if atf in special_marks.keys():
return special_marks[atf]
if atf.startswith("$ "):
dollar_comment = atf.split("$ ")[1]
return f"$ ({dollar_comment})"
return atf

def _process_bracketed_parts(self, atf: str) -> str:
self.open_found = False
split = re.split(r"([⌈⌉⸢⸣])", atf)
# ToDo: Remove:
if len(split) > 1 and atf.startswith("9. ⸢4(BÁN)?⸣"):
# ToDo: Continue from here.
# Problem with `4(BÁN)#?`, which is not in the lark grammar
# The issue is probably with SIGN - ? - #
print("! split", split)
print(
"! norm", "".join(self._process_bracketed_part(part) for part in split)
)
input("press enter")
return (
"".join(self._process_bracketed_part(part) for part in split)
if len(split) > 1
else atf
)

def _process_bracketed_part(self, part: str) -> str:
if part in opening_half_bracket.union(closing_half_bracket):
self.open_found = part in opening_half_bracket
return ""
if not self.open_found:
return part
return re.sub(r"([-.\s])", r"#\1", part) + "#"
45 changes: 35 additions & 10 deletions ebl/atf_importer/domain/legacy_atf_transformers.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,38 @@ def to_tree(
)


class LegacyStateTransformer(LegacyTransformer):
    """Rewrite the legacy "broken state" dollar-line rule.

    A matched ``legacy_broken_state`` node is replaced by a
    ``state_extent`` tree carrying the tokens "rest of" / "side" /
    "broken".
    """

    prefix = "ebl_atf_dollar_line"

    @v_args(inline=True)
    def ebl_atf_dollar_line__legacy_broken_state(self) -> Tree:
        """Return the fixed ``state_extent`` tree and flag a legacy match."""
        self.legacy_found = True
        children = [
            self.to_token("EXTENT", "rest of"),
            self.to_token("SCOPE", "side"),
            self.to_token("STATE", "broken"),
            # NOTE(review): trailing slot is left empty; presumably the
            # optional status position of `state_extent` - confirm
            # against the grammar.
            None,
        ]
        return self.to_tree("state_extent", children)


class LegacyRulingTransformer(LegacyTransformer):
    """Rewrite the legacy single-ruling dollar-line rule.

    A matched ``legacy_single_ruling`` node is replaced by a ``ruling``
    tree whose ruling number is the token "single".
    """

    prefix = "ebl_atf_dollar_line"

    @v_args(inline=True)
    def ebl_atf_dollar_line__legacy_single_ruling(
        self, status: Optional[Token] = None
    ) -> Tree:
        """Build the ``ruling`` tree for a legacy ruling line.

        A truthy ``status`` is re-emitted as a ``DOLLAR_STATUS`` token built
        from its string value; a falsy one is passed through untouched.
        """
        self.legacy_found = True
        status_child = (
            self.to_token("DOLLAR_STATUS", str(status)) if status else status
        )
        ruling_number = self.to_token("RULING_NUMBER", "single")
        return self.to_tree("ruling", [ruling_number, status_child])


class HalfBracketsTransformer(LegacyTransformer):
def __init__(self, **kwargs):
super().__init__(**kwargs)
Expand Down Expand Up @@ -173,9 +205,6 @@ def __init__(self, **kwargs):
super().__init__(**kwargs)
self.sub_index = None
self.break_at = ["ebl_atf_text_line__surrogate_text"]
# ToDo: Continue from here.
# `break_at` does not work.
# Check the older, remove code (and implement it again?)

def clear(self):
super().clear()
Expand Down Expand Up @@ -258,10 +287,6 @@ def ebl_atf_text_line__VALUE_CHARACTER(self, token: Token) -> Token:
class LegacyColumnTransformer(LegacyTransformer):
prefix = ""

# ToDo:
# Add indexing to detect the beginning of the text.
# Then reset the column number when a new text begins.

def __init__(self, **kwargs) -> None:
super().__init__(**kwargs)
self.reset()
Expand All @@ -288,9 +313,9 @@ class LegacyTranslationBlockTransformer(LegacyTransformer):

def __init__(self, **kwargs) -> None:
super().__init__(**kwargs)
self._reset()
self.reset()

def _reset(self) -> None:
def reset(self) -> None:
self.language: Optional[Token] = None
self.start: Optional[str] = None
self.extent: Optional[Sequence[Tree]] = None
Expand Down Expand Up @@ -327,7 +352,7 @@ def _translation_string_part(self) -> Tree:
def ebl_atf_translation_line__legacy_translation_block_at_line(
self, language: str
) -> None:
self._reset()
self.reset()
self.legacy_found = True
self.language = language
return
Expand Down
11 changes: 10 additions & 1 deletion ebl/atf_importer/domain/legacy_atf_visitor.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from typing import Sequence, Tuple, Callable
from lark.visitors import Visitor, Tree
from ebl.atf_importer.domain.legacy_atf_transformers import (
LegacyTransformer,
AccentedIndexTransformer,
HalfBracketsTransformer,
OraccJoinerTransformer,
Expand All @@ -11,8 +12,9 @@
LegacyAlephTransformer,
LegacyColumnTransformer,
LegacyTranslationBlockTransformer,
LegacyRulingTransformer,
LegacyStateTransformer,
)
from ebl.atf_importer.domain.legacy_atf_transformers import LegacyTransformer

# ToDo: Continue from here
# Make sure every transformer is implemented and works properly.
Expand All @@ -33,11 +35,14 @@
aleph_transformer = (LegacyAlephTransformer(), "children")
column_transformer = (LegacyColumnTransformer(), "tree")
translation_block_transformer = (LegacyTranslationBlockTransformer(), "first_child")
ruling_transformer = (LegacyRulingTransformer(), "first_child")
state_transformer = (LegacyStateTransformer(), "first_child")


class LegacyAtfVisitor(Visitor):
text_line_prefix = "ebl_atf_text_line"
at_line_prefix = "ebl_atf_at_line"
dollar_line_prefix = "ebl_atf_dollar_line"

nodes_to_visit = {
"number": [oracc_modifier_prefix_transformer],
Expand All @@ -58,6 +63,8 @@ class LegacyAtfVisitor(Visitor):
],
"text": [half_brackets_transformer],
"status": [column_transformer],
"ruling": [ruling_transformer],
"state": [state_transformer],
"ebl_atf_common__single_line_number": [prime_transformer],
"value_name_part": [aleph_transformer],
"at_line_value": [column_transformer],
Expand All @@ -73,6 +80,8 @@ def __init__(self):
prefix = self.text_line_prefix
if suffix in ["legacy_column"]:
prefix = self.at_line_prefix
elif suffix in ["ruling", "state"]:
prefix = self.dollar_line_prefix
elif suffix in ["legacy_translation_line", "text_line"]:
prefix = ""
self._set_rules(suffix, transformers, prefix)
Expand Down
Loading

0 comments on commit c432363

Please sign in to comment.