Summary of this patch
---------------------
* ebl_atf.lark now imports `at_line` from ebl_atf_at_line.lark; the duplicate
  `at_line` rules are removed from ebl_atf_text_line.lark.
* ebl_atf_at_line.lark gains a `legacy_column` rule (a bare "column" with no
  number), uses the shared `status` rule, and requires a space before the
  heading markup.
* The label rules and the `status` rule/terminals move from
  ebl_atf_text_line.lark to ebl_atf_common.lark.
* signs_transformer.py gains `ebl_atf_text_line__modifier`, which joins its
  tokens into one string.
* Seven new (legacy, expected) pairs are added to PARSE_AND_TRANSFORM_LEGACY.

The hunk that un-commented the print()/input() debugging lines in
test_text_lines is deliberately NOT included: as the commented-out line itself
records, input() fails under `task test` with "OSError: pytest: reading from
stdin while output is captured!". As a consequence the post-image blob id on
the test file's `index` line below is stale.

NOTE(review): leading whitespace and blank-line placement of context/removed
lines were reconstructed -- confirm with
`git apply --check --ignore-whitespace` before applying.

diff --git a/ebl/tests/atf_importer/test_atf_preprocessor.py b/ebl/tests/atf_importer/test_atf_preprocessor.py
index 2edd17523..f01d45b0a 100644
--- a/ebl/tests/atf_importer/test_atf_preprocessor.py
+++ b/ebl/tests/atf_importer/test_atf_preprocessor.py
@@ -5,6 +5,13 @@
 # ToDo: All transformers should be tested
 
 PARSE_AND_TRANSFORM_LEGACY = [
+    ("", ""),
+    ("@column", "@column 1"),
+    ("@column", "@column 2"),
+    ("@face a", "@face a"),
+    ("@obverse", "@obverse"),
+    ("@reverse", "@reverse"),
+    ("$ single ruling", "$ single ruling"),
     ("1. a'", "1. aʾ"),
     ("1′. A", "1'. A"),
     ("1’. A", "1'. A"),
diff --git a/ebl/transliteration/domain/atf_parsers/lark_parser/ebl_atf.lark b/ebl/transliteration/domain/atf_parsers/lark_parser/ebl_atf.lark
index 595dc2b6d..eb9bf7b9b 100644
--- a/ebl/transliteration/domain/atf_parsers/lark_parser/ebl_atf.lark
+++ b/ebl/transliteration/domain/atf_parsers/lark_parser/ebl_atf.lark
@@ -4,7 +4,7 @@
 %import .ebl_atf_abbreviations (PROVENANCE, PERIOD, TYPE)
 %import .ebl_atf_text_line (text_line, any_word, note_line, parallel_line, translation_line, labels, manuscript_line, markup)
 %import .ebl_atf_dollar_line (dollar_line)
-%import .ebl_atf_text_line (at_line)
+%import .ebl_atf_at_line (at_line)
 %import .oracc_atf_lem_line (lem_line)
 
 ?start: line
diff --git a/ebl/transliteration/domain/atf_parsers/lark_parser/ebl_atf_at_line.lark b/ebl/transliteration/domain/atf_parsers/lark_parser/ebl_atf_at_line.lark
index cde79ee58..9cf07e11e 100644
--- a/ebl/transliteration/domain/atf_parsers/lark_parser/ebl_atf_at_line.lark
+++ b/ebl/transliteration/domain/atf_parsers/lark_parser/ebl_atf_at_line.lark
@@ -1,24 +1,24 @@
 %import .ebl_atf_common (INT, LCASE_LETTER)
-%import .ebl_atf_common (free_text)
+%import .ebl_atf_common (status)
 %import .ebl_atf_common (object, OBJECT, generic_object, fragment)
 %import .ebl_atf_common (surface, SURFACE, generic_surface, face, edge)
-%import .ebl_atf_common (seal, OBJECT, SURFACE, free_text, _markup)
+%import .ebl_atf_common (seal, free_text)
+%import .ebl_atf_text_line(_markup)
 
-?at_line: "@" value
+?at_line: "@" at_line_value
 
-?value: seal | column | heading | discourse | object_with_status | surface_with_status | divisions | composite
+?at_line_value: seal | column | legacy_column | heading | discourse | object_with_status
+              | surface_with_status | divisions | composite
 
-STATUS: "'" | "?" | "!" | "*"
+surface_with_status: surface " "? status
 
-statuses: STATUS*
+object_with_status: object " "? status
 
-surface_with_status : surface " "? statuses
+column: "column " INT " "? status
 
-object_with_status : object " "? statuses
+legacy_column : "column"
 
-column : "column " INT " "? statuses
-
-heading: "h" INT [ _markup ]
+heading: "h" INT [ " " _markup ]
 
 !discourse: "catchline" | "colophon" | "date" | "signature" | "signatures" | "summary" | "witnesses"
 
@@ -29,4 +29,3 @@ composite_start: "div " free_text [" " INT]
 composite_end: "end " free_text
 composite_composite: "composite"
 composite_milestone: "m=locator " free_text [" " INT]
-
diff --git a/ebl/transliteration/domain/atf_parsers/lark_parser/ebl_atf_common.lark b/ebl/transliteration/domain/atf_parsers/lark_parser/ebl_atf_common.lark
index 209ff1066..3caa9e266 100644
--- a/ebl/transliteration/domain/atf_parsers/lark_parser/ebl_atf_common.lark
+++ b/ebl/transliteration/domain/atf_parsers/lark_parser/ebl_atf_common.lark
@@ -46,4 +46,35 @@ AKKADIAN_ALPHABET: "ʾ" | "A" | "B" | "D" | "E" | "G" | "H" | "I" | "K" | "L"
                  | "c" | "d" | "e" | "f" | "g" | "h" | "i" | "k" | "l" | "m"
                  | "n" | "p" | "q" | "r" | "s" | "t" | "u" | "w" | "y" | "z"
                  | "É" | "â" | "ê" | "î" | "û" | "ā" | "Ē" | "ē" | "ī" | "Š"
-                 | "š" | "Ś" | "ś" | "ū" | "ṣ" | "ṭ" | "₄"
\ No newline at end of file
+                 | "š" | "Ś" | "ś" | "ū" | "ṣ" | "ṭ" | "₄"
+
+labels: surface_label " " column_label
+      | surface_label
+      | column_label
+
+object_label: (TABLET | ENVELOPE | PRIMS | BULLA) status
+TABLET: "tablet"
+ENVELOPE: "envelope"
+PRIMS: "prism"
+BULLA: "bulla"
+
+column_label: ROMAN_NUMERAL status
+ROMAN_NUMERAL: /[ivxlcdm]+/
+CAPITAL_ROMAN_NUMERAL: /[IVXLCDM]+/
+
+surface_label: (OBVERSE | REVERSE | BOTTOM | EDGE | LEFT | RIGHT | TOP) status
+OBVERSE: "o"
+REVERSE:"r"
+BOTTOM: "b.e."
+EDGE: "e."
+LEFT: "l.e."
+RIGHT: "r.e."
+TOP: "t.e."
+
+status: (PRIME | LEGACY_PRIME | UNCERTAIN | CORRECTION | COLLATION | NO_LONGER_VISIBLE)*
+PRIME: "'"
+LEGACY_PRIME: "′" | "’"
+UNCERTAIN: "?"
+CORRECTION: "!"
+COLLATION: "*"
+NO_LONGER_VISIBLE: "°"
\ No newline at end of file
diff --git a/ebl/transliteration/domain/atf_parsers/lark_parser/ebl_atf_text_line.lark b/ebl/transliteration/domain/atf_parsers/lark_parser/ebl_atf_text_line.lark
index 7c7b3be4c..577fc34cf 100644
--- a/ebl/transliteration/domain/atf_parsers/lark_parser/ebl_atf_text_line.lark
+++ b/ebl/transliteration/domain/atf_parsers/lark_parser/ebl_atf_text_line.lark
@@ -2,11 +2,13 @@
 %import common.LETTER
 %import common.LCASE_LETTER
 %import .ebl_atf_common (free_text)
-%import .ebl_atf_common (object, OBJECT, generic_object, fragment)
-%import .ebl_atf_common (surface, SURFACE, generic_surface, face, edge)
 %import .ebl_atf_common (seal)
-%import .ebl_atf_common (VALUE_CHARACTER, LOGOGRAM_CHARACTER)
 %import .ebl_atf_common (AKKADIAN_ALPHABET, GREEK_ALPHABET)
+%import .ebl_atf_common (VALUE_CHARACTER, LOGOGRAM_CHARACTER)
+%import .ebl_atf_common (labels)
+%import .ebl_atf_common (CAPITAL_ROMAN_NUMERAL)
+%import .ebl_atf_common (PRIME, LEGACY_PRIME, UNCERTAIN, CORRECTION, COLLATION, NO_LONGER_VISIBLE)
+%import .ebl_atf_common (object_label, surface_label, column_label)
 %import .ebl_atf_abbreviations (PERIOD)
 %import .legacy_atf (LEGACY_OPEN_HALF_BRACKET, LEGACY_CLOSE_HALF_BRACKET)
 %import .legacy_atf (LEGACY_ORACC_JOINER, LEGACY_ORACC_DISH_DIVIDER)
@@ -14,6 +16,7 @@
 
 note_line: "#note: " _markup
 markup: _markup
+
 _markup: (emphasis_part | language_part | bibliography_part | string_part | url_part)+
 language_part: "@" LANGUAGE "{" text "}"
 emphasis_part: "@i{" note_text "}"
@@ -328,59 +331,4 @@ greek_word: (greek_enclosure | greek_word_part)* greek_word_part (greek_enclosur
                | unclear_sign
                | unknown_number_of_signs
 ?greek_enclosure: _any_open | _any_close
-greek_letter: GREEK_ALPHABET flags
-
-labels: surface_label " " column_label
-      | surface_label
-      | column_label
-
-object_label: (TABLET | ENVELOPE | PRIMS | BULLA) status
-TABLET: "tablet"
-ENVELOPE: "envelope"
-PRIMS: "prism"
-BULLA: "bulla"
-
-column_label: ROMAN_NUMERAL status
-ROMAN_NUMERAL: /[ivxlcdm]+/
-CAPITAL_ROMAN_NUMERAL: /[IVXLCDM]+/
-
-surface_label: (OBVERSE | REVERSE | BOTTOM | EDGE | LEFT | RIGHT | TOP) status
-OBVERSE: "o"
-REVERSE:"r"
-BOTTOM: "b.e."
-EDGE: "e."
-LEFT: "l.e."
-RIGHT: "r.e."
-TOP: "t.e."
-
-
-?at_line: "@" at_line_value
-
-?at_line_value: seal | column | heading | discourse | object_with_status
-              | surface_with_status | divisions | composite
-
-surface_with_status: surface " "? status
-
-object_with_status: object " "? status
-
-column: "column " INT " "? status
-
-heading: "h" INT [ " " _markup ]
-
-!discourse: "catchline" | "colophon" | "date" | "signature" | "signatures" | "summary" | "witnesses"
-
-divisions: "m=division " free_text [" " INT]
-
-?composite: composite_composite | composite_start | composite_end | composite_milestone
-composite_start: "div " free_text [" " INT]
-composite_end: "end " free_text
-composite_composite: "composite"
-composite_milestone: "m=locator " free_text [" " INT]
-
-status: (PRIME | LEGACY_PRIME | UNCERTAIN | CORRECTION | COLLATION | NO_LONGER_VISIBLE)*
-PRIME: "'"
-LEGACY_PRIME: "′" | "’"
-UNCERTAIN: "?"
-CORRECTION: "!"
-COLLATION: "*"
-NO_LONGER_VISIBLE: "°"
\ No newline at end of file
+greek_letter: GREEK_ALPHABET flags
\ No newline at end of file
diff --git a/ebl/transliteration/domain/signs_transformer.py b/ebl/transliteration/domain/signs_transformer.py
index 2d6b62190..7f71c75c1 100644
--- a/ebl/transliteration/domain/signs_transformer.py
+++ b/ebl/transliteration/domain/signs_transformer.py
@@ -91,6 +91,9 @@ def ebl_atf_text_line__number_name_part(self, children):
     def ebl_atf_text_line__sub_index(self, sub_index=""):
         return sub_index_to_int(sub_index)
 
+    def ebl_atf_text_line__modifier(self, tokens):
+        return "".join(map(str, tokens))
+
     def ebl_atf_text_line__modifiers(self, tokens):
         return tuple(map(str, tokens))
 