Skip to content

Commit

Permalink
Refactor Lark grammar (correct the at-line rules and the overall structure)
Browse files Browse the repository at this point in the history
  • Loading branch information
khoidt committed Nov 15, 2024
1 parent 7e20341 commit f1e3729
Show file tree
Hide file tree
Showing 6 changed files with 64 additions and 76 deletions.
13 changes: 10 additions & 3 deletions ebl/tests/atf_importer/test_atf_preprocessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,13 @@
# ToDo: All transformers should be tested

PARSE_AND_TRANSFORM_LEGACY = [
("", ""),
("@column", "@column 1"),
("@column", "@column 2"),
("@face a", "@face a"),
("@obverse", "@obverse"),
("@reverse", "@reverse"),
("$ single ruling", "$ single ruling"),
("1. a'", "1. aʾ"),
("1′. A", "1'. A"),
("1’. A", "1'. A"),
Expand Down Expand Up @@ -89,9 +96,9 @@ def test_text_lines(legacy_line, ebl_line):
legacy_tree = atf_preprocessor.ebl_parser.parse(legacy_line)
legacy_tree = atf_preprocessor.transform_legacy_atf(legacy_tree)
expected_tree = atf_preprocessor.ebl_parser.parse(ebl_line)
# Print both trees so that pytest's captured output shows them when the
# assertion below fails. Do NOT call input() here: pytest replaces stdin while
# output is captured, so reading from it raises
# "OSError: pytest: reading from stdin while output is captured!" and every
# parametrized case would fail before the assertion is reached.
print("RESULT:\n", legacy_tree)
print("EXPECTED:\n", expected_tree)

assert legacy_tree == expected_tree

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
%import .ebl_atf_abbreviations (PROVENANCE, PERIOD, TYPE)
%import .ebl_atf_text_line (text_line, any_word, note_line, parallel_line, translation_line, labels, manuscript_line, markup)
%import .ebl_atf_dollar_line (dollar_line)
%import .ebl_atf_text_line (at_line)
%import .ebl_atf_at_line (at_line)
%import .oracc_atf_lem_line (lem_line)

?start: line
Expand Down
Original file line number Diff line number Diff line change
@@ -1,24 +1,24 @@
%import .ebl_atf_common (INT, LCASE_LETTER)
%import .ebl_atf_common (free_text)
%import .ebl_atf_common (status)
%import .ebl_atf_common (object, OBJECT, generic_object, fragment)
%import .ebl_atf_common (surface, SURFACE, generic_surface, face, edge)
%import .ebl_atf_common (seal, OBJECT, SURFACE, free_text, _markup)
%import .ebl_atf_common (seal, free_text)
%import .ebl_atf_text_line(_markup)

?at_line: "@" value
?at_line: "@" at_line_value

?value: seal | column | heading | discourse | object_with_status | surface_with_status | divisions | composite
?at_line_value: seal | column | legacy_column | heading | discourse | object_with_status
| surface_with_status | divisions | composite

STATUS: "'" | "?" | "!" | "*"
surface_with_status: surface " "? status

statuses: STATUS*
object_with_status: object " "? status

surface_with_status : surface " "? statuses
column: "column " INT " "? status

object_with_status : object " "? statuses
legacy_column : "column"

column : "column " INT " "? statuses

heading: "h" INT [ _markup ]
heading: "h" INT [ " " _markup ]

!discourse: "catchline" | "colophon" | "date" | "signature" | "signatures" | "summary" | "witnesses"

Expand All @@ -29,4 +29,3 @@ composite_start: "div " free_text [" " INT]
composite_end: "end " free_text
composite_composite: "composite"
composite_milestone: "m=locator " free_text [" " INT]

Original file line number Diff line number Diff line change
Expand Up @@ -46,4 +46,35 @@ AKKADIAN_ALPHABET: "ʾ" | "A" | "B" | "D" | "E" | "G" | "H" | "I" | "K" | "L"
| "c" | "d" | "e" | "f" | "g" | "h" | "i" | "k" | "l" | "m"
| "n" | "p" | "q" | "r" | "s" | "t" | "u" | "w" | "y" | "z"
| "É" | "â" | "ê" | "î" | "û" | "ā" | "Ē" | "ē" | "ī" | "Š"
| "š" | "Ś" | "ś" | "ū" | "" | "" | ""
| "š" | "Ś" | "ś" | "ū" | "" | "" | ""

// A label sequence: a surface label and a column label separated by exactly
// one space, or either label on its own.
labels: surface_label " " column_label
| surface_label
| column_label

// Object label: one of the object keywords below, immediately followed by
// zero or more status flags (see the `status` rule).
// NOTE(review): the terminal name PRIMS matches the text "prism" and looks like
// a transposition of PRISM; confirm nothing references the token by name
// before renaming it.
object_label: (TABLET | ENVELOPE | PRIMS | BULLA) status
TABLET: "tablet"
ENVELOPE: "envelope"
PRIMS: "prism"
BULLA: "bulla"

// Column label: a lower-case Roman numeral followed by zero or more status
// flags. Both numeral terminals accept any non-empty run of the listed
// letters; they do not check that the run is a well-formed Roman numeral.
column_label: ROMAN_NUMERAL status
ROMAN_NUMERAL: /[ivxlcdm]+/
// Upper-case variant; not referenced by column_label itself.
CAPITAL_ROMAN_NUMERAL: /[IVXLCDM]+/

// Surface label: one of the abbreviations below, immediately followed by zero
// or more status flags (see the `status` rule).
surface_label: (OBVERSE | REVERSE | BOTTOM | EDGE | LEFT | RIGHT | TOP) status
OBVERSE: "o"
REVERSE:"r"
BOTTOM: "b.e."
EDGE: "e."
LEFT: "l.e."
RIGHT: "r.e."
TOP: "t.e."

// Status: any number (including none) of the flag terminals below, in any
// order.
status: (PRIME | LEGACY_PRIME | UNCERTAIN | CORRECTION | COLLATION | NO_LONGER_VISIBLE)*
PRIME: "'"
// NOTE(review): both alternatives below render as empty strings in this view;
// the original characters were presumably lost in transit. Confirm against the
// repository before relying on this line.
LEGACY_PRIME: "" | ""
UNCERTAIN: "?"
CORRECTION: "!"
COLLATION: "*"
NO_LONGER_VISIBLE: "°"
Original file line number Diff line number Diff line change
Expand Up @@ -2,18 +2,21 @@
%import common.LETTER
%import common.LCASE_LETTER
%import .ebl_atf_common (free_text)
%import .ebl_atf_common (object, OBJECT, generic_object, fragment)
%import .ebl_atf_common (surface, SURFACE, generic_surface, face, edge)
%import .ebl_atf_common (seal)
%import .ebl_atf_common (VALUE_CHARACTER, LOGOGRAM_CHARACTER)
%import .ebl_atf_common (AKKADIAN_ALPHABET, GREEK_ALPHABET)
%import .ebl_atf_common (VALUE_CHARACTER, LOGOGRAM_CHARACTER)
%import .ebl_atf_common (labels)
%import .ebl_atf_common (CAPITAL_ROMAN_NUMERAL)
%import .ebl_atf_common (PRIME, LEGACY_PRIME, UNCERTAIN, CORRECTION, COLLATION, NO_LONGER_VISIBLE)
%import .ebl_atf_common (object_label, surface_label, column_label)
%import .ebl_atf_abbreviations (PERIOD)
%import .legacy_atf (LEGACY_OPEN_HALF_BRACKET, LEGACY_CLOSE_HALF_BRACKET)
%import .legacy_atf (LEGACY_ORACC_JOINER, LEGACY_ORACC_DISH_DIVIDER)
%import .legacy_atf (legacy_uncertain_sign_prefix)

note_line: "#note: " _markup
markup: _markup

_markup: (emphasis_part | language_part | bibliography_part | string_part | url_part)+
language_part: "@" LANGUAGE "{" text "}"
emphasis_part: "@i{" note_text "}"
Expand Down Expand Up @@ -328,59 +331,4 @@ greek_word: (greek_enclosure | greek_word_part)* greek_word_part (greek_enclosur
| unclear_sign
| unknown_number_of_signs
?greek_enclosure: _any_open | _any_close
greek_letter: GREEK_ALPHABET flags

labels: surface_label " " column_label
| surface_label
| column_label

object_label: (TABLET | ENVELOPE | PRIMS | BULLA) status
TABLET: "tablet"
ENVELOPE: "envelope"
PRIMS: "prism"
BULLA: "bulla"

column_label: ROMAN_NUMERAL status
ROMAN_NUMERAL: /[ivxlcdm]+/
CAPITAL_ROMAN_NUMERAL: /[IVXLCDM]+/

surface_label: (OBVERSE | REVERSE | BOTTOM | EDGE | LEFT | RIGHT | TOP) status
OBVERSE: "o"
REVERSE:"r"
BOTTOM: "b.e."
EDGE: "e."
LEFT: "l.e."
RIGHT: "r.e."
TOP: "t.e."


?at_line: "@" at_line_value

?at_line_value: seal | column | heading | discourse | object_with_status
| surface_with_status | divisions | composite

surface_with_status: surface " "? status

object_with_status: object " "? status

column: "column " INT " "? status

heading: "h" INT [ " " _markup ]

!discourse: "catchline" | "colophon" | "date" | "signature" | "signatures" | "summary" | "witnesses"

divisions: "m=division " free_text [" " INT]

?composite: composite_composite | composite_start | composite_end | composite_milestone
composite_start: "div " free_text [" " INT]
composite_end: "end " free_text
composite_composite: "composite"
composite_milestone: "m=locator " free_text [" " INT]

status: (PRIME | LEGACY_PRIME | UNCERTAIN | CORRECTION | COLLATION | NO_LONGER_VISIBLE)*
PRIME: "'"
LEGACY_PRIME: "" | ""
UNCERTAIN: "?"
CORRECTION: "!"
COLLATION: "*"
NO_LONGER_VISIBLE: "°"
greek_letter: GREEK_ALPHABET flags
3 changes: 3 additions & 0 deletions ebl/transliteration/domain/signs_transformer.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,9 @@ def ebl_atf_text_line__number_name_part(self, children):
def ebl_atf_text_line__sub_index(self, sub_index=""):
    """Convert ``sub_index`` with ``sub_index_to_int`` and return the result.

    ``sub_index`` defaults to the empty string when no value is supplied.
    """
    converted = sub_index_to_int(sub_index)
    return converted

def ebl_atf_text_line__modifier(self, tokens):
    """Concatenate the string form of every token into a single string."""
    return "".join(str(token) for token in tokens)

def ebl_atf_text_line__modifiers(self, tokens):
    """Return the string form of every token, in order, as a tuple."""
    return tuple(str(token) for token in tokens)

Expand Down

0 comments on commit f1e3729

Please sign in to comment.