-
Notifications
You must be signed in to change notification settings - Fork 2
/
flexion.fst
103 lines (85 loc) · 3.09 KB
/
flexion.fst
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
% Define the set of valid symbol pairs for the two-level rules.
% The symbol = is used to mark the boundary between the stem and
% the inflectional suffix. It is deleted here.
% \$ is the end symbol
%ALPHABET = [\-A-Za-z0-9é] [\.\=0]:<>
% Read the lexical items from a separate file
% $WORDS$ is used further down as the stem in both the noun patterns
% ($WORDS$ <NOUN>:<>) and the verb pattern ($WORDS$ <VERB>:<>) of $GENERATE$.
$WORDS$ = "lexicon"
% morphophonological rules: generating surface forms
% remove empty morphemes
%$R0$ = {\=0}<=><>
% ALPHABET is reassigned before each rule, so the order of the statements
% below matters: every rule is meant to see only the alphabet defined
% directly above it.
% Alphabet for $R0$: '-', letters, digits, ø, ', ( and ) map to themselves;
% '.', '=' and '0' may be realised as the empty symbol <>.
ALPHABET = [\-A-Za-z0-9ø'\(\)] [\.\=0]:<>
% $R0$: two-level rule with left context (\=:<>); reads as "0 is realised
% as <> exactly when it follows a deleted '='".
% NOTE(review): '0' is covered both by 0-9 and by 0:<> - confirm intended.
$R0$ = (\=:<>) 0<=><>
% Alphabet for $R1$: same as above with the roles of '=' and '-' swapped
% ('=' maps to itself, '-' may be realised as <>).
ALPHABET = [\=A-Za-z0-9ø'\(\)] [\.\-0]:<>
% $R1$: same rule shape as $R0$, with a deleted '-' as the left context.
$R1$ = (\-:<>) 0<=><>
% $SPELLOUT$ applies $R0$ first and feeds its output to $R1$.
$SPELLOUT$ = $R0$ || $R1$
% transliteration
% insertion of character separators and sign numbers seems to be *impossible*,
% so we work with a representation that strips all of them off
% Transliteration rules. As above, every rule is preceded by its own
% ALPHABET statement, so the statement order must be kept.
% $TRANSLIT$ is currently not part of the result: its only use, in the
% final expression of this file, is commented out.
%
% $k#$: k may surface as k or as <>; the rule reads as "k stays k exactly
% when the following symbol surfaces as one of a, e, i, o, u".
ALPHABET = [\-A-Za-z0-9ø'\(\)] k:[k<>]
$k#$ = \
k <=> k (.:[aeiou])
% $g#$: the same rule for g.
ALPHABET = [\-A-Za-z0-9ø'\(\)] g:[g<>]
$g#$ = \
g <=> g (.:[aeiou])
% $n$: m and n may surface as themselves or as <>; reads as "m/n is
% realised as <> exactly when the following symbol surfaces as n".
ALPHABET = [\-A-Za-z0-9ø'\(\)] m:[m<>] n:[n<>]
$n$ = [mn] <=> <> (.:n)
% $r$: same alphabet and rule shape, with a following surface r as context.
ALPHABET = [\-A-Za-z0-9ø'\(\)] m:[m<>] n:[n<>]
$r$ = [mn] <=> <> (.:r)
% $gen$: a may surface as a or as <>; it is realised as <> in the right
% context (.:ka).
% NOTE(review): confirm how the compiler parses ".:ka" (presumably the pair
% .:k followed by a) and that this is the intended context.
ALPHABET = [\-A-Za-z0-9ø'\(\)] a:[a<>]
$gen$ = \
a <=> <> (.:ka)
% The rules are chained by composition in the order gen, g#, k#, n, r.
$TRANSLIT$ = $gen$ || $g#$ || $k#$ || $n$ || $r$
% morphological rules: generating deep morphology
% Define a transducer for case endings
% There can be multiple case endings in Sumerian
% $NCASE$: a <GEN> tag paired with "ak" and an <ABL> tag paired with "ta",
% in that order, each marked optional with '?'. Because both parts are
% optional, $NCASE$ can also match nothing at all.
$NCASE$ = <GEN>:{ak}? <ABL>:{ta}?
% $CASE$: one of <ABS> (paired with "ø"), <ERG> (paired with "e"), or
% whatever $NCASE$ accepts.
$CASE$ = <ABS>:{ø} | <ERG>:{e} | $NCASE$
% rudimentary verbal morphology
% http://oracc.museum.upenn.edu/etcsri/parsing/index.html#VPT
% abbreviations partially changed to match Jagersma
% Each $Vn$ pairs one or more analysis tags with the string of the
% corresponding morpheme. $V1$..$V11$ are used as prefix slots and
% $V13$/$V14$ as suffix slots in $GENERATE$. There is no $V12$ variable:
% in $GENERATE$ the verb stem ($WORDS$ <VERB>:<>) sits between $V11$ and
% $V13$. $V3$ and $V15$ are disabled (commented out).
$V1$ = <NEG>:{nu} | <MOD>:{ga} | <MOD>:{ha} | <MOD>:{nan} | <ANT>:{u}
$V2$ = <FIN>:{i}
% $V3$ = <COOR>:{nga}
$V4$ = <VEN>:{mu} | <VEN>:{m}
$V5$ = <MID>:{ba} | <3-SG-NH>:{b}
$V6$ = <2-SG-A>:{e} | <3-SG-H>:{nn} | <3-PL>:{nnee}
$V7$ = <DAT>:a
$V8$ = <COM>:{da}
$V9$ = <ABL>:{ta} | <TERM>:{szi} | <TERM>:{sze}
$V10$ = <L1>:{ni} | <L1-SYN>:{n} | <L2>:{i} | <L3>:{i}
% NOTE(review): <3-SG-HN-P> differs from the "NH" spelling used in the
% other tags of this slot and maps to the same string as <3-SG-NH-P>;
% possibly a typo for another tag - confirm against the ETCSRI slot table.
$V11$ = <3-SG-H-A>:{n} | <3-SG-HN-P>:{b} | <3-SG-NH-P>:{b} | <3-SG-NH-L3>:{b}
$V13$ = <PF>:{ed} | <PL>:{ene}
$V14$ = <3-SG-A>:{e} | <3-SG-S>:{ø} | <3-SG-P>:{ø} | <3-PL>:{esz}
% $V15$ = <SUB>:a
% Concatenate the lexical forms and the inflectional endings and
% put a morpheme boundary in between which is not printed in the analysis.
%
% Nouns: a stem from $WORDS$ tagged <NOUN>, either bare or followed by zero
% to three $NCASE$ groups and one $CASE$, each introduced by an inserted '='.
% Verbs: optional prefix slots $V1$..$V11$ (each followed by an inserted
% '-'), a stem from $WORDS$ tagged <VERB>, then optional suffix slots $V13$
% and $V14$ (each preceded by an inserted '-').
%
% The definition is one expression continued with trailing backslashes.
% Full-line comments inside it are skipped by the compiler, so the LAST line
% of the expression must not end in a backslash: otherwise the continuation
% runs through the comment lines that follow and swallows the next statement
% of the file.
$GENERATE$ = \
$WORDS$ <NOUN>:<> | \
% $WORDS$ <NOUN>:<> (<>:\= $NCASE$)* <>:\= $CASE$
% works, but it's safer to limit recursion depth
$WORDS$ <NOUN>:<> <>:\= $CASE$ | \
$WORDS$ <NOUN>:<> <>:\= $NCASE$ <>:\= $CASE$ | \
$WORDS$ <NOUN>:<> <>:\= $NCASE$ <>:\= $NCASE$ <>:\= $CASE$ | \
$WORDS$ <NOUN>:<> <>:\= $NCASE$ <>:\= $NCASE$ <>:\= $NCASE$ <>:\= $CASE$ | \
($V1$ <>:\-)? \
($V2$ <>:\-)? \
% ($V3$ <>:\-)? \
($V4$ <>:\-)? \
($V5$ <>:\-)? \
($V6$ <>:\- \
% as soon as one of the following is used, V6 is obligatory
($V7$ <>:\-)? \
($V8$ <>:\-)? \
($V9$ <>:\-)? \
($V10$ <>:\-)? )? \
($V11$ <>:\-)? \
$WORDS$ <VERB>:<> \
(<>:\- $V13$)? \
(<>:\- $V14$)?
% (<>:\- $V15$)?
% (to re-enable the $V15$ slot, define $V15$ and restore the trailing
% backslash on the $V14$ line above)
% Apply the two level rules
% we actually do three levels: transliteration
% The result transducer is stored in the output file
% Result expression: $GENERATE$ composed with $SPELLOUT$. The further
% composition with $TRANSLIT$ is disabled by the end-of-line comment.
$GENERATE$ || $SPELLOUT$ %|| $TRANSLIT$
% skip $SPELLOUT$ to generate deep morphology from features