Move writing system keys to representations parent key in word data

lipu-linku · Dec 1, 2023 · 9b78f22 · 9b78f22
1 parent b3b13cd
commit 9b78f22
Show file tree

Hide file tree

Showing 260 changed files with 1,602 additions and 1,066 deletions.
diff --git a/.scripts/converter.py b/.scripts/converter.py
@@ -70,44 +70,60 @@ def trash(word: str, data: dict):
 
 
 WORDS = nested_defaultdict()
+REPRESENTATIONS = nested_defaultdict()
 DEFINITIONS = nested_defaultdict()
 
 COMMENTARY = nested_defaultdict()
 SP_ETYMOLOGY = nested_defaultdict()
 ETYMOLOGY = nested_defaultdict()
 
+TRANSFORMER = "t"
+DESTINATION = "d"
+
 
 TRANSFORM_MAP = {
-    # NOTE: nasin nimi li sama nimi Linku
-    "word": noop,
-    "sitelen_pona": partial(transform_to_list, splitter=" "),
-    "ucsur": noop,
-    "sitelen_pona_etymology": trash,  # send to translate
-    "sitelen_sitelen": noop,
-    "sitelen_emosi": noop,
-    # "luka_pona": partial(noop, _return_if_null=dict()),
-    "luka_pona": trash,  # to be replaced with totally different doc
-    "coined_year": noop,
-    "coined_era": noop,
-    "book": partial(noop, _return_if_null="none"),
-    "usage_category": partial(noop, _return_if_null="obscure"),
-    "source_language": partial(noop, _return_if_null="unknown"),
-    "etymology": trash,
-    "etymology_data": trash,  # to transform and send to translate
-    "ku_data": transform_ku_data,
-    "recognition": transform_recognition_data,
-    "see_also": partial(transform_to_list, splitter=","),
-    "tags": trash,
-    "author_verbatim": noop,
-    "author_verbatim_source": noop,
-    "pu_verbatim": partial(noop, _return_if_null=None),
-    "commentary": trash,  # send to translate
-    "def": trash,
+    "word": {TRANSFORMER: noop, DESTINATION: WORDS},
+    # NOTE: `word` (above) could be in `representations` but we decided against that
+    "sitelen_pona": {
+        TRANSFORMER: partial(transform_to_list, splitter=" "),
+        DESTINATION: REPRESENTATIONS,
+    },
+    "ucsur": {TRANSFORMER: noop, DESTINATION: REPRESENTATIONS},
+    "sitelen_pona_etymology": {TRANSFORMER: trash},  # send to translate
+    "sitelen_sitelen": {TRANSFORMER: noop, DESTINATION: REPRESENTATIONS},
+    "sitelen_emosi": {TRANSFORMER: noop, DESTINATION: REPRESENTATIONS},
+    # "luka_pona": {TRANSFORMER: partial(noop, _return_if_null=dict()), DESTINATION: WORDS},
+    "luka_pona": {TRANSFORMER: trash},  # to be replaced with totally different doc
+    "coined_year": {TRANSFORMER: noop, DESTINATION: WORDS},
+    "coined_era": {TRANSFORMER: noop, DESTINATION: WORDS},
+    "book": {TRANSFORMER: partial(noop, _return_if_null="none"), DESTINATION: WORDS},
+    "usage_category": {
+        TRANSFORMER: partial(noop, _return_if_null="obscure"),
+        DESTINATION: WORDS,
+    },
+    "source_language": {
+        TRANSFORMER: partial(noop, _return_if_null="unknown"),
+        DESTINATION: WORDS,
+    },
+    "etymology": {TRANSFORMER: trash},
+    "etymology_data": {TRANSFORMER: trash},  # to transform and send to translate
+    "ku_data": {TRANSFORMER: transform_ku_data, DESTINATION: WORDS},
+    "recognition": {TRANSFORMER: transform_recognition_data, DESTINATION: WORDS},
+    "see_also": {
+        TRANSFORMER: partial(transform_to_list, splitter=","),
+        DESTINATION: WORDS,
+    },
+    "tags": {TRANSFORMER: trash},
+    "author_verbatim": {TRANSFORMER: noop, DESTINATION: WORDS},
+    "author_verbatim_source": {TRANSFORMER: noop, DESTINATION: WORDS},
+    "pu_verbatim": {
+        TRANSFORMER: partial(noop, _return_if_null=None),
+        DESTINATION: WORDS,
+    },
+    "commentary": {TRANSFORMER: trash},  # send to translate
+    "def": {TRANSFORMER: trash},  # translate special case
 }
 
-TRANSFORMER = "t"
-DESTINATION = "d"
-
 TRANSLATION_MAP = {
     "etymology_data": {
         TRANSFORMER: transform_etym_data,
@@ -144,9 +160,10 @@ def main():
     for word in data.keys():
         for field in TRANSFORM_MAP.keys():
             fetched = data[word].get(field)
-            formatted = TRANSFORM_MAP[field](word, fetched)
+            formatted = TRANSFORM_MAP[field][TRANSFORMER](word, fetched)
             if formatted is not None:
-                WORDS[word][field] = formatted
+                write_to = TRANSFORM_MAP[field][DESTINATION]
+                write_to[word][field] = formatted
 
             # if field == "ucsur":
             #     codepoint = data[word].get("ucsur")
@@ -177,6 +194,7 @@ def main():
     # TODO: order keys freely instead of alphabetically
     # or crowdin will solve this for us
     for word, worddata in WORDS.items():
+        worddata["representations"] = REPRESENTATIONS[word]
         with open(f"../words/{word}.toml", "w") as f:
             tomlified = tomlkit.dumps(worddata, sort_keys=True)
             f.write(tomlified)

diff --git a/words/Pingo.toml b/words/Pingo.toml
@@ -4,11 +4,7 @@ book = "ku lili"
 coined_era = "post-pu"
 coined_year = "2020"
 see_also = []
-sitelen_emosi = ""
-sitelen_pona = ["Pingo"]
-sitelen_sitelen = ""
 source_language = "a priori"
-ucsur = ""
 usage_category = "obscure"
 word = "Pingo"
 
@@ -21,3 +17,9 @@ car = 6
 2021-10 = 20
 2022-08 = 5
 2023-09 = 4
+
+[representations]
+sitelen_emosi = ""
+sitelen_pona = ["Pingo"]
+sitelen_sitelen = ""
+ucsur = ""
diff --git a/words/a.toml b/words/a.toml
@@ -4,11 +4,7 @@ book = "pu"
 coined_era = "pre-pu"
 coined_year = ""
 see_also = ["kin"]
-sitelen_emosi = "❗"
-sitelen_pona = ["a"]
-sitelen_sitelen = "https://raw.githubusercontent.com/lipu-linku/ijo/main/sitelensitelen/jonathangabel/a.jpg"
 source_language = "onomatopoeia"
-ucsur = "U+F1900"
 usage_category = "core"
 word = "a"
 
@@ -43,3 +39,9 @@ fr = "PARTICULE (accent, émotion ou confirmation)"
 2020-04 = 99
 2022-08 = 99
 2023-09 = 99
+
+[representations]
+sitelen_emosi = "❗"
+sitelen_pona = ["a"]
+sitelen_sitelen = "https://raw.githubusercontent.com/lipu-linku/ijo/main/sitelensitelen/jonathangabel/a.jpg"
+ucsur = "U+F1900"
diff --git a/words/aka.toml b/words/aka.toml
@@ -4,11 +4,7 @@ book = "none"
 coined_era = "post-ku"
 coined_year = "2021"
 see_also = ["natu"]
-sitelen_emosi = ""
-sitelen_pona = ["aka"]
-sitelen_sitelen = ""
 source_language = "multiple"
-ucsur = ""
 usage_category = "obscure"
 word = "aka"
 
@@ -22,3 +18,9 @@ word = "across"
 [recognition]
 2022-08 = 0
 2023-09 = 1
+
+[representations]
+sitelen_emosi = ""
+sitelen_pona = ["aka"]
+sitelen_sitelen = ""
+ucsur = ""
diff --git a/words/akesi.toml b/words/akesi.toml
@@ -4,11 +4,7 @@ book = "pu"
 coined_era = "pre-pu"
 coined_year = ""
 see_also = []
-sitelen_emosi = "🦎"
-sitelen_pona = ["akesi", "akesi2"]
-sitelen_sitelen = "https://raw.githubusercontent.com/lipu-linku/ijo/main/sitelensitelen/jonathangabel/akesi.jpg"
 source_language = "Dutch"
-ucsur = "U+F1901"
 usage_category = "core"
 word = "akesi"
 
@@ -28,3 +24,9 @@ fr = "NOM animal non-mignon ; reptile, amphibien"
 [recognition]
 2022-08 = 98
 2023-09 = 99
+
+[representations]
+sitelen_emosi = "🦎"
+sitelen_pona = ["akesi", "akesi2"]
+sitelen_sitelen = "https://raw.githubusercontent.com/lipu-linku/ijo/main/sitelensitelen/jonathangabel/akesi.jpg"
+ucsur = "U+F1901"
diff --git a/words/ako.toml b/words/ako.toml
@@ -5,14 +5,16 @@ coined_era = ""
 coined_year = ""
 etymology = []
 see_also = ["a"]
-sitelen_emosi = ""
-sitelen_pona = ["ako"]
-sitelen_sitelen = ""
 source_language = "unknown"
-ucsur = ""
 usage_category = "obscure"
 word = "ako"
 
 [recognition]
 2022-08 = 0
 2023-09 = 0
+
+[representations]
+sitelen_emosi = ""
+sitelen_pona = ["ako"]
+sitelen_sitelen = ""
+ucsur = ""
diff --git a/words/ala.toml b/words/ala.toml
@@ -4,11 +4,7 @@ book = "pu"
 coined_era = "pre-pu"
 coined_year = ""
 see_also = []
-sitelen_emosi = "❌"
-sitelen_pona = ["ala"]
-sitelen_sitelen = "https://raw.githubusercontent.com/lipu-linku/ijo/main/sitelensitelen/jonathangabel/ala.jpg"
 source_language = "Georgian"
-ucsur = "U+F1902"
 usage_category = "core"
 word = "ala"
 
@@ -35,3 +31,9 @@ fr = "ADJECTIF non, ne, pas, aucun, zéro"
 [recognition]
 2022-08 = 100
 2023-09 = 100
+
+[representations]
+sitelen_emosi = "❌"
+sitelen_pona = ["ala"]
+sitelen_sitelen = "https://raw.githubusercontent.com/lipu-linku/ijo/main/sitelensitelen/jonathangabel/ala.jpg"
+ucsur = "U+F1902"
diff --git a/words/alasa.toml b/words/alasa.toml
@@ -4,11 +4,7 @@ book = "pu"
 coined_era = "pre-pu"
 coined_year = ""
 see_also = ["lukin"]
-sitelen_emosi = "🏹"
-sitelen_pona = ["alasa"]
-sitelen_sitelen = "https://raw.githubusercontent.com/lipu-linku/ijo/main/sitelensitelen/jonathangabel/alasa.jpg"
 source_language = "Acadian French"
-ucsur = "U+F1903"
 usage_category = "core"
 word = "alasa"
 
@@ -42,3 +38,9 @@ fr = "VERBE chasser, cueillir"
 2020-04 = 97
 2022-08 = 97
 2023-09 = 99
+
+[representations]
+sitelen_emosi = "🏹"
+sitelen_pona = ["alasa"]
+sitelen_sitelen = "https://raw.githubusercontent.com/lipu-linku/ijo/main/sitelensitelen/jonathangabel/alasa.jpg"
+ucsur = "U+F1903"
diff --git a/words/ale.toml b/words/ale.toml
@@ -4,11 +4,7 @@ book = "pu"
 coined_era = "pre-pu"
 coined_year = ""
 see_also = ["ali"]
-sitelen_emosi = "♾️"
-sitelen_pona = ["ale"]
-sitelen_sitelen = "https://raw.githubusercontent.com/lipu-linku/ijo/main/sitelensitelen/jonathangabel/ale.jpg"
 source_language = "Dutch"
-ucsur = "U+F1904"
 usage_category = "core"
 word = "ale"
 
@@ -49,3 +45,9 @@ fr = "ADJECTIF tous, abondants, innombrables\nNOM tout, abondance, la vie, l'uni
 [recognition]
 2022-08 = 92
 2023-09 = 90
+
+[representations]
+sitelen_emosi = "♾️"
+sitelen_pona = ["ale"]
+sitelen_sitelen = "https://raw.githubusercontent.com/lipu-linku/ijo/main/sitelensitelen/jonathangabel/ale.jpg"
+ucsur = "U+F1904"
diff --git a/words/alente.toml b/words/alente.toml
@@ -4,11 +4,7 @@ book = "none"
 coined_era = "post-ku"
 coined_year = "2022"
 see_also = []
-sitelen_emosi = ""
-sitelen_pona = ["alente"]
-sitelen_sitelen = ""
 source_language = "toki pona"
-ucsur = ""
 usage_category = "obscure"
 word = "alente"
 
@@ -18,3 +14,9 @@ word = "ale ante"
 [recognition]
 2022-08 = 2
 2023-09 = 2
+
+[representations]
+sitelen_emosi = ""
+sitelen_pona = ["alente"]
+sitelen_sitelen = ""
+ucsur = ""
diff --git a/words/ali.toml b/words/ali.toml
@@ -4,11 +4,7 @@ book = "pu"
 coined_era = "pre-pu"
 coined_year = ""
 see_also = ["ale"]
-sitelen_emosi = "♾️"
-sitelen_pona = ["ali"]
-sitelen_sitelen = ""
 source_language = "Dutch"
-ucsur = ""
 usage_category = "uncommon"
 word = "ali"
 
@@ -23,3 +19,9 @@ universe = 21
 [recognition]
 2022-08 = 35
 2023-09 = 32
+
+[representations]
+sitelen_emosi = "♾️"
+sitelen_pona = ["ali"]
+sitelen_sitelen = ""
+ucsur = ""
diff --git a/words/alu.toml b/words/alu.toml
@@ -4,11 +4,7 @@ book = "none"
 coined_era = "post-pu"
 coined_year = "2018"
 see_also = ["la"]
-sitelen_emosi = ""
-sitelen_pona = ["alu"]
-sitelen_sitelen = ""
 source_language = "toki pona"
-ucsur = ""
 usage_category = "obscure"
 word = "alu"
 
@@ -20,3 +16,9 @@ word = "la"
 2021-10 = 24
 2022-08 = 3
 2023-09 = 3
+
+[representations]
+sitelen_emosi = ""
+sitelen_pona = ["alu"]
+sitelen_sitelen = ""
+ucsur = ""
diff --git a/words/anpa.toml b/words/anpa.toml
@@ -4,11 +4,7 @@ book = "pu"
 coined_era = "pre-pu"
 coined_year = ""
 see_also = ["noka"]
-sitelen_emosi = "⬇️"
-sitelen_pona = ["anpa"]
-sitelen_sitelen = "https://raw.githubusercontent.com/lipu-linku/ijo/main/sitelensitelen/jonathangabel/anpa.jpg"
 source_language = "Acadian French"
-ucsur = "U+F1905"
 usage_category = "core"
 word = "anpa"
 
@@ -35,3 +31,9 @@ fr = "ADJECTIF incliné vers le bas, humble, qui dépend"
 [recognition]
 2022-08 = 99
 2023-09 = 99
+
+[representations]
+sitelen_emosi = "⬇️"
+sitelen_pona = ["anpa"]
+sitelen_sitelen = "https://raw.githubusercontent.com/lipu-linku/ijo/main/sitelensitelen/jonathangabel/anpa.jpg"
+ucsur = "U+F1905"
diff --git a/words/ante.toml b/words/ante.toml
@@ -4,11 +4,7 @@ book = "pu"
 coined_era = "pre-pu"
 coined_year = ""
 see_also = ["sama"]
-sitelen_emosi = "🔀"
-sitelen_pona = ["ante"]
-sitelen_sitelen = "https://raw.githubusercontent.com/lipu-linku/ijo/main/sitelensitelen/jonathangabel/ante.jpg"
 source_language = "Dutch"
-ucsur = "U+F1906"
 usage_category = "core"
 word = "ante"
 
@@ -63,3 +59,9 @@ fr = "ADJECTIF différent, changé, autre"
 [recognition]
 2022-08 = 99
 2023-09 = 99
+
+[representations]
+sitelen_emosi = "🔀"
+sitelen_pona = ["ante"]
+sitelen_sitelen = "https://raw.githubusercontent.com/lipu-linku/ijo/main/sitelensitelen/jonathangabel/ante.jpg"
+ucsur = "U+F1906"