Merge pull request #1 from lipu-linku/feature/split
Finalize schema
gregdan3 authored Dec 5, 2023
2 parents 0d0b8be + 4be788b commit 6a4411f
Showing 515 changed files with 98,114 additions and 139,363 deletions.
53 changes: 53 additions & 0 deletions .github/workflows/generate_schema.yml
@@ -0,0 +1,53 @@
name: Generate JSON Schemas

on:
  push:
    branches: ["main"]
  pull_request:
    branches: ["main"]
  workflow_dispatch:

permissions:
  contents: write

jobs:
  generate:
    runs-on: ubuntu-latest
    name: Generate JSON schemas from Zod schemas
    defaults:
      run:
        working-directory: ./schemas
    outputs:
      schema_changed: ${{ steps.commit.outputs.committed }}
      commit_sha: ${{ steps.commit.outputs.commit_long_sha }}
    steps:
      - name: Checkout
        uses: actions/checkout@v3
        with:
          repository: ${{ github.event.pull_request.head.repo.full_name }}
          ref: ${{ github.event.pull_request.head.ref }}

      - name: Setup pnpm
        uses: pnpm/action-setup@v2
        with:
          version: 8
          run_install: |
            - args: [--frozen-lockfile]
              cwd: ./schemas

      - name: Generate JSON Schemas
        run: pnpm run generate:schemas

      - name: Commit schemas
        id: commit
        uses: EndBug/add-and-commit@v9
        with:
          message: "Generated schemas for ${{ github.event.pull_request.head.sha || github.event.head_commit.id || vars.GITHUB_SHA }}"

  validate:
    name: Validate TOMLs based on schema
    needs: generate
    uses: ./.github/workflows/validate_toml.yml
    with:
      commit_sha: ${{ needs.generate.outputs.commit_sha }}
    if: ${{ needs.generate.outputs.schema_changed == 'true' }}
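Note: `committed` and `commit_long_sha` are outputs exposed by the EndBug/add-and-commit action, so the downstream `validate` job runs only when the regenerated schemas actually changed, and it validates against the exact commit that contains them.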
31 changes: 24 additions & 7 deletions .github/workflows/validate_toml.yml
@@ -2,23 +2,40 @@ name: Validate TOML Files

on:
  push:
    branches: ["main"]
  pull_request:
    branches: ["main"]
    paths:
      - "**.toml"
  workflow_call:
    inputs:
      commit_sha:
        type: string
        required: true
  workflow_dispatch:

permissions:
  contents: read

jobs:
  validate:
    env:
      BranchRef: ${{ inputs.commit_sha || github.sha }}
    runs-on: ubuntu-latest
    steps:
      - name: Checkout
        uses: actions/checkout@v3
        with:
          ref: ${{ env.BranchRef }}

      - name: Validate translations
        run: npx --yes @taplo/cli check --schema https://raw.githubusercontent.com/lipu-linku/sona/${{vars.GITHUB_SHA || 'main'}}/schemas/translation.schema.json ./translations/*.toml
      - name: Validate definition files
        run: npx --yes @taplo/cli check --schema https://raw.githubusercontent.com/lipu-linku/sona/$BranchRef/schemas/generated/definition_translation.json ./translations/**/definitions.toml

      - name: Validate words.toml
        run: npx --yes @taplo/cli check --schema https://raw.githubusercontent.com/lipu-linku/sona/${{vars.GITHUB_SHA || 'main'}}/schemas/words.schema.json ./words.toml
      - name: Validate commentary files
        run: npx --yes @taplo/cli check --schema https://raw.githubusercontent.com/lipu-linku/sona/$BranchRef/schemas/generated/commentary_translation.json ./translations/**/commentary.toml

      - name: Validate etymology files
        run: npx --yes @taplo/cli check --schema https://raw.githubusercontent.com/lipu-linku/sona/$BranchRef/schemas/generated/etymology_translation.json ./translations/**/etymology.toml

      - name: Validate sitelen pona files
        run: npx --yes @taplo/cli check --schema https://raw.githubusercontent.com/lipu-linku/sona/$BranchRef/schemas/generated/sitelen_pona_translation.json ./translations/**/sp_etymology.toml

      - name: Validate word files
        run: npx --yes @taplo/cli check --schema https://raw.githubusercontent.com/lipu-linku/sona/$BranchRef/schemas/generated/word.json ./words/*.toml
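Each step invokes taplo's `check --schema`, which validates every file matched by the glob against the referenced JSON Schema; the `./translations/**/` patterns cover the per-language subdirectories written by the converter below.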
3 changes: 3 additions & 0 deletions .scripts/.gitignore
@@ -0,0 +1,3 @@
*.txt
*.json
__pycache__
18 changes: 18 additions & 0 deletions .scripts/Makefile
@@ -0,0 +1,18 @@
PROJDIR=.
INDIR=.
OUTDIR=.

init: ${INDIR}/nimi_pu.txt ${INDIR}/nimi_pi_pu_ala.txt ${INDIR}/compounds.txt ${INDIR}/data.json
process: ${OUTDIR}/nimi.json

${INDIR}/nimi_pu.txt:
	curl -s https://tokipona.org/nimi_pu.txt > ${INDIR}/nimi_pu.txt
${INDIR}/nimi_pi_pu_ala.txt:
	curl -s https://tokipona.org/nimi_pi_pu_ala.txt > ${INDIR}/nimi_pi_pu_ala.txt
${INDIR}/compounds.txt:
	curl -s https://tokipona.org/compounds.txt > ${INDIR}/compounds.txt
${INDIR}/data.json:
	curl -s https://linku.la/jasima/data.json > ${INDIR}/data.json

${OUTDIR}/nimi.json: ${PROJDIR}/jsonify_nimi.py ${INDIR}/nimi_pu.txt ${INDIR}/nimi_pi_pu_ala.txt
	python ${PROJDIR}/jsonify_nimi.py | jq > ${OUTDIR}/nimi.json
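`make init` downloads the upstream word lists and the jasima `data.json` snapshot; `make process` then runs `jsonify_nimi.py` over the text files and pretty-prints the result with `jq` into `nimi.json`. The same `jsonify_nimi` module is imported directly by `converter.py` to build its `TXT_DATA` lookup.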
241 changes: 241 additions & 0 deletions .scripts/converter.py
@@ -0,0 +1,241 @@
from collections import defaultdict
import os
import json
from typing import Any
import tomlkit
from functools import partial

from jsonify_nimi import jsonify_nimi

TXT_DATA = jsonify_nimi()


JASIMA_DATA = "data.json"


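# Factory for a two-level defaultdict: d[outer][inner] materializes empty
# dicts on first access, so transformers can assign nested fields without
# pre-creating intermediate tables.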
def nested_defaultdict():
    return defaultdict(partial(defaultdict, dict))


def transform_ku_data(word: str, data: dict):
    return TXT_DATA.get(word) or None


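# jasima stores etymology as parallel ";"-separated strings (langs, defs,
# words, alts). Split and zip them, separating the translatable parts
# (language, definition) from the untranslatable ones (word, alt).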
def transform_etym_data(word: str, data: dict):
    if not data:
        return []

    transable_etyms = []
    untransable_etyms = []

    langs = data.get("langs", "").split(";")
    defs = data.get("defs", "").split(";")
    words = data.get("words", "").split(";")
    alts = data.get("alts", "").split(";")

    for lang, _def, word, alt in zip(langs, defs, words, alts):
        transable = dict()
        untransable = dict()
        if lang:
            transable["language"] = lang
        if _def:
            transable["definition"] = _def
        if word:
            untransable["word"] = word
        if alt:
            untransable["alt"] = alt
        transable_etyms.append(transable)
        untransable_etyms.append(untransable)
    return transable_etyms, untransable_etyms


def transform_recognition_data(word: str, data: dict):
    new_recog = dict()
    for key, value in data.items():
        new_recog[key] = int(value)
    return new_recog


def transform_to_list(word: str, data: str, splitter: str = ",") -> list:
    return [elem.strip() for elem in data.split(splitter)] if data else []


def noop(word: str, data: dict, _return_if_null: Any = ""):
    return data if data else _return_if_null


def trash(word: str, data: dict):
    return None


WORDS = nested_defaultdict()
REPRESENTATIONS = nested_defaultdict()
DEFINITIONS = nested_defaultdict()

COMMENTARY = nested_defaultdict()
SP_ETYMOLOGY = nested_defaultdict()
ETYMOLOGY = nested_defaultdict()

TRANSFORMER = "t"
DESTINATION = "d"


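# Per-field routing for word-level data: each jasima field is run through a
# transformer and written into the named destination table; trash() drops
# fields that are handled by the translation pipeline or intentionally omitted.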
TRANSFORM_MAP = {
    "word": {TRANSFORMER: noop, DESTINATION: WORDS},
    # NOTE: this could be in `representations` but we decided against that
    "sitelen_pona": {
        TRANSFORMER: partial(transform_to_list, splitter=" "),
        DESTINATION: REPRESENTATIONS,
    },
    "ucsur": {TRANSFORMER: noop, DESTINATION: REPRESENTATIONS},
    "sitelen_pona_etymology": {TRANSFORMER: trash},  # send to translate
    "sitelen_sitelen": {TRANSFORMER: noop, DESTINATION: REPRESENTATIONS},
    "sitelen_emosi": {TRANSFORMER: noop, DESTINATION: REPRESENTATIONS},
    # "luka_pona": {TRANSFORMER: partial(noop, _return_if_null=dict()), DESTINATION: WORDS},
    "luka_pona": {TRANSFORMER: trash},  # to be replaced with totally different doc
    "audio": {TRANSFORMER: partial(noop, _return_if_null=dict()), DESTINATION: WORDS},
    "coined_year": {TRANSFORMER: noop, DESTINATION: WORDS},
    "coined_era": {TRANSFORMER: noop, DESTINATION: WORDS},
    "book": {TRANSFORMER: partial(noop, _return_if_null="none"), DESTINATION: WORDS},
    "usage_category": {
        TRANSFORMER: partial(noop, _return_if_null="obscure"),
        DESTINATION: WORDS,
    },
    "source_language": {
        TRANSFORMER: partial(noop, _return_if_null="unknown"),
        DESTINATION: WORDS,
    },
    "etymology": {TRANSFORMER: trash},
    "etymology_data": {TRANSFORMER: trash},  # to transform and send to translate
    "creator": {
        TRANSFORMER: partial(transform_to_list, splitter=","),
        DESTINATION: WORDS,
    },
    "ku_data": {TRANSFORMER: transform_ku_data, DESTINATION: WORDS},
    "recognition": {TRANSFORMER: transform_recognition_data, DESTINATION: WORDS},
    "see_also": {
        TRANSFORMER: partial(transform_to_list, splitter=","),
        DESTINATION: WORDS,
    },
    "tags": {TRANSFORMER: trash},
    "author_verbatim": {TRANSFORMER: noop, DESTINATION: WORDS},
    "author_verbatim_source": {TRANSFORMER: noop, DESTINATION: WORDS},
    "pu_verbatim": {
        TRANSFORMER: partial(noop, _return_if_null=None),
        DESTINATION: WORDS,
    },
    "commentary": {TRANSFORMER: trash},  # send to translate
    "def": {TRANSFORMER: trash},  # translate special case
}

TRANSLATION_MAP = {
    "etymology_data": {
        TRANSFORMER: transform_etym_data,
        DESTINATION: ETYMOLOGY,
    },
    "commentary": {
        TRANSFORMER: noop,
        DESTINATION: COMMENTARY,
    },
    "sitelen_pona_etymology": {
        TRANSFORMER: noop,
        DESTINATION: SP_ETYMOLOGY,
    },
}


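# Writes one TOML file per language under <dir>/<lang>/<filename>, embedding
# the relative schema path so the files can be validated in place.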
def write_translated(
    data: dict,
    dir: str,
    filename: str,
    schema: str = "../../schemas/generated/word.json",
):
    for lang, d in data.items():
        d["$schema"] = schema
        os.makedirs(f"{dir}/{lang}", exist_ok=True)
        with open(f"{dir}/{lang}/{filename}", "w") as f:
            tomlified = tomlkit.dumps(d, sort_keys=True)
            f.write(tomlified)


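# Main flow: route every field of every word through TRANSFORM_MAP into the
# word/representation tables, then, per language, collect definitions and the
# translatable tables (commentary, etymology, sitelen pona etymology), and
# finally write words/*.toml plus the per-language translation files.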
def main():
    os.makedirs("../translations", exist_ok=True)
    os.makedirs("../words", exist_ok=True)

    with open(JASIMA_DATA, "r") as f:
        jasima = json.loads(f.read())
    langs = jasima["languages"]
    data = jasima["data"]

    for word in data.keys():
        for field in TRANSFORM_MAP.keys():
            fetched = data[word].get(field)
            formatted = TRANSFORM_MAP[field][TRANSFORMER](word, fetched)
            if formatted is not None:
                write_to = TRANSFORM_MAP[field][DESTINATION]
                write_to[word][field] = formatted

            # if field == "ucsur":
            #     codepoint = data[word].get("ucsur")
            #     character = ""
            #     if codepoint:
            #         character = chr(int(codepoint[2:], base=16))
            #     words[word]["ucsur_codepoint"] = codepoint
            #     words[word]["ucsur_character"] = character
            #     continue

        for lang in langs.keys():
            DEFINITIONS[lang][word] = data[word]["def"].get(lang) or ""
            for field in TRANSLATION_MAP:
                fetched = data[word].get(field)
                formatted = TRANSLATION_MAP[field][TRANSFORMER](word, fetched)

                # TODO: key-aware transform

                if formatted is not None:
                    write_to = TRANSLATION_MAP[field][DESTINATION]
                    if field == "etymology_data":
                        untransable = formatted[1] if formatted else []
                        formatted = formatted[0] if formatted else []
                        field = "etymology"
                        WORDS[word][field] = untransable
                    write_to[lang][word] = formatted

    # TODO: order keys freely instead of alphabetically
    # or crowdin will solve this for us
    for word, worddata in WORDS.items():
        worddata["representations"] = REPRESENTATIONS[word]
        worddata["$schema"] = "../schemas/generated/word.json"
        with open(f"../words/{word}.toml", "w") as f:
            tomlified = tomlkit.dumps(worddata, sort_keys=True)
            f.write(tomlified)

    write_translated(
        DEFINITIONS,
        "../translations",
        "definitions.toml",
        schema="../../schemas/generated/definition_translation.json",
    )
    write_translated(
        COMMENTARY,
        "../translations",
        "commentary.toml",
        schema="../../schemas/generated/commentary_translation.json",
    )
    write_translated(
        ETYMOLOGY,
        "../translations",
        "etymology.toml",
        schema="../../schemas/generated/etymology_translation.json",
    )
    write_translated(
        SP_ETYMOLOGY,
        "../translations",
        "sp_etymology.toml",
        schema="../../schemas/generated/sitelen_pona_translation.json",
    )


if __name__ == "__main__":
    main()
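For local spot-checking outside CI, the generated TOML files can be validated against the same generated JSON Schemas that the taplo workflow steps use. A minimal sketch, assuming Python 3.11+ (for the stdlib tomllib module) and the third-party jsonschema package, neither of which is a dependency declared by this commit; the file paths are illustrative:

# Mirrors the CI taplo check: validate one generated word file against the
# generated JSON Schema.
import json
import tomllib

from jsonschema import validate

with open("words/ale.toml", "rb") as f:
    word_data = tomllib.load(f)

with open("schemas/generated/word.json", "r") as f:
    word_schema = json.load(f)

# The inline "$schema" key is editor metadata, not word data, so drop it
# before checking; validate() raises ValidationError on a mismatch.
word_data.pop("$schema", None)
validate(instance=word_data, schema=word_schema)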
