Merge pull request #1 from lipu-linku/feature/split
Finalize schema
Showing 515 changed files with 98,114 additions and 139,363 deletions.
New file: a GitHub Actions workflow (53 lines) that regenerates the JSON Schemas from the Zod schemas, commits the result, and then validates the TOML files against the fresh schemas.

```yaml
name: Generate JSON Schemas

on:
  push:
    branches: ["main"]
  pull_request:
    branches: ["main"]
  workflow_dispatch:

permissions:
  contents: write

jobs:
  generate:
    runs-on: ubuntu-latest
    name: Generate JSON schemas from Zod schemas
    defaults:
      run:
        working-directory: ./schemas
    outputs:
      schema_changed: ${{ steps.commit.outputs.committed }}
      commit_sha: ${{ steps.commit.outputs.commit_long_sha }}
    steps:
      - name: Checkout
        uses: actions/checkout@v3
        with:
          repository: ${{ github.event.pull_request.head.repo.full_name }}
          ref: ${{ github.event.pull_request.head.ref }}

      - name: Setup pnpm
        uses: pnpm/action-setup@v2
        with:
          version: 8
          run_install: |
            - args: [--frozen-lockfile]
              cwd: ./schemas

      - name: Generate JSON Schemas
        run: pnpm run generate:schemas

      - name: Commit schemas
        id: commit
        uses: EndBug/add-and-commit@v9
        with:
          message: "Generated schemas for ${{ github.event.pull_request.head.sha || github.event.head_commit.id || vars.GITHUB_SHA }}"

  validate:
    name: Validate TOMLs based on schema
    needs: generate
    uses: ./.github/workflows/validate_toml.yml
    with:
      commit_sha: ${{ needs.generate.outputs.commit_sha }}
    if: ${{ needs.generate.outputs.schema_changed == 'true' }}
```
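A note on the gating: EndBug/add-and-commit reports whether it actually created a commit via a `committed` output, which is a string (`'true'`/`'false'`), hence the string comparison in the `if:`. Its `commit_long_sha` output lets the reusable `validate_toml.yml` workflow pin its checkout to the exact commit the generate job just pushed.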
New file: ignore rules (3 lines) that keep the downloaded word lists, generated JSON, and Python bytecode caches out of version control.

```
*.txt
*.json
__pycache__
```
New file: a Makefile (18 lines) that fetches the upstream word lists and jasima data, then builds nimi.json.

```make
PROJDIR=.
INDIR=.
OUTDIR=.

init: ${INDIR}/nimi_pu.txt ${INDIR}/nimi_pi_pu_ala.txt ${INDIR}/compounds.txt ${INDIR}/data.json
process: ${OUTDIR}/nimi.json

${INDIR}/nimi_pu.txt:
	curl -s https://tokipona.org/nimi_pu.txt > ${INDIR}/nimi_pu.txt
${INDIR}/nimi_pi_pu_ala.txt:
	curl -s https://tokipona.org/nimi_pi_pu_ala.txt > ${INDIR}/nimi_pi_pu_ala.txt
${INDIR}/compounds.txt:
	curl -s https://tokipona.org/compounds.txt > ${INDIR}/compounds.txt
${INDIR}/data.json:
	curl -s https://linku.la/jasima/data.json > ${INDIR}/data.json

${OUTDIR}/nimi.json: ${PROJDIR}/jsonify_nimi.py ${INDIR}/nimi_pu.txt ${INDIR}/nimi_pi_pu_ala.txt
	python ${PROJDIR}/jsonify_nimi.py | jq > ${OUTDIR}/nimi.json
```
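Because every download is an ordinary file target, `make init` fetches only the inputs that are missing, and `make process` rebuilds nimi.json only when jsonify_nimi.py or the two word-list files change. Note that compounds.txt and data.json are fetched by `init` but are not prerequisites of nimi.json.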
New file: the Python conversion script (241 lines), which splits jasima's data.json into one TOML file per word plus per-language translation files.

```python
from collections import defaultdict
import os
import json
from typing import Any
from functools import partial

import tomlkit

from jsonify_nimi import jsonify_nimi

TXT_DATA = jsonify_nimi()

JASIMA_DATA = "data.json"

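# Two-level autovivifying mapping: destination[outer][inner] can be
# assigned directly without creating the intermediate dicts first.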
def nested_defaultdict():
    return defaultdict(partial(defaultdict, dict))

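# Replace the jasima ku_data field with the entry parsed from the
# tokipona.org word lists (TXT_DATA); None drops the field for words
# that have no entry there.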
def transform_ku_data(word: str, data: dict):
    return TXT_DATA.get(word) or None

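# Split jasima's semicolon-delimited etymology columns into per-source
# dicts, separating the translatable parts (language, definition) from
# the untranslatable ones (the source word and its alternate form).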
def transform_etym_data(word: str, data: dict):
    if not data:
        return []

    transable_etyms = []
    untransable_etyms = []

    langs = data.get("langs", "").split(";")
    defs = data.get("defs", "").split(";")
    words = data.get("words", "").split(";")
    alts = data.get("alts", "").split(";")

    for lang, _def, word, alt in zip(langs, defs, words, alts):
        transable = dict()
        untransable = dict()
        if lang:
            transable["language"] = lang
        if _def:
            transable["definition"] = _def
        if word:
            untransable["word"] = word
        if alt:
            untransable["alt"] = alt
        transable_etyms.append(transable)
        untransable_etyms.append(untransable)
    return transable_etyms, untransable_etyms

def transform_recognition_data(word: str, data: dict):
    new_recog = dict()
    for key, value in data.items():
        new_recog[key] = int(value)
    return new_recog


def transform_to_list(word: str, data: str, splitter: str = ",") -> list:
    return [elem.strip() for elem in data.split(splitter)] if data else []


def noop(word: str, data: dict, _return_if_null: Any = ""):
    return data if data else _return_if_null


def trash(word: str, data: dict):
    return None


WORDS = nested_defaultdict()
REPRESENTATIONS = nested_defaultdict()
DEFINITIONS = nested_defaultdict()

COMMENTARY = nested_defaultdict()
SP_ETYMOLOGY = nested_defaultdict()
ETYMOLOGY = nested_defaultdict()

TRANSFORMER = "t"
DESTINATION = "d"

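# Field routing for the untranslated word data: each jasima field is run
# through a TRANSFORMER ("t") and written into a DESTINATION ("d") dict.
# Fields routed through `trash` are dropped here; several reappear in
# TRANSLATION_MAP below as per-language data instead.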
TRANSFORM_MAP = {
    "word": {TRANSFORMER: noop, DESTINATION: WORDS},
    # NOTE: this could be in `representations` but we decided against that
    "sitelen_pona": {
        TRANSFORMER: partial(transform_to_list, splitter=" "),
        DESTINATION: REPRESENTATIONS,
    },
    "ucsur": {TRANSFORMER: noop, DESTINATION: REPRESENTATIONS},
    "sitelen_pona_etymology": {TRANSFORMER: trash},  # send to translate
    "sitelen_sitelen": {TRANSFORMER: noop, DESTINATION: REPRESENTATIONS},
    "sitelen_emosi": {TRANSFORMER: noop, DESTINATION: REPRESENTATIONS},
    # "luka_pona": {TRANSFORMER: partial(noop, _return_if_null=dict()), DESTINATION: WORDS},
    "luka_pona": {TRANSFORMER: trash},  # to be replaced with totally different doc
    "audio": {TRANSFORMER: partial(noop, _return_if_null=dict()), DESTINATION: WORDS},
    "coined_year": {TRANSFORMER: noop, DESTINATION: WORDS},
    "coined_era": {TRANSFORMER: noop, DESTINATION: WORDS},
    "book": {TRANSFORMER: partial(noop, _return_if_null="none"), DESTINATION: WORDS},
    "usage_category": {
        TRANSFORMER: partial(noop, _return_if_null="obscure"),
        DESTINATION: WORDS,
    },
    "source_language": {
        TRANSFORMER: partial(noop, _return_if_null="unknown"),
        DESTINATION: WORDS,
    },
    "etymology": {TRANSFORMER: trash},
    "etymology_data": {TRANSFORMER: trash},  # to transform and send to translate
    "creator": {
        TRANSFORMER: partial(transform_to_list, splitter=","),
        DESTINATION: WORDS,
    },
    "ku_data": {TRANSFORMER: transform_ku_data, DESTINATION: WORDS},
    "recognition": {TRANSFORMER: transform_recognition_data, DESTINATION: WORDS},
    "see_also": {
        TRANSFORMER: partial(transform_to_list, splitter=","),
        DESTINATION: WORDS,
    },
    "tags": {TRANSFORMER: trash},
    "author_verbatim": {TRANSFORMER: noop, DESTINATION: WORDS},
    "author_verbatim_source": {TRANSFORMER: noop, DESTINATION: WORDS},
    "pu_verbatim": {
        TRANSFORMER: partial(noop, _return_if_null=None),
        DESTINATION: WORDS,
    },
    "commentary": {TRANSFORMER: trash},  # send to translate
    "def": {TRANSFORMER: trash},  # translate special case
}

TRANSLATION_MAP = {
    "etymology_data": {
        TRANSFORMER: transform_etym_data,
        DESTINATION: ETYMOLOGY,
    },
    "commentary": {
        TRANSFORMER: noop,
        DESTINATION: COMMENTARY,
    },
    "sitelen_pona_etymology": {
        TRANSFORMER: noop,
        DESTINATION: SP_ETYMOLOGY,
    },
}

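# Write one TOML file per language under dir/{lang}/, stamping each
# document with the JSON Schema it should validate against.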
def write_translated(
    data: dict,
    dir: str,
    filename: str,
    schema: str = "../../schema/generated/word.json",
):
    for lang, d in data.items():
        d["$schema"] = schema
        os.makedirs(f"{dir}/{lang}", exist_ok=True)
        with open(f"{dir}/{lang}/{filename}", "w") as f:
            tomlified = tomlkit.dumps(d, sort_keys=True)
            f.write(tomlified)


def main():
    os.makedirs("../translations", exist_ok=True)
    os.makedirs("../words", exist_ok=True)

    with open(JASIMA_DATA, "r") as f:
        jasima = json.loads(f.read())
        langs = jasima["languages"]
        data = jasima["data"]

    for word in data.keys():
        for field in TRANSFORM_MAP.keys():
            fetched = data[word].get(field)
            formatted = TRANSFORM_MAP[field][TRANSFORMER](word, fetched)
            if formatted is not None:
                write_to = TRANSFORM_MAP[field][DESTINATION]
                write_to[word][field] = formatted

        # if field == "ucsur":
        #     codepoint = data[word].get("ucsur")
        #     character = ""
        #     if codepoint:
        #         character = chr(int(codepoint[2:], base=16))
        #     words[word]["ucsur_codepoint"] = codepoint
        #     words[word]["ucsur_character"] = character
        #     continue

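        # Per-language pass: every language gets a definition entry (empty
        # string when jasima has none), then the remaining translatable
        # fields are routed through TRANSLATION_MAP.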
        for lang in langs.keys():
            DEFINITIONS[lang][word] = data[word]["def"].get(lang) or ""
            for field in TRANSLATION_MAP:
                fetched = data[word].get(field)
                formatted = TRANSLATION_MAP[field][TRANSFORMER](word, fetched)

                # TODO: key-aware transform

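                # transform_etym_data returns a (translatable, untranslatable)
                # pair: language/definition strings go to the per-language
                # etymology file, while the source words and alt spellings
                # are stored once on the word itself.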
                if formatted is not None:
                    write_to = TRANSLATION_MAP[field][DESTINATION]
                    if field == "etymology_data":
                        untransable = formatted[1] if formatted else []
                        formatted = formatted[0] if formatted else []
                        field = "etymology"
                        WORDS[word][field] = untransable
                    write_to[lang][word] = formatted

    # TODO: order keys freely instead of alphabetically
    # or crowdin will solve this for us
    for word, worddata in WORDS.items():
        worddata["representations"] = REPRESENTATIONS[word]
        worddata["$schema"] = "../schemas/generated/word.json"
        with open(f"../words/{word}.toml", "w") as f:
            tomlified = tomlkit.dumps(worddata, sort_keys=True)
            f.write(tomlified)

    write_translated(
        DEFINITIONS,
        "../translations",
        "definitions.toml",
        schema="../../schemas/generated/definition_translation.json",
    )
    write_translated(
        COMMENTARY,
        "../translations",
        "commentary.toml",
        schema="../../schemas/generated/commentary_translation.json",
    )
    write_translated(
        ETYMOLOGY,
        "../translations",
        "etymology.toml",
        schema="../../schemas/generated/etymology_translation.json",
    )
    write_translated(
        SP_ETYMOLOGY,
        "../translations",
        "sp_etymology.toml",
        schema="../../schemas/generated/sitelen_pona_translation.json",
    )


if __name__ == "__main__":
    main()
```
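Taken together, the pieces form a generate-then-validate loop: the script emits TOML stamped with a `$schema` path, the workflow regenerates the JSON Schemas, and `validate_toml.yml` checks the TOMLs against them. Below is a minimal sketch of such a check, assuming Python 3.11+ for `tomllib` and the third-party `jsonschema` package; the file paths and the word file name are illustrative, and this is not the repository's actual validator.

```python
import json
import tomllib  # stdlib in Python 3.11+

from jsonschema import validate  # assumes the `jsonschema` package is installed

# Load the generated schema and one generated word file (paths illustrative).
with open("schemas/generated/word.json") as f:
    schema = json.load(f)

with open("words/ale.toml", "rb") as f:  # tomllib requires binary mode
    word = tomllib.load(f)

# Raises jsonschema.ValidationError if the TOML drifts from the schema.
validate(instance=word, schema=schema)
```

The repository's real check lives in `validate_toml.yml`, which this page does not show.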