Merge pull request #1 from lipu-linku/feature/split
Finalize schema
gregdan3 authored Dec 5, 2023
2 parents 0d0b8be + 4be788b commit 6a4411f
Showing 515 changed files with 98,114 additions and 139,363 deletions.
53 changes: 53 additions & 0 deletions .github/workflows/generate_schema.yml
@@ -0,0 +1,53 @@
name: Generate JSON Schemas

on:
  push:
    branches: ["main"]
  pull_request:
    branches: ["main"]
  workflow_dispatch:

permissions:
  contents: write

jobs:
  generate:
    runs-on: ubuntu-latest
    name: Generate JSON schemas from Zod schemas
    defaults:
      run:
        working-directory: ./schemas
    outputs:
      schema_changed: ${{ steps.commit.outputs.committed }}
      commit_sha: ${{ steps.commit.outputs.commit_long_sha }}
    steps:
      - name: Checkout
        uses: actions/checkout@v3
        with:
          repository: ${{ github.event.pull_request.head.repo.full_name }}
          ref: ${{ github.event.pull_request.head.ref }}

      - name: Setup pnpm
        uses: pnpm/action-setup@v2
        with:
          version: 8
          run_install: |
            - args: [--frozen-lockfile]
              cwd: ./schemas

      - name: Generate JSON Schemas
        run: pnpm run generate:schemas

      - name: Commit schemas
        id: commit
        uses: EndBug/add-and-commit@v9
        with:
          message: "Generated schemas for ${{ github.event.pull_request.head.sha || github.event.head_commit.id || vars.GITHUB_SHA }}"

  validate:
    name: Validate TOMLs based on schema
    needs: generate
    uses: ./.github/workflows/validate_toml.yml
    with:
      commit_sha: ${{ needs.generate.outputs.commit_sha }}
    if: ${{ needs.generate.outputs.schema_changed == 'true' }}
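Note: `committed` and `commit_long_sha` are outputs exposed by the EndBug/add-and-commit action, so the downstream `validate` job runs only when the regenerated schemas actually changed, and it validates against the exact commit that contains them.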
31 changes: 24 additions & 7 deletions .github/workflows/validate_toml.yml
@@ -2,23 +2,40 @@ name: Validate TOML Files

on:
  push:
    branches: ["main"]
  pull_request:
    branches: ["main"]
    paths:
      - "**.toml"
  workflow_call:
    inputs:
      commit_sha:
        type: string
        required: true
  workflow_dispatch:

permissions:
  contents: read

jobs:
  validate:
    env:
      BranchRef: ${{ inputs.commit_sha || github.sha }}
    runs-on: ubuntu-latest
    steps:
      - name: Checkout
        uses: actions/checkout@v3
        with:
          ref: ${{ env.BranchRef }}

      - name: Validate translations
        run: npx --yes @taplo/cli check --schema https://raw.githubusercontent.com/lipu-linku/sona/${{vars.GITHUB_SHA || 'main'}}/schemas/translation.schema.json ./translations/*.toml
      - name: Validate definition files
        run: npx --yes @taplo/cli check --schema https://raw.githubusercontent.com/lipu-linku/sona/$BranchRef/schemas/generated/definition_translation.json ./translations/**/definitions.toml

      - name: Validate words.toml
        run: npx --yes @taplo/cli check --schema https://raw.githubusercontent.com/lipu-linku/sona/${{vars.GITHUB_SHA || 'main'}}/schemas/words.schema.json ./words.toml
      - name: Validate commentary files
        run: npx --yes @taplo/cli check --schema https://raw.githubusercontent.com/lipu-linku/sona/$BranchRef/schemas/generated/commentary_translation.json ./translations/**/commentary.toml

      - name: Validate etymology files
        run: npx --yes @taplo/cli check --schema https://raw.githubusercontent.com/lipu-linku/sona/$BranchRef/schemas/generated/etymology_translation.json ./translations/**/etymology.toml

      - name: Validate sitelen pona files
        run: npx --yes @taplo/cli check --schema https://raw.githubusercontent.com/lipu-linku/sona/$BranchRef/schemas/generated/sitelen_pona_translation.json ./translations/**/sp_etymology.toml

      - name: Validate word files
        run: npx --yes @taplo/cli check --schema https://raw.githubusercontent.com/lipu-linku/sona/$BranchRef/schemas/generated/word.json ./words/*.toml
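Each step invokes taplo's `check --schema`, which validates every file matched by the glob against the referenced JSON Schema; the `./translations/**/` patterns cover the per-language subdirectories written by the converter below.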
3 changes: 3 additions & 0 deletions .scripts/.gitignore
@@ -0,0 +1,3 @@
*.txt
*.json
__pycache__
18 changes: 18 additions & 0 deletions .scripts/Makefile
@@ -0,0 +1,18 @@
PROJDIR=.
INDIR=.
OUTDIR=.

init: ${INDIR}/nimi_pu.txt ${INDIR}/nimi_pi_pu_ala.txt ${INDIR}/compounds.txt ${INDIR}/data.json
process: ${OUTDIR}/nimi.json

${INDIR}/nimi_pu.txt:
	curl -s https://tokipona.org/nimi_pu.txt > ${INDIR}/nimi_pu.txt
${INDIR}/nimi_pi_pu_ala.txt:
	curl -s https://tokipona.org/nimi_pi_pu_ala.txt > ${INDIR}/nimi_pi_pu_ala.txt
${INDIR}/compounds.txt:
	curl -s https://tokipona.org/compounds.txt > ${INDIR}/compounds.txt
${INDIR}/data.json:
	curl -s https://linku.la/jasima/data.json > ${INDIR}/data.json

${OUTDIR}/nimi.json: ${PROJDIR}/jsonify_nimi.py ${INDIR}/nimi_pu.txt ${INDIR}/nimi_pi_pu_ala.txt
	python ${PROJDIR}/jsonify_nimi.py | jq > ${OUTDIR}/nimi.json
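`make init` downloads the upstream word lists and the jasima `data.json` snapshot; `make process` then runs `jsonify_nimi.py` over the text files and pretty-prints the result with `jq` into `nimi.json`. The same `jsonify_nimi` module is imported directly by `converter.py` to build its `TXT_DATA` lookup.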
241 changes: 241 additions & 0 deletions .scripts/converter.py
@@ -0,0 +1,241 @@
from collections import defaultdict
import os
import json
from typing import Any
import tomlkit
from functools import partial

from jsonify_nimi import jsonify_nimi

TXT_DATA = jsonify_nimi()


JASIMA_DATA = "data.json"


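# Factory for a two-level defaultdict: d[outer][inner] materializes empty
# dicts on first access, so transformers can assign nested fields without
# pre-creating intermediate tables.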
def nested_defaultdict():
    return defaultdict(partial(defaultdict, dict))


def transform_ku_data(word: str, data: dict):
    return TXT_DATA.get(word) or None


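# jasima stores etymology as parallel ";"-separated strings (langs, defs,
# words, alts). Split and zip them, separating the translatable parts
# (language, definition) from the untranslatable ones (word, alt).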
def transform_etym_data(word: str, data: dict):
    if not data:
        return []

    transable_etyms = []
    untransable_etyms = []

    langs = data.get("langs", "").split(";")
    defs = data.get("defs", "").split(";")
    words = data.get("words", "").split(";")
    alts = data.get("alts", "").split(";")

    for lang, _def, word, alt in zip(langs, defs, words, alts):
        transable = dict()
        untransable = dict()
        if lang:
            transable["language"] = lang
        if _def:
            transable["definition"] = _def
        if word:
            untransable["word"] = word
        if alt:
            untransable["alt"] = alt
        transable_etyms.append(transable)
        untransable_etyms.append(untransable)
    return transable_etyms, untransable_etyms


def transform_recognition_data(word: str, data: dict):
    new_recog = dict()
    for key, value in data.items():
        new_recog[key] = int(value)
    return new_recog


def transform_to_list(word: str, data: str, splitter: str = ",") -> list:
    return [elem.strip() for elem in data.split(splitter)] if data else []


def noop(word: str, data: dict, _return_if_null: Any = ""):
    return data if data else _return_if_null


def trash(word: str, data: dict):
    return None


WORDS = nested_defaultdict()
REPRESENTATIONS = nested_defaultdict()
DEFINITIONS = nested_defaultdict()

COMMENTARY = nested_defaultdict()
SP_ETYMOLOGY = nested_defaultdict()
ETYMOLOGY = nested_defaultdict()

TRANSFORMER = "t"
DESTINATION = "d"


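# Per-field routing for word-level data: each jasima field is run through a
# transformer and written into the named destination table; trash() drops
# fields that are handled by the translation pipeline or intentionally omitted.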
TRANSFORM_MAP = {
    "word": {TRANSFORMER: noop, DESTINATION: WORDS},
    # NOTE: this could be in `representations` but we decided against that
    "sitelen_pona": {
        TRANSFORMER: partial(transform_to_list, splitter=" "),
        DESTINATION: REPRESENTATIONS,
    },
    "ucsur": {TRANSFORMER: noop, DESTINATION: REPRESENTATIONS},
    "sitelen_pona_etymology": {TRANSFORMER: trash},  # send to translate
    "sitelen_sitelen": {TRANSFORMER: noop, DESTINATION: REPRESENTATIONS},
    "sitelen_emosi": {TRANSFORMER: noop, DESTINATION: REPRESENTATIONS},
    # "luka_pona": {TRANSFORMER: partial(noop, _return_if_null=dict()), DESTINATION: WORDS},
    "luka_pona": {TRANSFORMER: trash},  # to be replaced with totally different doc
    "audio": {TRANSFORMER: partial(noop, _return_if_null=dict()), DESTINATION: WORDS},
    "coined_year": {TRANSFORMER: noop, DESTINATION: WORDS},
    "coined_era": {TRANSFORMER: noop, DESTINATION: WORDS},
    "book": {TRANSFORMER: partial(noop, _return_if_null="none"), DESTINATION: WORDS},
    "usage_category": {
        TRANSFORMER: partial(noop, _return_if_null="obscure"),
        DESTINATION: WORDS,
    },
    "source_language": {
        TRANSFORMER: partial(noop, _return_if_null="unknown"),
        DESTINATION: WORDS,
    },
    "etymology": {TRANSFORMER: trash},
    "etymology_data": {TRANSFORMER: trash},  # to transform and send to translate
    "creator": {
        TRANSFORMER: partial(transform_to_list, splitter=","),
        DESTINATION: WORDS,
    },
    "ku_data": {TRANSFORMER: transform_ku_data, DESTINATION: WORDS},
    "recognition": {TRANSFORMER: transform_recognition_data, DESTINATION: WORDS},
    "see_also": {
        TRANSFORMER: partial(transform_to_list, splitter=","),
        DESTINATION: WORDS,
    },
    "tags": {TRANSFORMER: trash},
    "author_verbatim": {TRANSFORMER: noop, DESTINATION: WORDS},
    "author_verbatim_source": {TRANSFORMER: noop, DESTINATION: WORDS},
    "pu_verbatim": {
        TRANSFORMER: partial(noop, _return_if_null=None),
        DESTINATION: WORDS,
    },
    "commentary": {TRANSFORMER: trash},  # send to translate
    "def": {TRANSFORMER: trash},  # translate special case
}

TRANSLATION_MAP = {
    "etymology_data": {
        TRANSFORMER: transform_etym_data,
        DESTINATION: ETYMOLOGY,
    },
    "commentary": {
        TRANSFORMER: noop,
        DESTINATION: COMMENTARY,
    },
    "sitelen_pona_etymology": {
        TRANSFORMER: noop,
        DESTINATION: SP_ETYMOLOGY,
    },
}


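# Writes one TOML file per language under <dir>/<lang>/<filename>, embedding
# the relative schema path so the files can be validated in place.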
def write_translated(
    data: dict,
    dir: str,
    filename: str,
    schema: str = "../../schemas/generated/word.json",
):
    for lang, d in data.items():
        d["$schema"] = schema
        os.makedirs(f"{dir}/{lang}", exist_ok=True)
        with open(f"{dir}/{lang}/{filename}", "w") as f:
            tomlified = tomlkit.dumps(d, sort_keys=True)
            f.write(tomlified)


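# Main flow: route every field of every word through TRANSFORM_MAP into the
# word/representation tables, then, per language, collect definitions and the
# translatable tables (commentary, etymology, sitelen pona etymology), and
# finally write words/*.toml plus the per-language translation files.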
def main():
    os.makedirs("../translations", exist_ok=True)
    os.makedirs("../words", exist_ok=True)

    with open(JASIMA_DATA, "r") as f:
        jasima = json.loads(f.read())
    langs = jasima["languages"]
    data = jasima["data"]

    for word in data.keys():
        for field in TRANSFORM_MAP.keys():
            fetched = data[word].get(field)
            formatted = TRANSFORM_MAP[field][TRANSFORMER](word, fetched)
            if formatted is not None:
                write_to = TRANSFORM_MAP[field][DESTINATION]
                write_to[word][field] = formatted

            # if field == "ucsur":
            #     codepoint = data[word].get("ucsur")
            #     character = ""
            #     if codepoint:
            #         character = chr(int(codepoint[2:], base=16))
            #     words[word]["ucsur_codepoint"] = codepoint
            #     words[word]["ucsur_character"] = character
            #     continue

        for lang in langs.keys():
            DEFINITIONS[lang][word] = data[word]["def"].get(lang) or ""
            for field in TRANSLATION_MAP:
                fetched = data[word].get(field)
                formatted = TRANSLATION_MAP[field][TRANSFORMER](word, fetched)

                # TODO: key-aware transform

                if formatted is not None:
                    write_to = TRANSLATION_MAP[field][DESTINATION]
                    if field == "etymology_data":
                        untransable = formatted[1] if formatted else []
                        formatted = formatted[0] if formatted else []
                        field = "etymology"
                        WORDS[word][field] = untransable
                    write_to[lang][word] = formatted

    # TODO: order keys freely instead of alphabetically
    # or crowdin will solve this for us
    for word, worddata in WORDS.items():
        worddata["representations"] = REPRESENTATIONS[word]
        worddata["$schema"] = "../schemas/generated/word.json"
        with open(f"../words/{word}.toml", "w") as f:
            tomlified = tomlkit.dumps(worddata, sort_keys=True)
            f.write(tomlified)

    write_translated(
        DEFINITIONS,
        "../translations",
        "definitions.toml",
        schema="../../schemas/generated/definition_translation.json",
    )
    write_translated(
        COMMENTARY,
        "../translations",
        "commentary.toml",
        schema="../../schemas/generated/commentary_translation.json",
    )
    write_translated(
        ETYMOLOGY,
        "../translations",
        "etymology.toml",
        schema="../../schemas/generated/etymology_translation.json",
    )
    write_translated(
        SP_ETYMOLOGY,
        "../translations",
        "sp_etymology.toml",
        schema="../../schemas/generated/sitelen_pona_translation.json",
    )


if __name__ == "__main__":
    main()
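For local spot-checking outside CI, the generated TOML files can be validated against the same generated JSON Schemas that the taplo workflow steps use. A minimal sketch, assuming Python 3.11+ (for the stdlib tomllib module) and the third-party jsonschema package, neither of which is a dependency declared by this commit; the file paths are illustrative:

# Mirrors the CI taplo check: validate one generated word file against the
# generated JSON Schema.
import json
import tomllib

from jsonschema import validate

with open("words/ale.toml", "rb") as f:
    word_data = tomllib.load(f)

with open("schemas/generated/word.json", "r") as f:
    word_schema = json.load(f)

# The inline "$schema" key is editor metadata, not word data, so drop it
# before checking; validate() raises ValidationError on a mismatch.
word_data.pop("$schema", None)
validate(instance=word_data, schema=word_schema)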
