refactor: massive refactor to pydantic
roedoejet committed Aug 31, 2023
1 parent b413089 commit 84abb83
Showing 26 changed files with 965 additions and 938 deletions.
9 changes: 5 additions & 4 deletions g2p/__init__.py
@@ -31,9 +31,9 @@
from g2p.mappings import Mapping
from g2p.mappings.langs import LANGS, LANGS_NETWORK
from g2p.mappings.tokenizer import Tokenizer, make_tokenizer
from g2p.mappings.utils import _MappingModelDefinition
from g2p.transducer import CompositeTransducer, TokenizingTransducer, Transducer


_g2p_cache: Dict[
Tuple[str, str, Optional[str], bool, int],
Union[Transducer, CompositeTransducer, TokenizingTransducer],
@@ -181,10 +181,11 @@ def get_arpabet_langs():
for _, v in LANGS.items():
for mapping in v["mappings"]:
# add mapping to names hash table
full_lang_names[mapping["in_lang"]] = mapping["language_name"]
config: _MappingModelDefinition = mapping
full_lang_names[config.in_lang] = config.language_name
# add input id to all available langs list
if mapping["in_lang"] not in langs_available:
langs_available.append(mapping["in_lang"])
if config.in_lang not in langs_available:
langs_available.append(config.in_lang)

# get the key from all networks in g2p module that have a path to 'eng-arpabet',
# which is needed for the readalongs
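The change above assumes each entry in v["mappings"] now arrives as a _MappingModelDefinition instance, so configuration is read through attributes instead of dict keys. A minimal, self-contained sketch of that pattern; the model, its fields, and the sample data below are illustrative stand-ins, not the real class from g2p.mappings.utils:

from typing import Dict, List, Optional

from pydantic import BaseModel


class MappingDefinitionSketch(BaseModel):
    # Hypothetical subset of fields; the real model is _MappingModelDefinition.
    in_lang: str
    out_lang: str
    language_name: Optional[str] = None


defs: List[MappingDefinitionSketch] = [
    MappingDefinitionSketch(in_lang="atj", out_lang="atj-ipa", language_name="Atikamekw"),
    MappingDefinitionSketch(in_lang="atj-ipa", out_lang="eng-ipa", language_name="Atikamekw"),
]

full_lang_names: Dict[str, Optional[str]] = {}
langs_available: List[str] = []
for config in defs:
    # Attribute access replaces the old config["in_lang"] / config["language_name"] lookups.
    full_lang_names[config.in_lang] = config.language_name
    if config.in_lang not in langs_available:
        langs_available.append(config.in_lang)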
16 changes: 5 additions & 11 deletions g2p/api.py
@@ -34,17 +34,11 @@ def get(self, node):

class Langs(Resource):
def __init__(self):
self.AVAILABLE_MAPPINGS = sorted(
[
{
k: v
for k, v in x.items()
if k not in ["mapping_data", "abbreviations_data"]
}
for x in MAPPINGS_AVAILABLE
],
key=lambda x: x["in_lang"],
)
# TODO: exclude parent dir and maybe null values too
self.AVAILABLE_MAPPINGS = [
json.loads(mapping.json())
for mapping in sorted(MAPPINGS_AVAILABLE, key=lambda x: x.in_lang)
]
self.parser = reqparse.RequestParser()
self.parser.add_argument(
"verbose",
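The json.loads(mapping.json()) round trip above is the usual pydantic v1-style idiom for getting a plain, fully JSON-serializable dict out of a model: enum members and similar rich values come back as JSON scalars, which .dict() alone does not guarantee. A hedged sketch with a made-up model, not the repo's own classes:

import json
from enum import Enum

from pydantic import BaseModel


class RuleOrdering(str, Enum):
    as_written = "as-written"
    apply_longest_first = "apply-longest-first"


class MappingSketch(BaseModel):
    # Hypothetical fields standing in for the real mapping configuration.
    in_lang: str
    out_lang: str
    rule_ordering: RuleOrdering = RuleOrdering.as_written


m = MappingSketch(in_lang="fra", out_lang="fra-ipa")

plain = json.loads(m.json())
# {'in_lang': 'fra', 'out_lang': 'fra-ipa', 'rule_ordering': 'as-written'}

as_dict = m.dict()
# .dict() keeps the RuleOrdering enum member, which a strict JSON consumer may not accept.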
48 changes: 24 additions & 24 deletions g2p/app.py
@@ -3,7 +3,8 @@
Views and config to the g2p Studio web app
"""
from typing import Union
import json
from typing import List, Union

from flask import Flask, render_template
from flask_cors import CORS
@@ -14,7 +15,11 @@
from g2p.api import g2p_api
from g2p.mappings import Mapping
from g2p.mappings.langs import LANGS_NETWORK
from g2p.mappings.utils import expand_abbreviations_format, flatten_abbreviations_format
from g2p.mappings.utils import (
_MappingModelDefinition,
expand_abbreviations_format,
flatten_abbreviations_format,
)
from g2p.transducer import (
CompositeTransducer,
CompositeTransductionGraph,
@@ -133,13 +138,13 @@ def docs():
def convert(message):
"""Convert input text and return output"""
transducers = []

for mapping in message["data"]["mappings"]:
mappings_obj = Mapping(
mapping["mapping"],
abbreviations=flatten_abbreviations_format(mapping["abbreviations"]),
**mapping["kwargs"],
mapping_args = {**mapping["kwargs"]}
mapping_args["abbreviations"] = flatten_abbreviations_format(
mapping["abbreviations"]
)
mapping_args["mapping"] = mapping["mapping"]
mappings_obj = Mapping(**mapping_args)
transducer = Transducer(mappings_obj)
transducers.append(transducer)
if len(transducers) == 0:
@@ -174,21 +179,16 @@ def change_table(message):
{"in": "", "out": "", "context_before": "", "context_after": ""}
] * DEFAULT_N
abbs = [[""] * 6] * DEFAULT_N
kwargs = {
"language_name": "Custom",
"display_name": "Custom",
"in_lang": "custom",
"out_lang": "custom",
"include": False,
"type": "mapping",
"case_sensitive": True,
"norm_form": "NFC",
"escape_special": False,
"prevent_feeding": False,
"reverse": False,
"rule_ordering": "as-written",
"out_delimiter": "",
}

kwargs = _MappingModelDefinition(
language_name="Custom",
display_name="Custom",
in_lang="custom",
out_lang="custom",
type="mapping",
norm_form="NFC",
).dict()
kwargs["include"] = False
emit(
"table response",
[
@@ -201,7 +201,7 @@
)
else:
path = shortest_path(LANGS_NETWORK, message["in_lang"], message["out_lang"])
mappings = []
mappings: List[Mapping] = []
for lang1, lang2 in zip(path[:-1], path[1:]):
transducer = make_g2p(lang1, lang2, tokenize=False)
mappings.append(transducer.mapping)
@@ -211,7 +211,7 @@
{
"mappings": x.plain_mapping(),
"abbs": expand_abbreviations_format(x.abbreviations),
"kwargs": x.kwargs,
"kwargs": json.loads(x.mapping_config.json()),
}
for x in mappings
],
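The convert() handler above repacks the client payload into keyword arguments before constructing a Mapping. A minimal sketch of that repacking, assuming the payload carries the "mapping", "abbreviations", and "kwargs" keys shown in the diff; the helper name is hypothetical, while the imports mirror the ones added to app.py:

from g2p.mappings import Mapping
from g2p.mappings.utils import flatten_abbreviations_format


def build_mapping_from_payload(payload: dict) -> Mapping:
    # Start from the client-supplied settings (in_lang, out_lang, etc.).
    mapping_args = {**payload["kwargs"]}
    # Abbreviation tables arrive in the studio's tabular format and are flattened first.
    mapping_args["abbreviations"] = flatten_abbreviations_format(payload["abbreviations"])
    # The rules themselves are passed through under the "mapping" keyword.
    mapping_args["mapping"] = payload["mapping"]
    return Mapping(**mapping_args)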
45 changes: 24 additions & 21 deletions g2p/cli.py
@@ -7,6 +7,7 @@
import pprint
import re
import sys
from typing import List, Tuple

import click
import yaml
@@ -118,9 +119,9 @@ def parse_from_or_to_lang_spec(lang_spec):
out_lang = in_lang + "-ipa"
# check_ipa_known_segs([out_lang]) # this outputs a lot of spurious noise...
mappings = [
(Mapping(in_lang=m["in_lang"], out_lang=m["out_lang"]), "out")
(Mapping(in_lang=m.in_lang, out_lang=m.out_lang), "out")
for m in MAPPINGS_AVAILABLE
if m["out_lang"] == out_lang and not is_ipa(m["in_lang"])
if m.out_lang == out_lang and not is_ipa(m.in_lang)
]
if not mappings:
raise click.BadParameter(f'No IPA mappings found for "{lang_spec}".')
@@ -399,10 +400,10 @@ def generate_mapping( # noqa: C901
# --from/--to mode
assert to_langs is not None

from_mappings = []
from_mappings: List[Tuple[Mapping, str]] = []
for from_lang in re.split(r"[:,]", from_langs):
from_mappings.extend(parse_from_or_to_lang_spec(from_lang))
to_mappings = []
to_mappings: List[Tuple[Mapping, str]] = []
for to_lang in re.split(r"[:,]", to_langs):
to_mappings.extend(parse_from_or_to_lang_spec(to_lang))

@@ -417,11 +418,11 @@

for from_mapping, in_or_out in from_mappings:
LOGGER.info(
f'From mapping: {from_mapping.kwargs["in_lang"]}_to_{from_mapping.kwargs["out_lang"]}[{in_or_out}]'
f"From mapping: {from_mapping.mapping_config.in_lang}_to_{from_mapping.mapping_config.out_lang}[{in_or_out}]"
)
for to_mapping, in_or_out in to_mappings:
LOGGER.info(
f'To mapping: {to_mapping.kwargs["in_lang"]}_to_{to_mapping.kwargs["out_lang"]}[{in_or_out}]'
f"To mapping: {to_mapping.mapping_config.in_lang}_to_{to_mapping.mapping_config.out_lang}[{in_or_out}]"
)

new_mapping = create_multi_mapping(
@@ -529,7 +530,7 @@ def convert( # noqa: C901
else:
mapping = load_mapping_from_path(config)
data["mappings"] = [mapping]
mappings_legal_pairs.append((mapping["in_lang"], mapping["out_lang"]))
mappings_legal_pairs.append((mapping.in_lang, mapping.out_lang))
for pair in mappings_legal_pairs:
if pair[0] in LANGS_NETWORK.nodes:
LOGGER.warning(
@@ -604,7 +605,7 @@ def doctor(mapping, list_all, list_ipa):
http://g2p-studio.herokuapp.com/api/v1/langs .
"""
if list_all or list_ipa:
out_langs = sorted({x["out_lang"] for x in MAPPINGS_AVAILABLE})
out_langs = sorted({x.out_lang for x in MAPPINGS_AVAILABLE})
if list_ipa:
out_langs = [x for x in out_langs if is_ipa(x)]
LOGGER.info(
@@ -614,16 +615,14 @@
print(f"{m}: ", end="")
print(
("\n" + " " * len(m) + " ").join(
sorted(
x["in_lang"] for x in MAPPINGS_AVAILABLE if x["out_lang"] == m
)
sorted(x.in_lang for x in MAPPINGS_AVAILABLE if x.out_lang == m)
)
)
print("")
return

for m in mapping:
if m not in [x["out_lang"] for x in MAPPINGS_AVAILABLE]:
if m not in [x.out_lang for x in MAPPINGS_AVAILABLE]:
raise click.UsageError(
f"No known mappings into '{m}'. "
"Use --list-all or --list-ipa to list valid options."
@@ -695,16 +694,16 @@ def scan(lang, path):
case_sensitive = True
mappings = []
for mapping in MAPPINGS_AVAILABLE:
mapping_name = mapping["in_lang"]
mapping_name = mapping.in_lang
# Exclude mappings for converting between IPAs
if mapping_name.startswith(lang) and "ipa" not in mapping_name:
case_sensitive = case_sensitive and mapping.get("case_sensitive", True)
case_sensitive = case_sensitive and mapping.case_sensitive
mappings.append(mapping)

# Get input chars in mapping
mapped_chars = set()
for lang_mapping in mappings:
for x in lang_mapping["mapping_data"]:
for x in lang_mapping.mapping:
mapped_chars.add(normalize(x["in"], "NFD"))
# Find unmapped chars
filter_chars = " \n"
@@ -755,9 +754,9 @@ def show_mappings(lang1, lang2, verbose, csv):

elif lang1 is not None:
mappings = [
Mapping(in_lang=m["in_lang"], out_lang=m["out_lang"])
Mapping(in_lang=m.in_lang, out_lang=m.out_lang)
for m in MAPPINGS_AVAILABLE
if m["in_lang"] == lang1 or m["out_lang"] == lang1
if m.in_lang == lang1 or m.out_lang == lang1
]
if not mappings:
raise click.BadParameter(
@@ -766,18 +765,22 @@

else:
mappings = (
Mapping(in_lang=m["in_lang"], out_lang=m["out_lang"])
for m in MAPPINGS_AVAILABLE
Mapping(in_lang=m.in_lang, out_lang=m.out_lang) for m in MAPPINGS_AVAILABLE
)

file_type = "csv" if csv else "json"
if verbose:
for m in mappings:
json.dump(m.kwargs, sys.stdout, indent=4, ensure_ascii=False)
json.dump(
json.loads(m.mapping_config.json()),
sys.stdout,
indent=4,
ensure_ascii=False,
)
print()
m.mapping_to_stream(sys.stdout, file_type=file_type)
print()
print()
else:
for i, m in enumerate(mappings):
print(f"{i+1}: {m.kwargs['in_lang']}{m.kwargs['out_lang']}")
print(f"{i+1}: {m.mapping_config.in_lang}{m.mapping_config.out_lang}")