-
Notifications
You must be signed in to change notification settings - Fork 18
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #163 from googlefonts/move-lang-sample-text
Move sample text updater from gftools to here
- Loading branch information
Showing
2 changed files
with
317 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,252 @@ | ||
from gflanguages import languages_public_pb2 | ||
import enum | ||
import re | ||
|
||
|
||
class Udhr:
    """Holds one UDHR translation: its metadata plus the text parsed from it.

    The constructor arguments are stored unchanged as attributes of the same
    name. `title`, `preamble` and `articles` start out empty and are filled
    in by `Parse` or `LoadArticleOne`.
    """

    def __init__(
        self, key, iso639_3, iso15924, bcp47, direction, ohchr, stage, loc, name
    ):
        # Translation metadata, kept exactly as supplied by the caller.
        self.key = key
        self.iso639_3 = iso639_3
        self.iso15924 = iso15924
        self.bcp47 = bcp47
        self.direction = direction
        self.ohchr = ohchr
        self.stage = stage
        self.loc = loc
        self.name = name

        # Parsed text; populated later.
        self.title = None
        self.preamble = None
        self.articles = []

    def Parse(self, translation_data):
        """Fill in title, preamble and articles from a parsed XML document.

        Nothing is read when `translation_data` is None or when `stage` is
        below 2. Element lookups use the `{*}` namespace wildcard and only
        consider direct children.

        Args:
            translation_data: Object offering `find`/`findall` (an XML element
                or tree).

        Raises:
            TypeError or ValueError: if an article's "number" attribute is
                missing or is not an integer literal.
        """
        if translation_data is None or self.stage < 2:
            return

        title_node = translation_data.find("./{*}title")
        if title_node is not None:
            self.title = title_node.text

        preamble_node = translation_data.find("./{*}preamble")
        if preamble_node is not None:
            preamble_title = preamble_node.find("./{*}title")
            # A preamble without a title is ignored.
            if preamble_title is not None:
                self.preamble = {
                    "title": preamble_title.text,
                    "content": [
                        node.text for node in preamble_node.findall("./{*}para")
                    ],
                }

        for article_node in translation_data.findall("./{*}article"):
            heading = article_node.find("./{*}title")
            self.articles.append(
                {
                    "id": int(article_node.get("number")),
                    "title": heading.text if heading is not None else None,
                    "content": [
                        node.text for node in article_node.findall("./{*}para")
                    ],
                }
            )

    def LoadArticleOne(self, article_one):
        """Append `article_one` as a single-paragraph, untitled article with id 0."""
        self.articles.append({"id": 0, "title": None, "content": [article_one]})

    def GetSampleTexts(self):
        """Return the sample texts a SampleTextExtractor builds from this object."""
        return SampleTextExtractor(self).GetSampleTexts()
|
||
|
||
class SampleTextExtractor:
    """Builds sample strings of requested kinds and sizes from a parsed Udhr.

    Glyphs, words and paragraphs are each served from an iterator created
    once in the constructor, so successive requests continue where the
    previous one stopped. Phrases already handed out are remembered and not
    returned twice.
    """

    class TextType(enum.Enum):
        GLYPHS = 1
        WORD = 2
        PHRASE = 3
        SENTENCE = 4
        PARAGRAPH = 5
        PASSAGE = 6

    def __init__(self, udhr):
        """Prepare the shared iterators over the text of `udhr`.

        Args:
            udhr: Object exposing `articles` (list of dicts with a "content"
                list of paragraph strings), `preamble` (None or a dict with a
                "content" list) and `key`.
        """
        # Patterns are compiled before the generators that use them are made.
        self._non_word_regex = re.compile(r"[^\w]+")
        self._space_regex = re.compile(r"\s+")
        self._non_space_regex = re.compile(r"[^\s]+")
        self._non_word_space_regex = re.compile(r"[^\w\s]+")
        self._any_regex = re.compile(r".")

        self._udhr = udhr
        self._glyphs = iter(self._GetGlyphs())
        self._words = iter(self._GetWords())
        self._paragraphs = iter(self._GetParagraphs())
        self._phrase_history = set()

    def _DisplayLength(self, s):
        """Returns length of given string. Omits combining characters.

        Only characters matching `\\w` or `\\s` are counted. When that count
        equals the number of whitespace characters alone (no `\\w` match at
        all, as happens for some entire scripts), the raw length of the string
        is returned instead.
        """
        word_space_length = len(self._non_word_space_regex.sub("", s))
        space_length = len(self._non_space_regex.sub("", s))
        if word_space_length == space_length:
            return len(s)
        return word_space_length

    def _GetGlyphs(self):
        """Yield each distinct lower-cased character of the article text once."""
        seen = set()
        for article in self._udhr.articles:
            for para in article["content"]:
                # Paragraph elements without text come through as None.
                if para is None:
                    continue
                # Prefer word characters only; if that leaves nothing, fall
                # back to everything except whitespace.
                chars = self._non_word_regex.sub(
                    "", para
                ) or self._space_regex.sub("", para)
                for ch in chars:
                    ch = ch.lower()
                    if ch not in seen:
                        seen.add(ch)
                        yield ch

    def _GetWords(self):
        """Yield each distinct word of the article text once.

        Words are split on whitespace when the first available paragraph
        contains whitespace, otherwise on runs of non-word characters. Nothing
        is yielded when the articles hold no text.
        """
        paragraphs = [
            para
            for article in self._udhr.articles
            for para in article["content"]
            if para is not None
        ]
        if not paragraphs:
            return

        if self._space_regex.search(paragraphs[0]) is not None:
            splitter = self._space_regex
        else:
            splitter = self._non_word_regex

        seen = set()
        for para in paragraphs:
            for word in splitter.split(para):
                if word not in seen:
                    seen.add(word)
                    yield word

    def _GetParagraphs(self):
        """Yield the preamble paragraphs (if any), then every article paragraph."""
        if self._udhr.preamble is not None:
            for para in self._udhr.preamble["content"]:
                yield para
        for article in self._udhr.articles:
            for para in article["content"]:
                yield para

    def _ExtractGlyphs(self, min_chars, max_chars):
        """Return glyphs as upper-case/lower-case pairs, at least `min_chars` long.

        Consumes the shared glyph iterator. `max_chars` is accepted for
        signature symmetry with the other extractors and is not enforced. The
        result is shorter than `min_chars` when the glyphs run out.
        """
        s = ""
        for ch in self._glyphs:
            s += ch.upper()
            if len(s) >= min_chars:
                break
            # Add the lower-case form only when it differs from the upper one.
            if ch != ch.upper():
                s += ch
                if len(s) >= min_chars:
                    break
        return s

    def _ExtractWord(self, min_chars, max_chars):
        """Return the next word whose display length lies in the given range."""
        for iterator in (self._words, self._GetWords()):
            for w in iterator:
                if w is None:
                    continue
                if min_chars <= self._DisplayLength(w) <= max_chars:
                    return w
        # Fallback to using multiple words for languages with very small words
        return self._ExtractPhrase(min_chars, max_chars)

    def _LongestFittingPrefix(self, para, regex, min_chars, max_chars):
        """Return the longest prefix of `para` ending at a `regex` match that fits.

        Break points are the starts of `regex` matches at or after index
        `min_chars`. The longest prefix whose display length does not exceed
        `max_chars` is chosen. None is returned when no break point fits or
        when the chosen prefix is shorter than `min_chars`.
        """
        cut_points = [m.start() for m in regex.finditer(para, min_chars)]
        for end in reversed(cut_points):
            candidate = para[:end]
            size = self._DisplayLength(candidate)
            if size <= max_chars:
                return candidate if min_chars <= size else None
        return None

    def _ExtractPhrase(self, min_chars, max_chars):
        """Return a not-yet-used paragraph prefix within the requested size range.

        Each paragraph is tried with three kinds of break point in turn: any
        character, whitespace, then non-word characters. Falls back to
        `_ExtractParagraph` when no paragraph yields a fresh phrase.
        """
        break_patterns = (self._any_regex, self._space_regex, self._non_word_regex)
        for iterator in (self._paragraphs, self._GetParagraphs()):
            for para in iterator:
                if para is None:
                    continue
                for regex in break_patterns:
                    phrase = self._LongestFittingPrefix(
                        para, regex, min_chars, max_chars
                    )
                    if phrase is not None and phrase not in self._phrase_history:
                        self._phrase_history.add(phrase)
                        return phrase
        return self._ExtractParagraph(min_chars, max_chars)

    def _ExtractSentence(self, min_chars, max_chars):
        """Return a sentence-sized sample; delegates to `_ExtractPhrase`."""
        # Sentence delimitation may differ between scripts, so tokenizing on
        # spaces would be unreliable. Prefer to use _ExtractPhrase.
        return self._ExtractPhrase(min_chars, max_chars)

    def _ExtractParagraph(self, min_chars, max_chars):
        """Return the next whole paragraph whose display length is in range."""
        for iterator in (self._paragraphs, self._GetParagraphs()):
            for para in iterator:
                if para is None:
                    continue
                if min_chars <= self._DisplayLength(para) <= max_chars:
                    return para
        # Paragraphs likely insufficient length; try combining into passages
        return self._ExtractPassage(min_chars, max_chars)

    def _ExtractPassage(self, min_chars, max_chars):
        """Return newline-joined paragraphs totalling roughly the requested size.

        Paragraphs are accumulated and any that would push the display length
        past `max_chars` is dropped. The passage is returned as soon as it
        reaches `min_chars`.

        Raises:
            AssertionError: if a full pass over the text collects nothing.
        """
        passage = []
        size = 0
        # NOTE(review): after a paragraph is rejected, `size` keeps the
        # over-limit value, so the loop ends if the last paragraph of a pass
        # was rejected. Kept as in the original so output does not change.
        while size < min_chars:
            for iterator in (self._paragraphs, self._GetParagraphs()):
                for para in iterator:
                    if para is None:
                        continue
                    passage.append(para)
                    size = self._DisplayLength(" ".join(passage))
                    if max_chars < size:
                        passage.pop()
                    elif min_chars <= size:
                        return "\n".join(passage)
            # Explicit raise instead of `assert` so the check survives -O.
            if not passage:
                raise AssertionError(
                    "Unable to extract passage: " + str(self._udhr.key)
                )
        if not passage:
            passage.append(list(self._GetParagraphs())[0])
        return "\n".join(passage)

    def _Get(self, text_type, **kwargs):
        """Dispatch to the extractor for `text_type`.

        Args:
            text_type: A `TextType` member.
            **kwargs: Either `char_count` (used as both bounds) or both
                `min_chars` and `max_chars`.

        Raises:
            KeyError: if neither `char_count` nor both bounds are given.
            Exception: if `text_type` is not a supported text type.
        """
        if "char_count" in kwargs:
            min_chars = max_chars = kwargs["char_count"]
        else:
            min_chars = kwargs["min_chars"]
            max_chars = kwargs["max_chars"]

        extractors = {
            self.TextType.GLYPHS: self._ExtractGlyphs,
            self.TextType.WORD: self._ExtractWord,
            self.TextType.PHRASE: self._ExtractPhrase,
            self.TextType.SENTENCE: self._ExtractSentence,
            self.TextType.PARAGRAPH: self._ExtractParagraph,
            self.TextType.PASSAGE: self._ExtractPassage,
        }
        extractor = extractors.get(text_type)
        if extractor is None:
            # str() keeps the message buildable for non-string values.
            raise Exception("Unsupported text type: " + str(text_type))
        return extractor(min_chars, max_chars)

    def GetSampleTexts(self):
        """Return a SampleTextProto with every sample field filled in.

        The call order matters because the extractors share iterators.
        """
        sample_text = languages_public_pb2.SampleTextProto()
        sample_text.masthead_full = self._Get(self.TextType.GLYPHS, char_count=4)
        sample_text.masthead_partial = self._Get(self.TextType.GLYPHS, char_count=2)
        sample_text.styles = self._Get(self.TextType.PHRASE, min_chars=40, max_chars=60)
        sample_text.tester = self._Get(self.TextType.PHRASE, min_chars=60, max_chars=90)
        sample_text.poster_sm = self._Get(
            self.TextType.PHRASE, min_chars=10, max_chars=17
        )
        sample_text.poster_md = self._Get(
            self.TextType.PHRASE, min_chars=6, max_chars=12
        )
        sample_text.poster_lg = self._Get(self.TextType.WORD, min_chars=3, max_chars=8)
        sample_text.specimen_48 = self._Get(
            self.TextType.SENTENCE, min_chars=50, max_chars=80
        )
        sample_text.specimen_36 = self._Get(
            self.TextType.PARAGRAPH, min_chars=100, max_chars=120
        )
        sample_text.specimen_32 = self._Get(
            self.TextType.PARAGRAPH, min_chars=140, max_chars=180
        )
        sample_text.specimen_21 = self._Get(
            self.TextType.PASSAGE, min_chars=300, max_chars=500
        )
        sample_text.specimen_16 = self._Get(
            self.TextType.PASSAGE, min_chars=550, max_chars=750
        )
        return sample_text
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,65 @@ | ||
#!/usr/bin/env python3 | ||
""" | ||
lang-sample-text | ||
Adds sample text for a given language using the specified UDHR translation. | ||
Usage: | ||
lang-sample-text -l ./languages/en.textproto -u ./udhr_translations/en.xml | ||
""" | ||
|
||
from gflanguages import LoadLanguages, languages_public_pb2 | ||
from gftools.util.google_fonts import ReadProto, WriteProto | ||
from gflanguages.udhr import Udhr | ||
from lxml import etree | ||
import os | ||
import re | ||
import argparse | ||
|
||
|
||
def main(argv=None):
    """Rewrite the sample text of a language proto from a UDHR translation.

    Reads the language proto named by --lang, parses the UDHR XML named by
    --udhr, merges the extracted sample texts into the proto and writes it
    back to the same path.

    Args:
        argv: Argument list handed to argparse; None makes argparse read the
            process command line.
    """
    parser = argparse.ArgumentParser(
        description="Update UDHR sample text for a given language"
    )
    parser.add_argument(
        "-l",
        "--lang",
        help="Language proto file to update",
        required=True,
    )
    parser.add_argument(
        "-u",
        "--udhr",
        help="Path to UDHR translation (XML)",
        required=True,
    )
    args = parser.parse_args(argv)

    language = ReadProto(languages_public_pb2.LanguageProto(), args.lang)

    udhr_data = etree.parse(args.udhr)
    head = udhr_data.getroot()

    # Default to None so a root element without a namespaced "lang" attribute
    # does not leave bcp47 unbound below.
    bcp47 = None
    for name, value in head.attrib.items():
        if re.search(r"\{.*\}lang", name):
            bcp47 = value.replace("-", "_")

    udhr = Udhr(
        key=head.get("key"),
        iso639_3=head.get("iso639-3"),
        iso15924=head.get("iso15924"),
        bcp47=bcp47,
        direction=head.get("dir"),
        ohchr=None,
        # Parse() reads nothing when stage is below 2.
        stage=4,
        loc=None,
        name=head.get("n"),
    )
    udhr.Parse(udhr_data)

    language.sample_text.MergeFrom(udhr.GetSampleTexts())
    WriteProto(language, args.lang)


if __name__ == "__main__":
    main()