diff --git a/Lib/gflanguages/udhr.py b/Lib/gflanguages/udhr.py
new file mode 100644
index 00000000..ea076bef
--- /dev/null
+++ b/Lib/gflanguages/udhr.py
@@ -0,0 +1,252 @@
+from gflanguages import languages_public_pb2
+import enum
+import re
+
+
+class Udhr:
+    def __init__(
+        self, key, iso639_3, iso15924, bcp47, direction, ohchr, stage, loc, name
+    ):
+        self.key = key
+        self.iso639_3 = iso639_3
+        self.iso15924 = iso15924
+        self.bcp47 = bcp47
+        self.direction = direction
+        self.ohchr = ohchr
+        self.stage = stage
+        self.loc = loc
+        self.name = name
+
+        self.title = None
+        self.preamble = None
+        self.articles = []
+
+    def Parse(self, translation_data):
+        if translation_data is None or self.stage < 2:
+            return
+
+        title_data = translation_data.find("./{*}title")
+        if title_data is not None:
+            self.title = title_data.text
+
+        preamble_data = translation_data.find("./{*}preamble")
+        if preamble_data is not None:
+            preamble_title = preamble_data.find("./{*}title")
+            if preamble_title is not None:
+                self.preamble = {
+                    "title": preamble_title.text,
+                    "content": [
+                        para.text for para in preamble_data.findall("./{*}para")
+                    ],
+                }
+
+        articles_data = translation_data.findall("./{*}article")
+        for article_data in articles_data:
+            title_data = article_data.find("./{*}title")
+            article = {
+                "id": int(article_data.get("number")),
+                "title": None if title_data is None else title_data.text,
+                "content": [para.text for para in article_data.findall("./{*}para")],
+            }
+            self.articles.append(article)
+
+    def LoadArticleOne(self, article_one):
+        self.articles.append({"id": 0, "title": None, "content": [article_one]})
+
+    def GetSampleTexts(self):
+        extractor = SampleTextExtractor(self)
+        return extractor.GetSampleTexts()
+
+
+class SampleTextExtractor:
+    class TextType(enum.Enum):
+        GLYPHS = 1
+        WORD = 2
+        PHRASE = 3
+        SENTENCE = 4
+        PARAGRAPH = 5
+        PASSAGE = 6
+
+    def __init__(self, udhr):
+        self._udhr = udhr
+        self._glyphs = iter(self._GetGlyphs())
+        self._words = iter(self._GetWords())
+        self._paragraphs = iter(self._GetParagraphs())
+        self._phrase_history = set()
+
+        self._non_word_regex = re.compile(r"[^\w]+")
+        self._space_regex = re.compile(r"\s+")
+        self._non_space_regex = re.compile(r"[^\s]+")
+        self._non_word_space_regex = re.compile(r"[^\w\s]+")
+        self._any_regex = re.compile(r".")
+
+    def _DisplayLength(self, s):
+        """Returns the display length of the given string, omitting combining
+        characters.
+
+        Some entire scripts consist of characters that are not counted; in
+        those cases, the raw length of the string is returned.
+        """
+        word_space_length = len(self._non_word_space_regex.sub("", s))
+        space_length = len(self._non_space_regex.sub("", s))
+        if word_space_length == space_length:
+            return len(s)
+        return word_space_length
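+
+    # Illustrative note (not part of the original change): in a decomposed
+    # string such as "c\u0327" (a "c" followed by a combining cedilla), the
+    # combining mark matches neither \w nor \s, so _DisplayLength("c\u0327")
+    # returns 1 even though len() reports 2.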
+ """ + word_space_length = len(self._non_word_space_regex.sub("", s)) + space_length = len(self._non_space_regex.sub("", s)) + if word_space_length == space_length: + return len(s) + return word_space_length + + def _GetGlyphs(self): + seen = set() + for article in self._udhr.articles: + for para in article["content"]: + for ch in self._non_word_regex.sub("", para) or self._space_regex.sub( + "", para + ): + ch = ch.lower() + if ch not in seen: + seen.add(ch) + yield ch + + def _GetWords(self): + if self._space_regex.search(self._udhr.articles[0]["content"][0]) is not None: + splitter = self._space_regex + else: + splitter = self._non_word_regex + + seen = set() + for article in self._udhr.articles: + for para in article["content"]: + for s in splitter.split(para): + if s not in seen: + seen.add(s) + yield s + + def _GetParagraphs(self): + if self._udhr.preamble is not None: + for para in self._udhr.preamble["content"]: + yield para + for article in self._udhr.articles: + for para in article["content"]: + yield para + + def _ExtractGlyphs(self, min_chars, max_chars): + s = "" + for ch in self._glyphs: + s += ch.upper() + if len(s) >= min_chars: + break + if ch != ch.upper(): + s += ch + if len(s) >= min_chars: + break + return s + + def _ExtractWord(self, min_chars, max_chars): + for iterator in [self._words, self._GetWords()]: + for w in iterator: + if w is None: + continue + if min_chars <= self._DisplayLength(w) <= max_chars: + return w + # Fallback to using multiple words for languages with very small words + return self._ExtractPhrase(min_chars, max_chars) + + def _ExtractPhrase(self, min_chars, max_chars): + for iterator in [self._paragraphs, self._GetParagraphs()]: + for para in iterator: + if para is None: + continue + for regex in [self._any_regex, self._space_regex, self._non_word_regex]: + breaks = [-1] + for match in regex.finditer(para, min_chars): + breaks.append(match.start()) + phrase = para[breaks[0] + 1 : breaks[len(breaks) - 1]] + p_size = self._DisplayLength(phrase) + while p_size > max_chars and len(breaks) > 1: + breaks.pop() + phrase = para[breaks[0] + 1 : breaks[len(breaks) - 1]] + p_size = self._DisplayLength(phrase) + if min_chars <= p_size and phrase not in self._phrase_history: + self._phrase_history.add(phrase) + return phrase + return self._ExtractParagraph(min_chars, max_chars) + + def _ExtractSentence(self, min_chars, max_chars): + # Sentence delimination may differ between scripts, so tokenizing on spaces + # would be unreliable. Prefer to use _ExtractPhrase. 
+
+    def _ExtractSentence(self, min_chars, max_chars):
+        # Sentence delimitation may differ between scripts, so tokenizing on
+        # spaces would be unreliable. Prefer to use _ExtractPhrase.
+        return self._ExtractPhrase(min_chars, max_chars)
+
+    def _ExtractParagraph(self, min_chars, max_chars):
+        for iterator in [self._paragraphs, self._GetParagraphs()]:
+            for para in iterator:
+                if para is None:
+                    continue
+                if min_chars <= self._DisplayLength(para) <= max_chars:
+                    return para
+        # No paragraph of suitable length; try combining paragraphs into a passage.
+        return self._ExtractPassage(min_chars, max_chars)
+
+    def _ExtractPassage(self, min_chars, max_chars):
+        p = []
+        p_size = 0
+        while p_size < min_chars:
+            for iterator in [self._paragraphs, self._GetParagraphs()]:
+                for para in iterator:
+                    if para is None:
+                        continue
+                    p.append(para)
+                    p_size = self._DisplayLength(" ".join(p))
+                    if max_chars < p_size:
+                        p.pop()
+                    elif min_chars <= p_size:
+                        return "\n".join(p)
+        if not p:
+            # No combination fit within the limits; fall back to the first
+            # paragraph rather than failing outright.
+            first = next(self._GetParagraphs(), None)
+            assert first is not None, "Unable to extract passage: " + self._udhr.key
+            p.append(first)
+        return "\n".join(p)
+
+    def _Get(self, text_type, **kwargs):
+        if "char_count" in kwargs:
+            min_chars = kwargs["char_count"]
+            max_chars = kwargs["char_count"]
+        else:
+            min_chars = kwargs["min_chars"]
+            max_chars = kwargs["max_chars"]
+        if text_type == self.TextType.GLYPHS:
+            return self._ExtractGlyphs(min_chars, max_chars)
+        if text_type == self.TextType.WORD:
+            return self._ExtractWord(min_chars, max_chars)
+        if text_type == self.TextType.PHRASE:
+            return self._ExtractPhrase(min_chars, max_chars)
+        if text_type == self.TextType.SENTENCE:
+            return self._ExtractSentence(min_chars, max_chars)
+        if text_type == self.TextType.PARAGRAPH:
+            return self._ExtractParagraph(min_chars, max_chars)
+        if text_type == self.TextType.PASSAGE:
+            return self._ExtractPassage(min_chars, max_chars)
+        raise ValueError(f"Unsupported text type: {text_type}")
+
+    def GetSampleTexts(self):
+        sample_text = languages_public_pb2.SampleTextProto()
+        sample_text.masthead_full = self._Get(self.TextType.GLYPHS, char_count=4)
+        sample_text.masthead_partial = self._Get(self.TextType.GLYPHS, char_count=2)
+        sample_text.styles = self._Get(self.TextType.PHRASE, min_chars=40, max_chars=60)
+        sample_text.tester = self._Get(self.TextType.PHRASE, min_chars=60, max_chars=90)
+        sample_text.poster_sm = self._Get(
+            self.TextType.PHRASE, min_chars=10, max_chars=17
+        )
+        sample_text.poster_md = self._Get(
+            self.TextType.PHRASE, min_chars=6, max_chars=12
+        )
+        sample_text.poster_lg = self._Get(self.TextType.WORD, min_chars=3, max_chars=8)
+        sample_text.specimen_48 = self._Get(
+            self.TextType.SENTENCE, min_chars=50, max_chars=80
+        )
+        sample_text.specimen_36 = self._Get(
+            self.TextType.PARAGRAPH, min_chars=100, max_chars=120
+        )
+        sample_text.specimen_32 = self._Get(
+            self.TextType.PARAGRAPH, min_chars=140, max_chars=180
+        )
+        sample_text.specimen_21 = self._Get(
+            self.TextType.PASSAGE, min_chars=300, max_chars=500
+        )
+        sample_text.specimen_16 = self._Get(
+            self.TextType.PASSAGE, min_chars=550, max_chars=750
+        )
+        return sample_text
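+
+
+# A minimal usage sketch (illustrative only, not part of the original change;
+# "root" is assumed to be the parsed root element of a UDHR translation XML):
+#
+#   udhr = Udhr(key=root.get("key"), iso639_3=root.get("iso639-3"),
+#               iso15924=root.get("iso15924"), bcp47=None,
+#               direction=root.get("dir"), ohchr=None, stage=4, loc=None,
+#               name=root.get("n"))
+#   udhr.Parse(root)
+#   samples = udhr.GetSampleTexts()  # a languages_public_pb2.SampleTextProto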
diff --git a/snippets/lang_sample_text.py b/snippets/lang_sample_text.py
new file mode 100755
index 00000000..10b38913
--- /dev/null
+++ b/snippets/lang_sample_text.py
@@ -0,0 +1,65 @@
+#!/usr/bin/env python3
+"""
+lang-sample-text
+
+Adds sample text for a given language using the specified UDHR translation.
+
+Usage:
+
+lang-sample-text -l ./languages/en.textproto -u ./udhr_translations/en.xml
+
+"""
+
+from gflanguages import languages_public_pb2
+from gflanguages.udhr import Udhr
+from gftools.util.google_fonts import ReadProto, WriteProto
+from lxml import etree
+import argparse
+import re
+
+
+def main(argv=None):
+    parser = argparse.ArgumentParser(
+        description="Update UDHR sample text for a given language"
+    )
+    parser.add_argument(
+        "-l",
+        "--lang",
+        help="Language proto file to update",
+        required=True,
+    )
+    parser.add_argument(
+        "-u",
+        "--udhr",
+        help="Path to UDHR translation (XML)",
+        required=True,
+    )
+    args = parser.parse_args(argv)
+
+    language = ReadProto(languages_public_pb2.LanguageProto(), args.lang)
+
+    udhr_data = etree.parse(args.udhr)
+    head = udhr_data.getroot()
+    bcp47 = None
+    for name, value in head.attrib.items():
+        if re.search(r"\{.*\}lang", name):
+            bcp47 = value.replace("-", "_")
+            break
+    udhr = Udhr(
+        key=head.get("key"),
+        iso639_3=head.get("iso639-3"),
+        iso15924=head.get("iso15924"),
+        bcp47=bcp47,
+        direction=head.get("dir"),
+        ohchr=None,
+        stage=4,
+        loc=None,
+        name=head.get("n"),
+    )
+    udhr.Parse(udhr_data)
+
+    language.sample_text.MergeFrom(udhr.GetSampleTexts())
+    WriteProto(language, args.lang)
+
+
+if __name__ == "__main__":
+    main()
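For reference, the new snippet can also be exercised programmatically, since
main() accepts an argv list; a minimal sketch, assuming the module is on the
import path and that the paths from the usage string above exist locally:

    from snippets.lang_sample_text import main

    main(["-l", "./languages/en.textproto", "-u", "./udhr_translations/en.xml"])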