Skip to content

Commit

Permalink
Merge pull request #163 from googlefonts/move-lang-sample-text
Browse files Browse the repository at this point in the history
Move sample text updater from gftools to here
  • Loading branch information
simoncozens authored Sep 12, 2024
2 parents a16f59d + 15a5c30 commit 257af53
Show file tree
Hide file tree
Showing 2 changed files with 317 additions and 0 deletions.
252 changes: 252 additions & 0 deletions Lib/gflanguages/udhr.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,252 @@
from gflanguages import languages_public_pb2
import enum
import re


class Udhr:
    """A single translation of the Universal Declaration of Human Rights.

    Holds the translation's metadata (language/script codes, text direction,
    OHCHR identifier, development stage) plus the parsed text: a title, an
    optional preamble, and a list of articles.
    """

    def __init__(
        self, key, iso639_3, iso15924, bcp47, direction, ohchr, stage, loc, name
    ):
        self.key = key
        self.iso639_3 = iso639_3
        self.iso15924 = iso15924
        self.bcp47 = bcp47
        self.direction = direction
        self.ohchr = ohchr
        self.stage = stage
        self.loc = loc
        self.name = name

        # Populated by Parse() and/or LoadArticleOne().
        self.title = None
        self.preamble = None
        self.articles = []

    def Parse(self, translation_data):
        """Populates title/preamble/articles from a UDHR XML document.

        Args:
          translation_data: an ElementTree or Element for a UDHR translation;
            element namespaces are matched with the ``{*}`` wildcard.

        Translations below stage 2 (or a missing document) are skipped and
        leave the instance unchanged.
        """
        if translation_data is None or self.stage < 2:
            return

        # Cache each find() result instead of searching the tree twice.
        title_data = translation_data.find("./{*}title")
        if title_data is not None:
            self.title = title_data.text

        preamble_data = translation_data.find("./{*}preamble")
        if preamble_data is not None:
            preamble_title = preamble_data.find("./{*}title")
            if preamble_title is not None:
                self.preamble = {
                    "title": preamble_title.text,
                    "content": [
                        para.text for para in preamble_data.findall("./{*}para")
                    ],
                }

        for article_data in translation_data.findall("./{*}article"):
            title_data = article_data.find("./{*}title")
            article = {
                "id": int(article_data.get("number")),
                "title": None if title_data is None else title_data.text,
                "content": [para.text for para in article_data.findall("./{*}para")],
            }
            self.articles.append(article)

    def LoadArticleOne(self, article_one):
        """Seeds the article list with a single pre-extracted Article 1 text."""
        self.articles.append({"id": 0, "title": None, "content": [article_one]})

    def GetSampleTexts(self):
        """Returns a SampleTextProto built from this translation's text."""
        extractor = SampleTextExtractor(self)
        return extractor.GetSampleTexts()


class SampleTextExtractor:
    """Extracts sample-text snippets of assorted sizes from a parsed Udhr.

    Glyphs, words, and paragraphs are drawn from shared iterators so that
    successive extractions prefer fresh material; when a shared iterator
    runs dry, a fresh generator over the same text is used as a fallback.
    """

    class TextType(enum.Enum):
        GLYPHS = 1
        WORD = 2
        PHRASE = 3
        SENTENCE = 4
        PARAGRAPH = 5
        PASSAGE = 6

    def __init__(self, udhr):
        self._udhr = udhr
        self._glyphs = iter(self._GetGlyphs())
        self._words = iter(self._GetWords())
        self._paragraphs = iter(self._GetParagraphs())
        # Phrases already returned, so repeated extraction calls don't hand
        # out the same snippet twice.
        self._phrase_history = set()

        self._non_word_regex = re.compile(r"[^\w]+")
        self._space_regex = re.compile(r"\s+")
        self._non_space_regex = re.compile(r"[^\s]+")
        self._non_word_space_regex = re.compile(r"[^\w\s]+")
        self._any_regex = re.compile(r".")

    def _DisplayLength(self, s):
        """Returns length of given string. Omits combining characters.

        Some entire scripts will not be counted; in those cases, the raw
        length of the string is returned.
        """
        word_space_length = len(self._non_word_space_regex.sub("", s))
        space_length = len(self._non_space_regex.sub("", s))
        if word_space_length == space_length:
            return len(s)
        return word_space_length

    def _GetGlyphs(self):
        """Yields each distinct (lower-cased) glyph across all articles."""
        seen = set()
        for article in self._udhr.articles:
            for para in article["content"]:
                # Prefer word characters; if the paragraph has none, fall
                # back to everything that is not whitespace.
                for ch in self._non_word_regex.sub("", para) or self._space_regex.sub(
                    "", para
                ):
                    ch = ch.lower()
                    if ch not in seen:
                        seen.add(ch)
                        yield ch

    def _GetWords(self):
        """Yields each distinct token across all articles."""
        # Scripts written without spaces are tokenized on non-word
        # characters instead (decided from the first paragraph only).
        if self._space_regex.search(self._udhr.articles[0]["content"][0]) is not None:
            splitter = self._space_regex
        else:
            splitter = self._non_word_regex

        seen = set()
        for article in self._udhr.articles:
            for para in article["content"]:
                for s in splitter.split(para):
                    if s not in seen:
                        seen.add(s)
                        yield s

    def _GetParagraphs(self):
        """Yields every paragraph: preamble first, then each article."""
        if self._udhr.preamble is not None:
            for para in self._udhr.preamble["content"]:
                yield para
        for article in self._udhr.articles:
            for para in article["content"]:
                yield para

    def _ExtractGlyphs(self, min_chars, max_chars):
        """Returns upper/lower glyph pairs until at least min_chars long.

        max_chars is accepted for interface symmetry but not enforced here;
        the result may exceed it by one character.
        """
        s = ""
        for ch in self._glyphs:
            s += ch.upper()
            if len(s) >= min_chars:
                break
            # Bicameral scripts: also include the lowercase form.
            if ch != ch.upper():
                s += ch
            if len(s) >= min_chars:
                break
        return s

    def _ExtractWord(self, min_chars, max_chars):
        """Returns one word whose display length is within the bounds."""
        for iterator in [self._words, self._GetWords()]:
            for w in iterator:
                if w is None:
                    continue
                if min_chars <= self._DisplayLength(w) <= max_chars:
                    return w
        # Fallback to using multiple words for languages with very small words
        return self._ExtractPhrase(min_chars, max_chars)

    def _ExtractPhrase(self, min_chars, max_chars):
        """Returns an unused paragraph fragment within the bounds."""
        for iterator in [self._paragraphs, self._GetParagraphs()]:
            for para in iterator:
                if para is None:
                    continue
                # NOTE(review): any_regex is tried first, so phrases can
                # break mid-word and the space/non-word splitters are rarely
                # reached — confirm this ordering is intended.
                for regex in [self._any_regex, self._space_regex, self._non_word_regex]:
                    # Candidate break positions at/after min_chars; trim
                    # from the end until the phrase fits under max_chars.
                    breaks = [-1]
                    for match in regex.finditer(para, min_chars):
                        breaks.append(match.start())
                    phrase = para[breaks[0] + 1 : breaks[len(breaks) - 1]]
                    p_size = self._DisplayLength(phrase)
                    while p_size > max_chars and len(breaks) > 1:
                        breaks.pop()
                        phrase = para[breaks[0] + 1 : breaks[len(breaks) - 1]]
                        p_size = self._DisplayLength(phrase)
                    if min_chars <= p_size and phrase not in self._phrase_history:
                        self._phrase_history.add(phrase)
                        return phrase
        return self._ExtractParagraph(min_chars, max_chars)

    def _ExtractSentence(self, min_chars, max_chars):
        # Sentence delimination may differ between scripts, so tokenizing on
        # spaces would be unreliable. Prefer to use _ExtractPhrase.
        return self._ExtractPhrase(min_chars, max_chars)

    def _ExtractParagraph(self, min_chars, max_chars):
        """Returns one paragraph whose display length is within the bounds."""
        for iterator in [self._paragraphs, self._GetParagraphs()]:
            for para in iterator:
                if para is None:
                    continue
                if min_chars <= self._DisplayLength(para) <= max_chars:
                    return para
        # Paragraphs likely insufficient length; try combining into passages
        return self._ExtractPassage(min_chars, max_chars)

    def _ExtractPassage(self, min_chars, max_chars):
        """Returns several paragraphs joined by newlines within the bounds."""
        p = []
        p_size = 0
        while p_size < min_chars:
            for iterator in [self._paragraphs, self._GetParagraphs()]:
                for para in iterator:
                    if para is None:
                        continue
                    p.append(para)
                    p_size = self._DisplayLength(" ".join(p))
                    if max_chars < p_size:
                        # NOTE(review): p_size is deliberately left at the
                        # over-limit value after popping; it is recomputed on
                        # the next append and also terminates the while loop.
                        p.pop()
                    elif min_chars <= p_size:
                        return "\n".join(p)
        # NOTE(review): assert is stripped under -O; the fallback below only
        # runs in that mode. Consider raising instead of asserting.
        assert len(p) > 0, "Unable to extract passage: " + self._udhr.key
        if len(p) == 0:
            p.append([p for p in self._GetParagraphs()][0])
        return "\n".join(p)

    def _Get(self, text_type, **kwargs):
        """Dispatches to the extractor for text_type.

        Accepts either char_count=N (exact target: min == max == N) or
        separate min_chars=/max_chars= bounds.

        Raises:
          Exception: if text_type is not a known TextType.
        """
        if "char_count" in kwargs:
            min_chars = kwargs["char_count"]
            max_chars = kwargs["char_count"]
        else:
            min_chars = kwargs["min_chars"]
            max_chars = kwargs["max_chars"]
        if text_type == self.TextType.GLYPHS:
            return self._ExtractGlyphs(min_chars, max_chars)
        if text_type == self.TextType.WORD:
            return self._ExtractWord(min_chars, max_chars)
        if text_type == self.TextType.PHRASE:
            return self._ExtractPhrase(min_chars, max_chars)
        if text_type == self.TextType.SENTENCE:
            return self._ExtractSentence(min_chars, max_chars)
        if text_type == self.TextType.PARAGRAPH:
            return self._ExtractParagraph(min_chars, max_chars)
        if text_type == self.TextType.PASSAGE:
            return self._ExtractPassage(min_chars, max_chars)
        # f-string fix: concatenating a TextType enum (or any non-str) to a
        # str raised TypeError instead of the intended message.
        raise Exception(f"Unsupported text type: {text_type}")

    def GetSampleTexts(self):
        """Builds and returns the full SampleTextProto for this translation."""
        sample_text = languages_public_pb2.SampleTextProto()
        sample_text.masthead_full = self._Get(self.TextType.GLYPHS, char_count=4)
        sample_text.masthead_partial = self._Get(self.TextType.GLYPHS, char_count=2)
        sample_text.styles = self._Get(self.TextType.PHRASE, min_chars=40, max_chars=60)
        sample_text.tester = self._Get(self.TextType.PHRASE, min_chars=60, max_chars=90)
        sample_text.poster_sm = self._Get(
            self.TextType.PHRASE, min_chars=10, max_chars=17
        )
        sample_text.poster_md = self._Get(
            self.TextType.PHRASE, min_chars=6, max_chars=12
        )
        sample_text.poster_lg = self._Get(self.TextType.WORD, min_chars=3, max_chars=8)
        sample_text.specimen_48 = self._Get(
            self.TextType.SENTENCE, min_chars=50, max_chars=80
        )
        sample_text.specimen_36 = self._Get(
            self.TextType.PARAGRAPH, min_chars=100, max_chars=120
        )
        sample_text.specimen_32 = self._Get(
            self.TextType.PARAGRAPH, min_chars=140, max_chars=180
        )
        sample_text.specimen_21 = self._Get(
            self.TextType.PASSAGE, min_chars=300, max_chars=500
        )
        sample_text.specimen_16 = self._Get(
            self.TextType.PASSAGE, min_chars=550, max_chars=750
        )
        return sample_text
65 changes: 65 additions & 0 deletions snippets/lang_sample_text.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
#!/usr/bin/env python3
"""
lang-sample-text
Adds sample text for a given language using the specified UDHR translation.
Usage:
lang-sample-text -l ./languages/en.textproto ./udhr_translations/en.xml
"""

from gflanguages import LoadLanguages, languages_public_pb2
from gftools.util.google_fonts import ReadProto, WriteProto
from gflanguages.udhr import Udhr
from lxml import etree
import os
import re
import argparse


def main(argv=None):
    """Updates a language proto's sample text from a UDHR XML translation.

    Reads the language textproto named by --lang, builds sample texts from
    the UDHR XML named by --udhr, merges them into the proto's sample_text
    field, and writes the proto back in place.

    Raises:
      ValueError: if the UDHR root element carries no xml:lang attribute.
    """
    parser = argparse.ArgumentParser(
        description="Update UDHR sample text for a given language"
    )
    parser.add_argument(
        "-l",
        "--lang",
        help="Language proto file to update",
        required=True,
    )
    parser.add_argument(
        "-u",
        "--udhr",
        help="Path to UDHR translation (XML)",
        required=True,
    )
    args = parser.parse_args(argv)

    language = ReadProto(languages_public_pb2.LanguageProto(), args.lang)

    udhr_data = etree.parse(args.udhr)
    head = udhr_data.getroot()
    # The language attribute arrives namespace-qualified (e.g. "{...}lang"),
    # so match on the local name. Previously, if no such attribute existed,
    # bcp47 was never bound and the Udhr(...) call crashed with NameError.
    bcp47 = None
    for name, value in head.attrib.items():
        if re.search(r"\{.*\}lang", name):
            bcp47 = value.replace("-", "_")
            break
    if bcp47 is None:
        raise ValueError(f"No xml:lang attribute found on root of {args.udhr}")

    udhr = Udhr(
        key=head.get("key"),
        iso639_3=head.get("iso639-3"),
        iso15924=head.get("iso15924"),
        bcp47=bcp47,
        direction=head.get("dir"),
        ohchr=None,
        # Stage 4 ("reviewed") is assumed so Udhr.Parse does not skip the text.
        stage=4,
        loc=None,
        name=head.get("n"),
    )
    udhr.Parse(udhr_data)

    language.sample_text.MergeFrom(udhr.GetSampleTexts())
    WriteProto(language, args.lang)


if __name__ == "__main__":
    main()

0 comments on commit 257af53

Please sign in to comment.