diff --git a/officinam/999999999/0/1603_1.py b/officinam/999999999/0/1603_1.py index b10844b..4c9319f 100755 --- a/officinam/999999999/0/1603_1.py +++ b/officinam/999999999/0/1603_1.py @@ -171,6 +171,404 @@ ] +def bcp47_langtag( + rem: str, + clavem: Type[Union[str, list]] = None, + strictum: bool = True +) -> dict: + """Public domain python function to process BCP47 langtag + The BCP47Langtag is an public domain python function to + aid parsing of the IETF BCP 47 language tag. It implements the syntactic + analysis of RFC 5646 and does not require lookup tables which makes + it friendly for quick analysis. + Limitations: + - subtags such as "-t-" and "-u- '' are not parsed on this current + version. They are treated as generic BCP47 extensions. + - The Language-Tag_normalized result would still need external data + lookups to stricter normalization. Yet BCP47Langtag can be used as + the very first step to analyze a language tag which is viable to port + across programming languages. + - The default behavior is to throw exceptions with at least some types + of syntactic errors, a feature which can be disabled yet reasoning is + exposed with _error. However, already very malformated can have + further bugs which different programming implementations do not need + to be consistent with each other. + Versions in other programming languages: + The initial author encourages versions to other programming languages or + libraries or tools which make use of this. No need to ask permission + upfront and there is no problem with using other licenses + than public domain. + Changelog: + - 2021-11-22: Partial implementation of BCP47 (RFC 5646) + - 2021-01-02: Fixes on Language-Tag_normalized (discoversed when ported + JavaScript version was created) + Args: + rem (str): The BCP47 langtag + clavem (Type[Union[str, list]]): Key (string) for specific value or keys + (list) to return a dict (optional) + strictum (bool): Throw exceptions. False replace values + with False (optional) + Returns: + dict: Python dictionary. None means not found. False means the feature + is not implemented + Author: + Emerson Rocha + License: + SPDX-License-Identifier: Unlicense OR 0BSD + ------------- + The syntax of the language tag in ABNF [RFC5234] is: + Language-Tag = langtag ; normal language tags + / privateuse ; private use tag + / grandfathered ; grandfathered tags + langtag = language + ["-" script] + ["-" region] + *("-" variant) + *("-" extension) + ["-" privateuse] + language = 2*3ALPHA ; shortest ISO 639 code + ["-" extlang] ; sometimes followed by + ; extended language subtags + / 4ALPHA ; or reserved for future use + / 5*8ALPHA ; or registered language subtag + extlang = 3ALPHA ; selected ISO 639 codes + *2("-" 3ALPHA) ; permanently reserved + script = 4ALPHA ; ISO 15924 code + region = 2ALPHA ; ISO 3166-1 code + / 3DIGIT ; UN M.49 code + variant = 5*8alphanum ; registered variants + / (DIGIT 3alphanum) + extension = singleton 1*("-" (2*8alphanum)) + ; Single alphanumerics + ; "x" reserved for private use + singleton = DIGIT ; 0 - 9 + / %x41-57 ; A - W + / %x59-5A ; Y - Z + / %x61-77 ; a - w + / %x79-7A ; y - z + privateuse = "x" 1*("-" (1*8alphanum)) + grandfathered = irregular ; non-redundant tags registered + / regular ; during the RFC 3066 era + irregular = "en-GB-oed" ; irregular tags do not match + / "i-ami" ; the 'langtag' production and + / "i-bnn" ; would not otherwise be + / "i-default" ; considered 'well-formed' + / "i-enochian" ; These tags are all valid, + / "i-hak" ; but most are deprecated + / "i-klingon" ; in favor of more modern + / "i-lux" ; subtags or subtag + / "i-mingo" ; combination + / "i-navajo" + / "i-pwn" + / "i-tao" + / "i-tay" + / "i-tsu" + / "sgn-BE-FR" + / "sgn-BE-NL" + / "sgn-CH-DE" + regular = "art-lojban" ; these tags match the 'langtag' + / "cel-gaulish" ; production, but their subtags + / "no-bok" ; are not extended language + / "no-nyn" ; or variant subtags: their meaning + / "zh-guoyu" ; is defined by their registration + / "zh-hakka" ; and all of these are deprecated + / "zh-min" ; in favor of a more modern + / "zh-min-nan" ; subtag or sequence of subtags + / "zh-xiang" + alphanum = (ALPHA / DIGIT) ; letters and numbers + ------------- + Most tests use examples from https://tools.ietf.org/search/bcp47 and + https://github.com/unicode-org/cldr/blob/main/tools/cldr-code + /src/main/resources/org/unicode/cldr/util/data/langtagTest.txt + Exemplōrum gratiā (et Python doctest, id est, testum automata): + (run with python3 -m doctest myscript.py) + >>> bcp47_langtag('pt-Latn-BR', 'language') + 'pt' + >>> bcp47_langtag('pt-Latn-BR', 'script') + 'Latn' + >>> bcp47_langtag('pt-Latn-BR', 'region') + 'BR' + >>> bcp47_langtag('de-CH-1996', 'variant') + ['1996'] + >>> bcp47_langtag('x-fr-CH', ['language', 'region', 'privateuse']) + {'language': None, 'region': None, 'privateuse': ['fr', 'CH']} + >>> bcp47_langtag('i-klingon', 'grandfathered') + 'i-klingon' + >>> bcp47_langtag('zh-min-nan', 'language') + 'zh' + >>> bcp47_langtag('zh-min-nan', 'variant') + ['min-nan'] + >>> bcp47_langtag('es-419', 'region') + '419' + >>> bcp47_langtag('en-oxendict', 'variant') # Oxford English Dictionary + ['oxendict'] + >>> bcp47_langtag('zh-pinyin', 'variant') # Pinyin romanization + ['pinyin'] + >>> bcp47_langtag('zh-pinyin', 'script') # Limitation: cannot infer Latn + >>> bcp47_langtag('en-a-bbb-x-a-ccc', 'privateuse') + ['a', 'ccc'] + >>> bcp47_langtag('en-a-bbb-x-a-ccc', 'extension') + {'a': 'bbb'} + >>> bcp47_langtag('tlh-a-b-foo', '_error') + Traceback (most recent call last): + ... + ValueError: Errors for [tlh-a-b-foo]: extension [a] empty + >>> bcp47_langtag('tlh-a-b-foo', '_error', False) + ['extension [a] empty'] + >>> bcp47_langtag( + ... 'zh-Latn-CN-variant1-a-extend1-x-wadegile-private1', + ... ['variant', 'extension', 'privateuse']) + {'variant': ['variant1'], 'extension': {'a': 'extend1'}, \ +'privateuse': ['wadegile', 'private1']} + >>> bcp47_langtag( + ... 'en-Latn-US-lojban-gaulish-a-12345678-ABCD-b-ABCDEFGH-x-a-b-c-12345678') + {'Language-Tag': \ +'en-Latn-US-lojban-gaulish-a-12345678-ABCD-b-ABCDEFGH-x-a-b-c-12345678', \ +'Language-Tag_normalized': \ +'en-Latn-US-lojban-gaulish-a-12345678-ABCD-b-ABCDEFGH-x-a-b-c-12345678', \ +'language': 'en', 'script': 'Latn', 'region': 'US', \ +'variant': ['lojban', 'gaulish'], \ +'extension': {'a': '12345678-ABCD', 'b': 'ABCDEFGH'}, \ +'privateuse': ['a', 'b', 'c', '12345678'], \ +'grandfathered': None, '_unknown': [], '_error': []} + + # BCP47: "Example: The language tag "en-a-aaa-b-ccc-bbb-x-xyz" is in + # canonical form, while "en-b-ccc-bbb-a-aaa-X-xyz" is well-formed (...) + >>> bcp47_langtag( + ... 'en-b-ccc-bbb-a-aaa-X-xyz') + {'Language-Tag': 'en-b-ccc-bbb-a-aaa-X-xyz', \ +'Language-Tag_normalized': 'en-a-aaa-b-ccc-bbb-x-xyz', \ +'language': 'en', 'script': None, 'region': None, 'variant': [], \ +'extension': {'a': 'aaa', 'b': 'ccc-bbb'}, 'privateuse': ['xyz'], \ +'grandfathered': None, '_unknown': [], '_error': []} + """ + # For sake of copy-and-paste portability, we ignore a few pylints: + # pylint: disable=too-many-branches,too-many-statements,too-many-locals + result = { + # The input Language-Tag, _as it is_ + 'Language-Tag': rem, + # The Language-Tag normalized syntax, if no errors + 'Language-Tag_normalized': None, + 'language': None, + 'script': None, + 'region': None, + 'variant': [], + 'extension': {}, # Example {'a': ['bbb', 'ccc'], 'd': True} + 'privateuse': [], # Example: ['wadegile', 'private1'] + 'grandfathered': None, + '_unknown': [], + '_error': [], + } + + skip = 0 + + if not isinstance(rem, str) or len(rem) == 0: + result['_error'].append('Empty/wrong type') + skip = 1 + else: + rem = rem.replace('_', '-').strip() + + # The weird tags first: grandfathered/irregular + if rem in [ + 'en-GB-oed', 'i-ami', 'i-bnn', 'i-default', 'i-enochian', + 'i-hak', 'i-klingon', 'i-lux', 'i-ming', 'i-navajo', 'i-pwn', + 'i-tao', 'i-tay', 'i-tsu', 'sgn-BE-FR', 'sgn-BE-NL', 'sgn-CH-DE']: + # result['langtag'] = None + result['language'] = rem.lower() + result['grandfathered'] = rem + skip = 1 + # The weird tags first: grandfathered/regular + if rem in [ + 'art-lojban', 'cel-gaulish', 'no-bok', 'no-nyn', 'zh-guoyu', + 'zh-hakka', 'zh-min', 'zh-min-nan', 'zh-xiang']: + + parts_r = rem.split('-') + # result['langtag'] = None + result['language'] = parts_r.pop(0).lower() + result['variant'].append('-'.join(parts_r).lower()) + result['grandfathered'] = rem + skip = 1 + + parts = rem.split('-') + leftover = [] + + deep = 0 + while len(parts) > 0 and skip == 0 and deep < 100: + deep = deep + 1 + + # BCP47 can start with private tag, without language at all + if parts[0].lower() == 'x': + parts.pop(0) + while len(parts) > 0: + result['privateuse'].append(parts.pop(0)) + break + + # BCP47 extensions start with one US-ASCII letter. + if len(parts[0]) == 1 and parts[0].isalpha(): + if parts[0].isalpha() == 'i': + result['_error'].append('Only grandfathered can use i-') + + extension_key = parts.pop(0).lower() + if len(parts) == 0 or len(parts[0]) == 1: + # BCP47 2.2.6. : "Each singleton MUST be followed by at least + # one extension subtag (...) + # result['extension'][extension_key] = [None] + result['extension'][extension_key] = {} + result['_error'].append( + 'extension [' + extension_key + '] empty') + continue + + result['extension'][extension_key] = '' + while len(parts) > 0 and len(parts[0]) != 1: + # Extensions may have more strict rules than -x- + # @see https://datatracker.ietf.org/doc/html/rfc6497 (-t-) + # @see https://datatracker.ietf.org/doc/html/rfc6067 (-u-) + + # Let's avoid atempt to lowercase extensions, since this is not + # not explicity on BCP47 for unknow extensions + # result['extension'][extension_key] = \ + # result['extension'][extension_key] + \ + # '-' + parts.pop(0).lower() + result['extension'][extension_key] = \ + result['extension'][extension_key] + \ + '-' + parts.pop(0) + + result['extension'][extension_key] = \ + result['extension'][extension_key].strip('-') + + continue + + # for part in parts: + if result['language'] is None: + if parts[0].isalnum() and len(parts[0]) == 2 or len(parts[0]) == 3: + result['language'] = parts[0].lower() + else: + result['language'] = False + result['_error'].append('language?') + parts.pop(0) + continue + + # Edge case to test for numeric in 4 (not 3): 'de-CH-1996' + if len(parts[0]) == 4 and parts[0].isalpha() \ + and result['script'] is None: + # if parts[0].isalpha() and result['script'] is None: + if parts[0].isalpha(): + if result['region'] is None and len(result['privateuse']) == 0: + result['script'] = parts[0].capitalize() + else: + result['script'] = False + result['_error'].append('script after region/privateuse') + else: + result['script'] = False + result['_error'].append('script?') + parts.pop(0) + continue + + # Regions, such as ISO 3661-1, like BR + if len(parts[0]) == 2 and result['region'] is None: + if parts[0].isalpha(): + result['region'] = parts[0].upper() + else: + result['region'] = False + result['_error'].append('region?') + parts.pop(0) + continue + + # Regions, such as ISO 3661-1, like 076 + if len(parts[0]) == 3 and result['region'] is None: + if parts[0].isnumeric(): + result['region'] = parts.pop(0) + else: + result['region'] = False + result['_error'].append('region?') + parts.pop(0) + continue + + if len(result['extension']) == 0 and len(result['privateuse']) == 0: + # 2.2.5. Variant Subtags + # 4.1 "Variant subtags that begin with a (US-ASCII)* letter + # (a-z, A-Z) MUST be at least five characters long." + # 4.2 "Variant subtags that begin with a digit (0-9) MUST be at + # least four characters long." + if parts[0][0].isalpha() and len(parts[0]) >= 5: + result['variant'].append(parts.pop(0)) + continue + if parts[0][0].isnumeric() and len(parts[0]) >= 4: + result['variant'].append(parts.pop(0)) + continue + + leftover.append(parts.pop(0)) + + result['_unknown'] = leftover + + # @TODO: maybe re-implement only for know extensions, like -t-, -u-, -h- + # if len(result['extension']) > 0: + # extension_norm = {} + # # keys + # keys_sorted = sorted(result['extension']) + # # values + # for key in keys_sorted: + # extension_norm[key] = sorted(result['extension'][key]) + + # result['extension'] = extension_norm + + # Language-Tag_normalized + if len(result['_error']) == 0: + + if result['grandfathered']: + result['Language-Tag_normalized'] = result['grandfathered'] + else: + norm = [] + if result['language']: + norm.append(result['language']) + if result['script']: + norm.append(result['script']) + if result['region']: + norm.append(result['region']) + if len(result['variant']) > 0: + norm.append('-'.join(result['variant'])) + + if len(result['extension']) > 0: + # TODO: maybe re-implement only for know extensions, + # like -t-, -u-, -h-. For now we're not trying to + # normalize ordering of unknow future extensions, BUT + # we sort key from different extensions + sorted_extension = {} + for key in sorted(result['extension']): + sorted_extension[key] = result['extension'][key] + result['extension'] = sorted_extension + + for key in result['extension']: + if result['extension'][key][0] is None: + norm.append(key) + else: + norm.append(key) + # norm.extend(result['extension'][key]) + norm.append(result['extension'][key]) + + if len(result['privateuse']) > 0: + norm.append('x-' + '-'.join(result['privateuse'])) + + result['Language-Tag_normalized'] = '-'.join(norm) + + if strictum and len(result['_error']) > 0: + raise ValueError( + 'Errors for [' + rem + ']: ' + ', '.join(result['_error'])) + + if clavem is not None: + if isinstance(clavem, str): + return result[clavem] + if isinstance(clavem, list): + result_partial = {} + for item in clavem: + result_partial[item] = result[item] + return result_partial + raise TypeError( + 'clavem [' + str(type(clavem)) + '] != [str, list]') + + return result + + def numerordinatio_neo_separatum( numerordinatio: str, separatum: str = "_") -> str: resultatum = '' @@ -246,6 +644,8 @@ def numerordinatio_nomen( resultatum = '' + # TODO: migrate to numerordinatio_nomen_v2() + # TODO: this obviously is hardcoded; Implement full inferences # if '#item+rem+i_lat+is_latn' in rem and rem['#item+rem+i_lat+is_latn']: # return '/' + rem['#item+rem+i_lat+is_latn'] + '/@lat-Latn' @@ -257,7 +657,8 @@ def numerordinatio_nomen( resultatum = '/' + rem['#item+rem+i_eng+is_latn'] + '/@eng-Latn' # TODO: temporary (2022-03-01) hacky on 1603:45:1 - elif '#item+rem+i_eng+is_latn+ix_completum' in rem and rem['#item+rem+i_eng+is_latn+ix_completum']: + elif '#item+rem+i_eng+is_latn+ix_completum' in rem and \ + rem['#item+rem+i_eng+is_latn+ix_completum']: resultatum = '/' + \ rem['#item+rem+i_eng+is_latn+ix_completum'] + '/@eng-Latn' @@ -267,6 +668,54 @@ def numerordinatio_nomen( return '' +def numerordinatio_nomen_v2( + rem: dict, objectivum_linguam: str = 'mul-Zyyy', + auxilium_linguam: list = None) -> str: + """numerordinatio_nomen_v2 + + _extended_summary_ + + Args: + rem (dict): _description_ + objectivum_linguam (str, optional): _description_. Defaults to 'mul-Zyyy'. + auxilium_linguam (list, optional): _description_. Defaults to None. + + Returns: + str: _description_ + + >>> rem = {'#item+rem+i_lat+is_latn': 'Salvi, Mundi!'} + >>> numerordinatio_nomen_v2(rem, 'mul-Latn', ['por-Latn', 'lat-Latn']) + '/Salvi, Mundi!/@lat-Latn' + >>> numerordinatio_nomen_v2(rem, 'lat-Latn', ['por-Latn']) + 'Salvi, Mundi!' + """ + + def helper(rem, attr): + if '#item+rem' + attr in rem and rem['#item+rem' + attr]: + return rem['#item+rem' + attr] + return None + + objectivum_attr = qhxl_bcp47_2_hxlattr(objectivum_linguam) + + # auxilium = list(map(qhxl_bcp47_2_hxlattr, auxilium_linguam)) + + perfectum = helper(rem, objectivum_attr) + if perfectum: + return _brevis(perfectum) + + if auxilium_linguam is not None and len(auxilium_linguam) > 0: + for item in auxilium_linguam: + item_attr = qhxl_bcp47_2_hxlattr(item) + resultatum_item = helper(rem, item_attr) + if resultatum_item: + if resultatum_item.lower().endswith('@' + item.lower()): + return _brevis(resultatum_item) + else: + return _brevis('/' + resultatum_item + '/@' + item) + + return '' + + def _brevis(rem: str) -> str: linguam = '' pseudo_ipa = '' @@ -467,7 +916,45 @@ def qhxl(rem: dict, query: Union[str, list]): return None -def qhxl_attr_2_bcp47(hxlatt: str): +def qhxl_bcp47_2_hxlattr(bcp47: str) -> str: + """qhxl_bcp47_2_hxlattr + + Convert BCP47 to HXL attribute part + + Args: + hxlatt (str): + + Returns: + str: + + >>> qhxl_bcp47_2_hxlattr('lat-Latn-x-private1-private2') + '+i_lat+is_latn+ix_private1+ix_private2' + >>> qhxl_bcp47_2_hxlattr('qcc-Zxxx-x-wikip') + '+i_qcc+is_zxxx+ix_wikip' + """ + resultatum = '' + bcp47_parsed = bcp47_langtag(bcp47) + + resultatum = '+i_' + bcp47_parsed['language'] + resultatum += '+is_' + bcp47_parsed['script'].lower() + if len(bcp47_parsed['privateuse']) > 0: + for item in bcp47_parsed['privateuse']: + resultatum += '+ix_' + item + + return resultatum + + +def qhxl_attr_2_bcp47(hxlatt: str) -> str: + """qhxl_attr_2_bcp47 + + Convert HXL attribute part to BCP47 + + Args: + hxlatt (str): + + Returns: + str: + """ resultatum = '' tempus1 = hxlatt.replace('+i_', '') tempus1 = tempus1.split('+is_') @@ -727,11 +1214,9 @@ def __init__( self.m1603_1_1 = None self._init_1603_1_1() self.dictionaria_linguarum = DictionariaLinguarum() - self.notitiae = DictionariaNotitiae( - auxilium_linguam=auxilium_linguam, - dictionaria_linguarum=self.dictionaria_linguarum - ) + self.notitiae = DictionariaNotitiae(codex=self) self.dictionaria_interlinguarum = DictionariaInterlinguarum() + self.notitiae = DictionariaNotitiae(codex=self) self.codex = self._init_codex() self.n1603ia = self._init_ix_n1603ia() # self.annexa = self._init_annexa() @@ -2379,7 +2864,38 @@ def methodi_ex_codice(self) -> list: return paginae + # def quod_hxlhashtag(self, template: str, linguae: list = None) -> dict: + # """quod_res Which thing? + + # Return the entire concept if exist on the focused group of dictionaries + + # Args: + # codicem (str): + + # Returns: + # dict: + # """ + # codicem = numerordinatio_neo_separatum(codicem, '_') + # for res in self.codex: + # res_codicem = \ + # numerordinatio_neo_separatum( + # res['#item+conceptum+codicem'], '_') + # if res_codicem == codicem: + # return res + + # return None + def quod_res(self, codicem: str) -> dict: + """quod_res Which thing? + + Return the entire concept if exist on the focused group of dictionaries + + Args: + codicem (str): + + Returns: + dict: + """ codicem = numerordinatio_neo_separatum(codicem, '_') for res in self.codex: res_codicem = \ @@ -2391,6 +2907,14 @@ def quod_res(self, codicem: str) -> dict: return None def quod_res_caveat_lector(self) -> str: + """quod_res_caveat_lector quod res caveat lector? + + Return the text of caveat from the focused group of dictionaries + (if have such information) + + Returns: + str: + """ meta_langs = [ '#item+rem+i_qcc+is_zxxx+ix_codexfacto' ] @@ -2404,6 +2928,14 @@ def quod_res_caveat_lector(self) -> str: return None def quod_res_methodi_ex_dictionariorum_corde(self) -> str: + """quod_res_methodi_ex_dictionariorum_corde + + Return the text of method of the focused group of dictionaries + (if have such information) + + Returns: + str: + """ meta_langs = [ '#item+rem+i_qcc+is_zxxx+ix_codexfacto' ] @@ -4201,23 +4733,15 @@ class DictionariaNotitiae: """ - auxilium_linguam = ['lat-Latn'] - def __init__( self, - auxilium_linguam: list = None, - dictionaria_linguarum: Type['DictionariaLinguarum'] = None + codex: Type['Codex'] ): self.fontem = NUMERORDINATIO_BASIM + \ "/1603/1/99/1603_1_99.no1.tm.hxl.csv" - if auxilium_linguam: - self.auxilium_linguam = auxilium_linguam - if dictionaria_linguarum: - self.dictionaria_linguarum = dictionaria_linguarum - else: - self.dictionaria_linguarum = DictionariaLinguarum() + self.codex = codex self.dictionaria = self._init_dictionaria()