From 86b94517f1df7d705564d5a93e0d0c431ccd9120 Mon Sep 17 00:00:00 2001 From: Andrew Hankinson Date: Tue, 13 Apr 2021 18:52:28 +0200 Subject: [PATCH 01/14] Enable packratting for pyparser Delivers significant performance improvements by caching previously computed results. --- edtf/parser/grammar.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/edtf/parser/grammar.py b/edtf/parser/grammar.py index c028c6e..d612c5f 100644 --- a/edtf/parser/grammar.py +++ b/edtf/parser/grammar.py @@ -1,5 +1,9 @@ from pyparsing import Literal as L, ParseException, Optional, OneOrMore, \ - ZeroOrMore, oneOf, Regex, Combine, Word, NotAny, nums + ZeroOrMore, oneOf, Regex, Combine, Word, NotAny, nums, ParserElement + +# From the pyparsing performance improvement tips: +# https://github.com/pyparsing/pyparsing/wiki/Performance-Tips +ParserElement.enablePackrat() # (* ************************** Level 0 *************************** *) from edtf.parser.parser_classes import Date, DateAndTime, Interval, Unspecified, \ From 7fdf8dd8b649a5085d8f2aed3b66a8734f2ce915 Mon Sep 17 00:00:00 2001 From: jacobcolyvan Date: Mon, 26 Jul 2021 12:29:25 +1000 Subject: [PATCH 02/14] #37 update for Django 3.x compat --- edtf/fields.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/edtf/fields.py b/edtf/fields.py index 83d10a7..52b9171 100644 --- a/edtf/fields.py +++ b/edtf/fields.py @@ -53,7 +53,7 @@ def deconstruct(self): del kwargs["max_length"] return name, path, args, kwargs - def from_db_value(self, value, expression, connection, context): + def from_db_value(self, value, expression, connection, context=None): # Converting values to Python objects if not value: return None From 6e4a627df5447b76db492b1603f95bbd55524346 Mon Sep 17 00:00:00 2001 From: Andrew Hankinson Date: Fri, 26 Apr 2024 15:43:38 +0200 Subject: [PATCH 03/14] Minor updates --- edtf/natlang/en.py | 3 ++- poetry.lock | 45 +++++++++++++++++++++++++++++++++++++++++++++ pyproject.toml | 18 ++++++++++++++++++ 3 files changed, 65 insertions(+), 1 deletion(-) create mode 100644 poetry.lock create mode 100644 pyproject.toml diff --git a/edtf/natlang/en.py b/edtf/natlang/en.py index ec7842b..5263e07 100644 --- a/edtf/natlang/en.py +++ b/edtf/natlang/en.py @@ -89,6 +89,7 @@ def text_to_edtf(text): is_before = re.findall(r'\bbefore\b', t) is_before = is_before or re.findall(r'\bearlier\b', t) + is_before = is_before or re.findall(r'\baprés\b', t) is_after = re.findall(r'\bafter\b', t) is_after = is_after or re.findall(r'\bsince\b', t) @@ -133,7 +134,7 @@ def text_to_edtf_date(text): is_approximate = is_approximate or re.findall(r'\bcirca\b', t) # the word 'approx'/'around'/'about' anywhere is_approximate = is_approximate or \ - re.findall(r'\b(approx|around|about)', t) + re.findall(r'\b(approx|approximately|around|about)', t) # a ~ before a year-ish number is_approximate = is_approximate or re.findall(r'\b~\d{4}', t) # a ~ at the beginning diff --git a/poetry.lock b/poetry.lock new file mode 100644 index 0000000..745843e --- /dev/null +++ b/poetry.lock @@ -0,0 +1,45 @@ +# This file is automatically @generated by Poetry 1.8.2 and should not be changed by hand. + +[[package]] +name = "pyparsing" +version = "3.1.2" +description = "pyparsing module - Classes and methods to define and execute parsing grammars" +optional = false +python-versions = ">=3.6.8" +files = [ + {file = "pyparsing-3.1.2-py3-none-any.whl", hash = "sha256:f9db75911801ed778fe61bb643079ff86601aca99fcae6345aa67292038fb742"}, + {file = "pyparsing-3.1.2.tar.gz", hash = "sha256:a1bac0ce561155ecc3ed78ca94d3c9378656ad4c94c1270de543f621420f94ad"}, +] + +[package.extras] +diagrams = ["jinja2", "railroad-diagrams"] + +[[package]] +name = "python-dateutil" +version = "2.9.0.post0" +description = "Extensions to the standard Python datetime module" +optional = false +python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,>=2.7" +files = [ + {file = "python-dateutil-2.9.0.post0.tar.gz", hash = "sha256:37dd54208da7e1cd875388217d5e00ebd4179249f90fb72437e91a35459a0ad3"}, + {file = "python_dateutil-2.9.0.post0-py2.py3-none-any.whl", hash = "sha256:a8b2bc7bffae282281c8140a97d3aa9c14da0b136dfe83f850eea9a5f7470427"}, +] + +[package.dependencies] +six = ">=1.5" + +[[package]] +name = "six" +version = "1.16.0" +description = "Python 2 and 3 compatibility utilities" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*" +files = [ + {file = "six-1.16.0-py2.py3-none-any.whl", hash = "sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254"}, + {file = "six-1.16.0.tar.gz", hash = "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926"}, +] + +[metadata] +lock-version = "2.0" +python-versions = "^3.11" +content-hash = "822c6f7ddf2552d097c1bfc8399a2492c845c74cb4576a423adf3ad62850ffc3" diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..f203360 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,18 @@ +[tool.poetry] +name = "python-edtf" +version = "0.1.0" +description = "" +authors = ["Andrew Hankinson "] +readme = "README.md" +packages = [{include = "python_edtf"}] + +[tool.poetry.dependencies] +python = "^3.11" +python-dateutil = "^2.9.0.post0" +pyparsing = "^3.1.2" +six = "^1.16.0" + + +[build-system] +requires = ["poetry-core"] +build-backend = "poetry.core.masonry.api" From 80fdd60cbb590d7139341293185628d6aa8cac5b Mon Sep 17 00:00:00 2001 From: Andrew Hankinson Date: Fri, 26 Apr 2024 15:49:58 +0200 Subject: [PATCH 04/14] Update dependency management --- pyproject.toml | 2 +- setup.py | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index f203360..f1d7c5f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,5 +1,5 @@ [tool.poetry] -name = "python-edtf" +name = "edtf" version = "0.1.0" description = "" authors = ["Andrew Hankinson "] diff --git a/setup.py b/setup.py index f0f1849..f2cc7d5 100644 --- a/setup.py +++ b/setup.py @@ -1,7 +1,6 @@ from __future__ import print_function import setuptools -import sys def readme(): with open('README.md') as f: From c12d759732d393ac66faa462b8d61b057c675d17 Mon Sep 17 00:00:00 2001 From: Andrew Hankinson Date: Fri, 26 Apr 2024 15:55:52 +0200 Subject: [PATCH 05/14] Deps --- poetry.lock | 4 ++-- pyproject.toml | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/poetry.lock b/poetry.lock index 745843e..c4b40b6 100644 --- a/poetry.lock +++ b/poetry.lock @@ -41,5 +41,5 @@ files = [ [metadata] lock-version = "2.0" -python-versions = "^3.11" -content-hash = "822c6f7ddf2552d097c1bfc8399a2492c845c74cb4576a423adf3ad62850ffc3" +python-versions = "^3.9" +content-hash = "e6be32f86f1a6af0695f6846b57ed289e015b5634c7f574c45800095a84e2200" diff --git a/pyproject.toml b/pyproject.toml index f1d7c5f..9af9ee4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,13 +1,13 @@ [tool.poetry] name = "edtf" -version = "0.1.0" +version = "4.0.1+enh" description = "" authors = ["Andrew Hankinson "] readme = "README.md" -packages = [{include = "python_edtf"}] +packages = [{include = "edtf"}] [tool.poetry.dependencies] -python = "^3.11" +python = "^3.9" python-dateutil = "^2.9.0.post0" pyparsing = "^3.1.2" six = "^1.16.0" From 6e508d016e9bbcc49b90d3c88ca3512d69a0d193 Mon Sep 17 00:00:00 2001 From: Andrew Hankinson Date: Tue, 23 Jul 2024 17:03:14 +0200 Subject: [PATCH 06/14] Optimized regexes --- edtf/natlang/en.py | 126 ++++++++++++++++++++++----------------------- 1 file changed, 63 insertions(+), 63 deletions(-) diff --git a/edtf/natlang/en.py b/edtf/natlang/en.py index 5263e07..4f68f21 100644 --- a/edtf/natlang/en.py +++ b/edtf/natlang/en.py @@ -1,9 +1,10 @@ """Utilities to derive an EDTF string from an (English) natural language string.""" from datetime import datetime +from typing import Optional + from dateutil.parser import parse import re from edtf import appsettings -from six.moves import xrange # two dates where every digit of an ISO date representation is different, @@ -12,24 +13,43 @@ DEFAULT_DATE_1 = datetime(1234, 1, 1, 0, 0) DEFAULT_DATE_2 = datetime(5678, 10, 10, 0, 0) -SHORT_YEAR_RE = r'(-?)([\du])([\dxu])([\dxu])([\dxu])' -LONG_YEAR_RE = r'y(-?)([1-9]\d\d\d\d+)' -CENTURY_RE = r'(\d{1,2})(c\.?|(st|nd|rd|th) century)\s?(ad|ce|bc|bce)?' -CE_RE = r'(\d{1,4}) (ad|ce|bc|bce)' +SHORT_YEAR_RE = re.compile(r'(-?)([\du])([\dxu])([\dxu])([\dxu])') +LONG_YEAR_RE = re.compile(r'y(-?)([1-9]\d\d\d\d+)') +CENTURY_RE = re.compile(r'(\d{1,2})(c\.?|(st|nd|rd|th) century)\s?(ad|ce|bc|bce)?') +CENTURY_RANGE = re.compile(r'\b(\d\d)(th|st|nd|rd|)-(\d\d)(th|st|nd|rd) [cC]') +CE_RE = re.compile(r'(\d{1,4}) (ad|ce|bc|bce)') +ONE_DIGIT_PARTIAL_FIRST = re.compile(r'\d\D\b') +TWO_DIGIT_PARTIAL_FIRST = re.compile(r'\d\d\b') +PARTIAL_CHECK = re.compile(r'\b\d\d\d\d$') +SLASH_YEAR = re.compile(r"(\d\d\d\d)/(\d\d\d\d)") +BEFORE_CHECK = re.compile(r"\b(?:before|earlier|avant)\b") +AFTER_CHECK = re.compile(r"\b(after|since|later|aprés|apres)\b") +APPROX_CHECK = re.compile(r'\b(?:ca?\.? ?\d{4}|circa|approx|approximately|around|about|~\d{3,4})|(?:^~)') +UNCERTAIN_CHECK = re.compile(r"\b(?:uncertain|possibly|maybe|guess|\d{3,4}\?)") +UNCERTAIN_REPL = re.compile(r'(\d{4})\?') +MIGHT_BE_CENTURY = re.compile(r'(\d{2}00)s') +MIGHT_BE_DECADE = re.compile(r'(\d{3}0)s') + +APPROX_CENTURY_RE = re.compile(r'\b(ca?\.?) ?(\d{1,2})(c\.?|(st|nd|rd|th) century)\s?(ad|ce|bc|bce)?') +UNCERTAIN_CENTURY_RE = re.compile(r'(\d{1,2})(c\.?|(st|nd|rd|th) century)\s?(ad|ce|bc|bce)?\?') + +APPROX_CE_RE = re.compile(r'\b(ca?\.?) ?(\d{1,4}) (ad|ce|bc|bce)') +UNCERTAIN_CE_RE = re.compile(r'(\d{1,4}) (ad|ce|bc|bce)\?') + # Set of RE rules that will cause us to abort text processing, since we know # the results will be wrong. REJECT_RULES = ( - r'.*dynasty.*', # Don't parse '23rd Dynasty' to 'uuuu-uu-23' + re.compile(r'.*dynasty.*'), # Don't parse '23rd Dynasty' to 'uuuu-uu-23' ) -def text_to_edtf(text): +def text_to_edtf(text: str) -> Optional[str]: """ Generate EDTF string equivalent of a given natural language date string. """ if not text: - return + return None t = text.lower() @@ -51,18 +71,18 @@ def text_to_edtf(text): # match looks from the beginning of the string, search # looks anywhere. - if re.match(r'\d\D\b', d2): # 1-digit year partial e.g. 1868-9 - if re.search(r'\b\d\d\d\d$', d1): # TODO: evaluate it and see if it's a year + if re.match(ONE_DIGIT_PARTIAL_FIRST, d2): # 1-digit year partial e.g. 1868-9 + if re.search(PARTIAL_CHECK, d1): # TODO: evaluate it and see if it's a year d2 = d1[-4:-1] + d2 - elif re.match(r'\d\d\b', d2): # 2-digit year partial e.g. 1809-10 - if re.search(r'\b\d\d\d\d$', d1): + elif re.match(TWO_DIGIT_PARTIAL_FIRST, d2): # 2-digit year partial e.g. 1809-10 + if re.search(PARTIAL_CHECK, d1): d2 = d1[-4:-2] + d2 else: - century_range_match = re.search(r'\b(\d\d)(th|st|nd|rd|)-(\d\d)(th|st|nd|rd) [cC]', "%s-%s" % (d1,d2)) + century_range_match = re.search(CENTURY_RANGE, f"{d1}-{d2}") if century_range_match: g = century_range_match.groups() - d1 = "%sC" % g[0] - d2 = "%sC" % g[2] + d1 = f"{g[0]}C" + d2 = f"{g[2]}C" r1 = text_to_edtf_date(d1) r2 = text_to_edtf_date(d2) @@ -77,9 +97,9 @@ def text_to_edtf(text): # This whole section could be more friendly. else: - int_match = re.search(r"(\d\d\d\d)\/(\d\d\d\d)", list_item) + int_match = re.search(SLASH_YEAR, list_item) if int_match: - return "[%s, %s]" % (int_match.group(1), int_match.group(2)) + return f"[{int_match.group(1)}, {int_match.group(2)}]" result = text_to_edtf_date(list_item) if result: @@ -87,23 +107,18 @@ def text_to_edtf(text): if result: break - is_before = re.findall(r'\bbefore\b', t) - is_before = is_before or re.findall(r'\bearlier\b', t) - is_before = is_before or re.findall(r'\baprés\b', t) - - is_after = re.findall(r'\bafter\b', t) - is_after = is_after or re.findall(r'\bsince\b', t) - is_after = is_after or re.findall(r'\blater\b', t) + is_before = re.findall(BEFORE_CHECK, t) + is_after = re.findall(AFTER_CHECK, t) if is_before: - result = u"unknown/%s" % result + result = f"unknown/{result}" elif is_after: - result = u"%s/unknown" % result + result = f"{result}/unknown" return result -def text_to_edtf_date(text): +def text_to_edtf_date(text) -> Optional[str]: """ Return EDTF string equivalent of a given natural language date string. @@ -112,39 +127,29 @@ def text_to_edtf_date(text): differ are undefined. """ if not text: - return + return None t = text.lower() result = '' for reject_re in REJECT_RULES: if re.match(reject_re, t): - return + return None # matches on '1800s'. Needs to happen before is_decade. - could_be_century = re.findall(r'(\d{2}00)s', t) + could_be_century: list = re.findall(MIGHT_BE_CENTURY, t) # matches on '1800s' and '1910s'. Removes the 's'. # Needs to happen before is_uncertain because e.g. "1860s?" - t, is_decade = re.subn(r'(\d{3}0)s', r'\1', t) + t, is_decade = re.subn(MIGHT_BE_DECADE, r'\1', t) # detect approximation signifiers # a few 'circa' abbreviations just before the year - is_approximate = re.findall(r'\b(ca?\.?) ?\d{4}', t) + is_approximate = re.findall(APPROX_CHECK, t) # the word 'circa' anywhere - is_approximate = is_approximate or re.findall(r'\bcirca\b', t) - # the word 'approx'/'around'/'about' anywhere - is_approximate = is_approximate or \ - re.findall(r'\b(approx|approximately|around|about)', t) - # a ~ before a year-ish number - is_approximate = is_approximate or re.findall(r'\b~\d{4}', t) - # a ~ at the beginning - is_approximate = is_approximate or re.findall(r'^~', t) # detect uncertainty signifiers - t, is_uncertain = re.subn(r'(\d{4})\?', r'\1', t) - # the words uncertain/maybe/guess anywhere - is_uncertain = is_uncertain or re.findall( - r'\b(uncertain|possibly|maybe|guess)', t) + t, is_uncertain = re.subn(UNCERTAIN_REPL, r'\1', t) + is_uncertain = is_uncertain or re.findall(UNCERTAIN_CHECK, t) # detect century forms is_century = re.findall(CENTURY_RE, t) @@ -153,27 +158,23 @@ def text_to_edtf_date(text): is_ce = re.findall(CE_RE, t) if is_century: result = "%02dxx" % (int(is_century[0][0]) - 1,) - is_approximate = is_approximate or \ - re.findall(r'\b(ca?\.?) ?' + CENTURY_RE, t) - is_uncertain = is_uncertain or re.findall(CENTURY_RE + r'\?', t) + is_approximate = is_approximate or re.findall(APPROX_CENTURY_RE, t) + is_uncertain = is_uncertain or re.findall(UNCERTAIN_CENTURY_RE, t) try: - is_bc = is_century[0][-1] in ("bc", "bce") - if is_bc: - result = "-%s" % result + if is_century[0][-1] in ("bc", "bce"): + result = f"-{result}" except IndexError: pass elif is_ce: result = "%04d" % (int(is_ce[0][0])) - is_approximate = is_approximate or \ - re.findall(r'\b(ca?\.?) ?' + CE_RE, t) - is_uncertain = is_uncertain or re.findall(CE_RE + r'\?', t) + is_approximate = is_approximate or re.findall(APPROX_CE_RE, t) + is_uncertain = is_uncertain or re.findall(UNCERTAIN_CE_RE, t) try: - is_bc = is_ce[0][-1] in ("bc", "bce") - if is_bc: - result = "-%s" % result + if is_ce[0][-1] in ("bc", "bce"): + result = f"-{result}" except IndexError: pass @@ -200,12 +201,12 @@ def text_to_edtf_date(text): ) except ValueError: - return + return None if dt1.date() == DEFAULT_DATE_1.date() and \ dt2.date() == DEFAULT_DATE_2.date(): # couldn't parse anything - defaults are untouched. - return + return None date1 = dt1.isoformat()[:10] date2 = dt2.isoformat()[:10] @@ -215,14 +216,13 @@ def text_to_edtf_date(text): mentions_month = re.findall(r'\bmonth\b.+(in|during)\b', t) mentions_day = re.findall(r'\bday\b.+(in|during)\b', t) - for i in xrange(len(date1)): + for i in range(len(date1)): # if the given year could be a century (e.g. '1800s') then use # approximate/uncertain markers to decide whether we treat it as # a century or a decade. - if i == 2 and could_be_century and \ - not (is_approximate or is_uncertain): + if i == 2 and could_be_century and not (is_approximate or is_uncertain): result += 'x' - elif i == 3 and is_decade > 0: + elif i == 3 and is_decade: if mentions_year: result += 'u' # year precision else: @@ -238,7 +238,7 @@ def text_to_edtf_date(text): # strip off unknown chars from end of string - except the first 4 - for i in reversed(xrange(len(result))): + for i in reversed(range(len(result))): if result[i] not in ('u', 'x', '-'): smallest_length = 4 From f2252f03c23b1f7a6a153ccf750e97a94ce71dd2 Mon Sep 17 00:00:00 2001 From: Andrew Hankinson Date: Tue, 23 Jul 2024 17:18:26 +0200 Subject: [PATCH 07/14] Package updates --- edtf/convert.py | 8 +- edtf/jdutil.py | 32 +++---- edtf/natlang/en.py | 11 ++- edtf/natlang/tests.py | 4 +- edtf/parser/grammar.py | 14 +-- edtf/parser/parser_classes.py | 159 +++++++++++++++++----------------- edtf/parser/tests.py | 66 +++++++------- 7 files changed, 152 insertions(+), 142 deletions(-) diff --git a/edtf/convert.py b/edtf/convert.py index c1bfd3a..de1f2a2 100644 --- a/edtf/convert.py +++ b/edtf/convert.py @@ -59,8 +59,7 @@ def trim_struct_time(st, strip_time=False): """ if strip_time: return struct_time(list(st[:3]) + TIME_EMPTY_TIME + TIME_EMPTY_EXTRAS) - else: - return struct_time(list(st[:6]) + TIME_EMPTY_EXTRAS) + return struct_time(list(st[:6]) + TIME_EMPTY_EXTRAS) def struct_time_to_jd(st): @@ -106,7 +105,7 @@ def jd_to_struct_time(jd): ) -def _roll_negative_time_fields(year, month, day, hour, minute, second): +def _roll_negative_time_fields(year, month, day, hour, minute, second) -> tuple: """ Fix date/time fields which have nonsense negative values for any field except for year by rolling the overall date/time value backwards, treating @@ -142,4 +141,5 @@ def _roll_negative_time_fields(year, month, day, hour, minute, second): year += int(month / 12.0) # Adjust by whole year in months year -= 1 # Subtract 1 for negative minutes month %= 12 # Convert negative month to positive remainder - return (year, month, day, hour, minute, second) + + return year, month, day, hour, minute, second diff --git a/edtf/jdutil.py b/edtf/jdutil.py index 9fabdd1..4a12b58 100644 --- a/edtf/jdutil.py +++ b/edtf/jdutil.py @@ -17,7 +17,8 @@ # 10-14-1582 never occurred. Python datetime objects will produce incorrect # time deltas if one date is from before 10-15-1582. -def mjd_to_jd(mjd): + +def mjd_to_jd(mjd: float) -> float: """ Convert Modified Julian Day to Julian Day. @@ -30,13 +31,11 @@ def mjd_to_jd(mjd): ------- jd : float Julian Day - - """ return mjd + 2400000.5 -def jd_to_mjd(jd): +def jd_to_mjd(jd: float) -> float: """ Convert Julian Day to Modified Julian Day @@ -54,7 +53,7 @@ def jd_to_mjd(jd): return jd - 2400000.5 -def date_to_jd(year,month,day): +def date_to_jd(year: int, month: int, day: float) -> float: """ Convert a date to Julian Day. @@ -117,7 +116,7 @@ def date_to_jd(year,month,day): return jd -def jd_to_date(jd): +def jd_to_date(jd: float) -> (int, int, float): """ Convert Julian Day to date. @@ -184,7 +183,10 @@ def jd_to_date(jd): return year, month, day -def hmsm_to_days(hour=0,min=0,sec=0,micro=0): +def hmsm_to_days(hour: int = 0, + min: int = 0, + sec: int = 0, + micro: int = 0) -> float: """ Convert hours, minutes, seconds, and microseconds to fractional days. @@ -222,7 +224,7 @@ def hmsm_to_days(hour=0,min=0,sec=0,micro=0): return days / 24. -def days_to_hmsm(days): +def days_to_hmsm(days: float) -> (int, int, int, int): """ Convert fractional days to hours, minutes, seconds, and microseconds. Precision beyond microseconds is rounded to the nearest microsecond. @@ -271,7 +273,7 @@ def days_to_hmsm(days): return int(hour), int(min), int(sec), int(micro) -def datetime_to_jd(date): +def datetime_to_jd(date: dt.datetime) -> float: """ Convert a `datetime.datetime` object to Julian Day. @@ -298,7 +300,7 @@ def datetime_to_jd(date): return date_to_jd(date.year,date.month,days) -def jd_to_datetime(jd): +def jd_to_datetime(jd: float) -> dt.datetime: """ Convert a Julian Day to an `jdutil.datetime` object. @@ -328,7 +330,7 @@ def jd_to_datetime(jd): return datetime(year,month,day,hour,min,sec,micro) -def timedelta_to_days(td): +def timedelta_to_days(td: dt.timedelta) -> float: """ Convert a `datetime.timedelta` object to a total number of days. @@ -372,7 +374,7 @@ class datetime(dt.datetime): datetime.datetime : Parent class. """ - def __add__(self,other): + def __add__(self, other): if not isinstance(other,dt.timedelta): s = "jdutil.datetime supports '+' only with datetime.timedelta" raise TypeError(s) @@ -383,7 +385,7 @@ def __add__(self,other): return jd_to_datetime(combined) - def __radd__(self,other): + def __radd__(self, other): if not isinstance(other,dt.timedelta): s = "jdutil.datetime supports '+' only with datetime.timedelta" raise TypeError(s) @@ -394,7 +396,7 @@ def __radd__(self,other): return jd_to_datetime(combined) - def __sub__(self,other): + def __sub__(self, other): if isinstance(other,dt.timedelta): days = timedelta_to_days(other) @@ -412,7 +414,7 @@ def __sub__(self,other): s += "datetime.timedelta, jdutil.datetime and datetime.datetime" raise TypeError(s) - def __rsub__(self,other): + def __rsub__(self, other): if not isinstance(other, (datetime,dt.datetime)): s = "jdutil.datetime supports '-' with: " s += "jdutil.datetime and datetime.datetime" diff --git a/edtf/natlang/en.py b/edtf/natlang/en.py index 4f68f21..8cb72c4 100644 --- a/edtf/natlang/en.py +++ b/edtf/natlang/en.py @@ -36,6 +36,11 @@ APPROX_CE_RE = re.compile(r'\b(ca?\.?) ?(\d{1,4}) (ad|ce|bc|bce)') UNCERTAIN_CE_RE = re.compile(r'(\d{1,4}) (ad|ce|bc|bce)\?') +MENTIONS_YEAR = re.compile(r'\byear\b.+(in|during)\b') +MENTIONS_MONTH = re.compile(r'\bmonth\b.+(in|during)\b') +MENTIONS_DAY = re.compile(r'\bday\b.+(in|during)\b') + + # Set of RE rules that will cause us to abort text processing, since we know # the results will be wrong. @@ -212,9 +217,9 @@ def text_to_edtf_date(text) -> Optional[str]: date2 = dt2.isoformat()[:10] # guess precision of 'unspecified' characters to use - mentions_year = re.findall(r'\byear\b.+(in|during)\b', t) - mentions_month = re.findall(r'\bmonth\b.+(in|during)\b', t) - mentions_day = re.findall(r'\bday\b.+(in|during)\b', t) + mentions_year = re.findall(MENTIONS_YEAR, t) + mentions_month = re.findall(MENTIONS_MONTH, t) + mentions_day = re.findall(MENTIONS_DAY, t) for i in range(len(date1)): # if the given year could be a century (e.g. '1800s') then use diff --git a/edtf/natlang/tests.py b/edtf/natlang/tests.py index ea137d2..d18ec76 100644 --- a/edtf/natlang/tests.py +++ b/edtf/natlang/tests.py @@ -207,8 +207,8 @@ def test_natlang(self): """ for i, o in EXAMPLES: e = text_to_edtf(i) - print("%s => %s" % (i, e)) - self.assertEqual(e, o) + print(f"{i} => {e}") + self.assertEqual(e, o, msg=f"Testing {i}") if __name__ == '__main__': diff --git a/edtf/parser/grammar.py b/edtf/parser/grammar.py index d612c5f..14cb3a4 100644 --- a/edtf/parser/grammar.py +++ b/edtf/parser/grammar.py @@ -282,14 +282,16 @@ def f(toks): edtfParser = level0Expression("level0") ^ level1Expression("level1") ^ level2Expression("level2") -def parse_edtf(str, parseAll=True, fail_silently=False): +def parse_edtf(inp: str, parse_all: bool = True, fail_silently: bool = False): + if not inp: + raise ParseException("You must supply some input text") + try: - if not str: - raise ParseException("You must supply some input text") - p = edtfParser.parseString(str.strip(), parseAll) - if p: - return p[0] + p = edtfParser.parseString(inp.strip(), parse_all) except ParseException as e: if fail_silently: return None raise EDTFParseException(e) + + if p: + return p[0] diff --git a/edtf/parser/parser_classes.py b/edtf/parser/parser_classes.py index b670296..ae7adb4 100644 --- a/edtf/parser/parser_classes.py +++ b/edtf/parser/parser_classes.py @@ -3,6 +3,7 @@ from time import struct_time from datetime import date, datetime from operator import add, sub +from typing import Optional from dateutil.relativedelta import relativedelta @@ -22,7 +23,7 @@ PRECISION_DAY = "day" -def days_in_month(year, month): +def days_in_month(year: int, month: int) -> dict: """ Return the number of days in the given year and month, where month is 1=January to 12=December, and respecting leap years as identified by @@ -85,11 +86,15 @@ def apply_delta(op, time_struct, delta): class EDTFObject(object): """ - Object to attact to a parser to become instantiated when the parser + Object to attach to a parser to become instantiated when the parser completes. """ parser = None + def __init__(self, *args, **kwargs): + errmsg: str = f"{type(self).__name__}.__init__(*{args}, **{kwargs})" + raise NotImplementedError(f"{errmsg} is not implemented.") + @classmethod def set_parser(cls, p): cls.parser = p @@ -99,7 +104,7 @@ def set_parser(cls, p): def parse_action(cls, toks): kwargs = toks.asDict() try: - return cls(**kwargs) # replace the token list with the class + return cls(**kwargs) # replace the token list with the class except Exception as e: print("trying to %s.__init__(**%s)" % (cls.__name__, kwargs)) raise e @@ -109,19 +114,12 @@ def parse(cls, s): return cls.parser.parseString(s)[0] def __repr__(self): - return "%s: '%s'" % (type(self).__name__, str(self)) - - def __init__(self, *args, **kwargs): - str = "%s.__init__(*%s, **%s)" % ( - type(self).__name__, - args, kwargs, - ) - raise NotImplementedError("%s is not implemented." % str) + return f"{type(self).__name__}: '{str(self)}'" def __str__(self): raise NotImplementedError - def _strict_date(self, lean): + def _strict_date(self, lean: str): raise NotImplementedError def lower_strict(self): @@ -130,7 +128,7 @@ def lower_strict(self): def upper_strict(self): return self._strict_date(lean=LATEST) - def _get_fuzzy_padding(self, lean): + def _get_fuzzy_padding(self, lean: str): """ Subclasses should override this to pad based on how precise they are. """ @@ -216,41 +214,40 @@ def __le__(self, other): # (* ************************** Level 0 *************************** *) class Date(EDTFObject): + def __init__(self, year=None, month=None, day=None, **kwargs): + for param in ('date', 'lower', 'upper'): + if param in kwargs: + self.__init__(**kwargs[param]) + return + + self.year = year # Year is required, but sometimes passed in as a 'date' dict. + self.month = month + self.day = day - def set_year(self, y): + def set_year(self, y: int): if y is None: raise AttributeError("Year must not be None") self._year = y - def get_year(self): + def get_year(self) -> int: return self._year year = property(get_year, set_year) - def set_month(self, m): + def set_month(self, m: Optional[int]): self._month = m - if m == None: + if m is None: self.day = None - def get_month(self): + def get_month(self) -> Optional[int]: return self._month month = property(get_month, set_month) - def __init__(self, year=None, month=None, day=None, **kwargs): - for param in ('date', 'lower', 'upper'): - if param in kwargs: - self.__init__(**kwargs[param]) - return - - self.year = year # Year is required, but sometimes passed in as a 'date' dict. - self.month = month - self.day = day - def __str__(self): r = self.year if self.month: - r += "-%s" % self.month + r += f"-{self.month}" if self.day: - r += "-%s" % self.day + r += f"-{self.day}" return r def isoformat(self, default=date.max): @@ -260,14 +257,14 @@ def isoformat(self, default=date.max): int(self.day or default.day), ) - def _precise_year(self, lean): + def _precise_year(self, lean: str): # Replace any ambiguous characters in the year string with 0s or 9s if lean == EARLIEST: return int(re.sub(r'[xu]', r'0', self.year)) else: return int(re.sub(r'[xu]', r'9', self.year)) - def _precise_month(self, lean): + def _precise_month(self, lean: str): if self.month and self.month != "uu": try: return int(self.month) @@ -276,7 +273,7 @@ def _precise_month(self, lean): else: return 1 if lean == EARLIEST else 12 - def _precise_day(self, lean): + def _precise_day(self, lean: str): if not self.day or self.day == 'uu': if lean == EARLIEST: return 1 @@ -343,7 +340,7 @@ def __init__(self, lower, upper): self.upper = upper def __str__(self): - return "%s/%s" % (self.lower, self.upper) + return f"{self.lower}/{self.upper}" def _strict_date(self, lean): if lean == EARLIEST: @@ -416,8 +413,8 @@ def __str__(self): def _strict_date(self, lean): if self.date == "open": return dt_to_struct_time(date.today()) - if self.date =="unknown": - return None # depends on the other date + if self.date == "unknown": + return None # depends on the other date return self.date._strict_date(lean) def _get_fuzzy_padding(self, lean): @@ -454,12 +451,12 @@ def __init__(self, year): self.year = year def __str__(self): - return "y%s" % self.year + return f"y{self.year}" def _precise_year(self): return int(self.year) - def _strict_date(self, lean): + def _strict_date(self, lean: str): py = self._precise_year() if lean == EARLIEST: return struct_time( @@ -478,30 +475,26 @@ def __init__(self, year, season, **kwargs): self.day = None def __str__(self): - return "%s-%s" % (self.year, self.season) + return f"{self.year}-{self.season}" def _precise_month(self, lean): rng = appsettings.SEASON_MONTHS_RANGE[int(self.season)] if lean == EARLIEST: return rng[0] - else: - return rng[1] + + return rng[1] # (* ************************** Level 2 *************************** *) class PartialUncertainOrApproximate(Date): - - def set_year(self, y): # Year can be None. - self._year = y - year = property(Date.get_year, set_year) - def __init__( self, year=None, month=None, day=None, - year_ua=False, month_ua = False, day_ua = False, - year_month_ua = False, month_day_ua = False, - ssn=None, season_ua=False, all_ua=False + year_ua: Optional[UA] = None, month_ua: Optional[UA] = None, + day_ua: Optional[UA] = None, year_month_ua: Optional[UA] = None, + month_day_ua: Optional[UA] = None, ssn=None, + season_ua: Optional[UA] = None, all_ua: Optional[UA] = None ): self.year = year self.month = month @@ -520,56 +513,60 @@ def __init__( self.all_ua = all_ua def __str__(self): - if self.season_ua: - return "%s%s" % (self.season, self.season_ua) + return f"{self.season}{self.season_ua}" if self.year_ua: - y = "%s%s" % (self.year, self.year_ua) + y = f"{self.year}{self.year_ua}" else: y = str(self.year) if self.month_ua: - m = "(%s)%s" % (self.month, self.month_ua) + m = f"({self.month}){self.month_ua}" else: m = str(self.month) if self.day: if self.day_ua: - d = "(%s)%s" % (self.day, self.day_ua) + d = f"({self.day}){self.day_ua}" else: d = str(self.day) else: d = None if self.year_month_ua: # year/month approximate. No brackets needed. - ym = "%s-%s%s" % (y, m, self.year_month_ua) + ym = f"{y}-{m}{self.year_month_ua}" if d: - result = "%s-%s" % (ym, d) + result = f"{ym}-{d}" else: result = ym + elif self.month_day_ua: - if self.year_ua: # we don't need the brackets round month and day - result = "%s-%s-%s%s" % (y, m, d, self.month_day_ua) + if self.year_ua: # we don't need the brackets round month and day + result = f"{y}-{m}-{d}{self.month_day_ua}" else: - result = "%s-(%s-%s)%s" % (y, m, d, self.month_day_ua) + result = f"{y}-({m}-{d}){self.month_day_ua}" else: if d: - result = "%s-%s-%s" % (y, m, d) + result = f"{y}-{m}-{d}" else: - result = "%s-%s" % (y, m) + result = f"{y}-{m}" if self.all_ua: - result = "(%s)%s" % (result, self.all_ua) + result = f"({result}){self.all_ua}" return result - def _precise_year(self, lean): + def set_year(self, y): # Year can be None. + self._year = y + year = property(Date.get_year, set_year) + + def _precise_year(self, lean: str): if self.season: return self.season._precise_year(lean) return super(PartialUncertainOrApproximate, self)._precise_year(lean) - def _precise_month(self, lean): + def _precise_month(self, lean: str): if self.season: return self.season._precise_month(lean) return super(PartialUncertainOrApproximate, self)._precise_month(lean) @@ -638,7 +635,7 @@ def __init__(self, lower=None, upper=None): self.upper = upper def __str__(self): - return "%s..%s" % (self.lower or '', self.upper or '') + return f"{self.lower or ''}..{self.upper or ''}" class EarlierConsecutives(Consecutives): @@ -650,41 +647,40 @@ class LaterConsecutives(Consecutives): class OneOfASet(EDTFObject): + def __init__(self, *args): + self.objects = args + @classmethod def parse_action(cls, toks): args = [t for t in toks.asList() if isinstance(t, EDTFObject)] return cls(*args) - def __init__(self, *args): - self.objects = args - def __str__(self): - return "[%s]" % (", ".join([str(o) for o in self.objects])) + return f"[{', '.join([str(o) for o in self.objects])}]" - def _strict_date(self, lean): + def _strict_date(self, lean: str): if lean == LATEST: return max([x._strict_date(lean) for x in self.objects]) - else: - return min([x._strict_date(lean) for x in self.objects]) + + return min([x._strict_date(lean) for x in self.objects]) class MultipleDates(EDTFObject): + def __init__(self, *args): + self.objects = args + @classmethod def parse_action(cls, toks): args = [t for t in toks.asList() if isinstance(t, EDTFObject)] return cls(*args) - def __init__(self, *args): - self.objects = args - def __str__(self): - return "{%s}" % (", ".join([str(o) for o in self.objects])) + return f"{{{', '.join([str(o) for o in self.objects])}}}" def _strict_date(self, lean): if lean == LATEST: return max([x._strict_date(lean) for x in self.objects]) - else: - return min([x._strict_date(lean) for x in self.objects]) + return min([x._strict_date(lean) for x in self.objects]) class MaskedPrecision(Date): @@ -695,12 +691,13 @@ class Level2Interval(Level1Interval): def __init__(self, lower, upper): # Check whether incoming lower/upper values are single-item lists, and # if so take just the first item. This works around what I *think* is a - # bug in the grammer that provides us with single-item lists of + # bug in the grammar that provides us with single-item lists of # `PartialUncertainOrApproximate` items for lower/upper values. if isinstance(lower, (tuple, list)) and len(lower) == 1: self.lower = lower[0] else: self.lower = lower + if isinstance(lower, (tuple, list)) and len(upper) == 1: self.upper = upper[0] else: @@ -718,7 +715,7 @@ def _precise_year(self): def get_year(self): if self.precision: - return '%se%sp%s' % (self.base, self.exponent, self.precision) + return f'{self.base}e{self.exponent}p{self.precision}' else: - return '%se%s' % (self.base, self.exponent) + return f'{self.base}e{self.exponent}' year = property(get_year) diff --git a/edtf/parser/tests.py b/edtf/parser/tests.py index f9dde42..77c2ad3 100644 --- a/edtf/parser/tests.py +++ b/edtf/parser/tests.py @@ -3,10 +3,11 @@ from datetime import date from time import struct_time +from pyparsing import ParseException + from edtf.parser.grammar import parse_edtf as parse from edtf.parser.parser_classes import EDTFObject, TIME_EMPTY_TIME, \ TIME_EMPTY_EXTRAS -from edtf.parser.edtf_exceptions import EDTFParseException # Example object types and attributes. # the first item in each tuple is the input EDTF string, and expected parse result. @@ -192,17 +193,30 @@ None, '', 'not a edtf string', - 'y17e7-12-26', # not implemented - '2016-13-08', # wrong day order - '2016-02-39', # out of range + 'y17e7-12-26', # not implemented + '2016-13-08', # wrong day order + '2016-02-39', # out of range '-0000-01-01', # negative zero year ) class TestParsing(unittest.TestCase): + def iso_to_struct_time(self, iso_date): + """ Convert YYYY-mm-dd date strings to time structs """ + if iso_date[0] == '-': + is_negative = True + iso_date = iso_date[1:] + else: + is_negative = False + y, mo, d = [int(i) for i in iso_date.split('-')] + if is_negative: + y *= -1 + return struct_time( + [y, mo, d] + TIME_EMPTY_TIME + TIME_EMPTY_EXTRAS) + def test_non_parsing(self): for i in BAD_EXAMPLES: - self.assertRaises(EDTFParseException, parse, i) + self.assertRaises(ParseException, parse, i) def test_date_values(self): """ @@ -217,13 +231,15 @@ def test_date_values(self): else: o = i - sys.stdout.write("parsing '%s'" % i) + sys.stdout.write(f"parsing '{i}'") f = parse(i) - sys.stdout.write(" => %s()\n" % type(f).__name__) + sys.stdout.write(f" => {type(f).__name__}()\n") self.assertIsInstance(f, EDTFObject) - self.assertEqual(str(f), o) + self.assertEqual(str(f), o, msg=f"Testing {i}") - if len(e) == 5: + if len(e) == 1: + continue + elif len(e) == 5: expected_lower_strict = e[1] expected_upper_strict = e[2] expected_lower_fuzzy = e[3] @@ -243,33 +259,21 @@ def test_date_values(self): expected_upper_strict = e[1] expected_lower_fuzzy = e[1] expected_upper_fuzzy = e[1] - if len(e) == 1: + else: + print(f"Unexpected value {e}; skipping.") continue - def iso_to_struct_time(iso_date): - """ Convert YYYY-mm-dd date strings to time structs """ - if iso_date[0] == '-': - is_negative = True - iso_date = iso_date[1:] - else: - is_negative = False - y, mo, d = [int(i) for i in iso_date.split('-')] - if is_negative: - y *= -1 - return struct_time( - [y, mo, d] + TIME_EMPTY_TIME + TIME_EMPTY_EXTRAS) - # Convert string date representations into `struct_time`s - expected_lower_strict = iso_to_struct_time(expected_lower_strict) - expected_upper_strict = iso_to_struct_time(expected_upper_strict) - expected_lower_fuzzy = iso_to_struct_time(expected_lower_fuzzy) - expected_upper_fuzzy = iso_to_struct_time(expected_upper_fuzzy) + exp_lower_str = self.iso_to_struct_time(expected_lower_strict) + exp_upper_str = self.iso_to_struct_time(expected_upper_strict) + exp_lower_fuzz = self.iso_to_struct_time(expected_lower_fuzzy) + exp_upper_fuzz = self.iso_to_struct_time(expected_upper_fuzzy) try: - self.assertEqual(f.lower_strict(), expected_lower_strict) - self.assertEqual(f.upper_strict(), expected_upper_strict) - self.assertEqual(f.lower_fuzzy(), expected_lower_fuzzy) - self.assertEqual(f.upper_fuzzy(), expected_upper_fuzzy) + self.assertEqual(f.lower_strict(), exp_lower_str) + self.assertEqual(f.upper_strict(), exp_upper_str) + self.assertEqual(f.lower_fuzzy(), exp_lower_fuzz) + self.assertEqual(f.upper_fuzzy(), exp_upper_fuzz) except Exception as x: # Write to stdout for manual debugging, I guess sys.stdout.write(str(x)) From 06ab934befb7a665301587134794ddbc50b60964 Mon Sep 17 00:00:00 2001 From: Andrew Hankinson Date: Wed, 24 Jul 2024 11:18:51 +0200 Subject: [PATCH 08/14] Further optimizations --- edtf/natlang/en.py | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/edtf/natlang/en.py b/edtf/natlang/en.py index 8cb72c4..d7d7b8d 100644 --- a/edtf/natlang/en.py +++ b/edtf/natlang/en.py @@ -1,4 +1,5 @@ """Utilities to derive an EDTF string from an (English) natural language string.""" +import functools from datetime import datetime from typing import Optional @@ -40,15 +41,12 @@ MENTIONS_MONTH = re.compile(r'\bmonth\b.+(in|during)\b') MENTIONS_DAY = re.compile(r'\bday\b.+(in|during)\b') - - # Set of RE rules that will cause us to abort text processing, since we know # the results will be wrong. -REJECT_RULES = ( - re.compile(r'.*dynasty.*'), # Don't parse '23rd Dynasty' to 'uuuu-uu-23' -) +REJECT_RULES = re.compile(r'.*dynasty.*') # Don't parse '23rd Dynasty' to 'uuuu-uu-23' +@functools.lru_cache() def text_to_edtf(text: str) -> Optional[str]: """ Generate EDTF string equivalent of a given natural language date string. @@ -123,7 +121,8 @@ def text_to_edtf(text: str) -> Optional[str]: return result -def text_to_edtf_date(text) -> Optional[str]: +@functools.lru_cache() +def text_to_edtf_date(text: str) -> Optional[str]: """ Return EDTF string equivalent of a given natural language date string. @@ -137,9 +136,8 @@ def text_to_edtf_date(text) -> Optional[str]: t = text.lower() result = '' - for reject_re in REJECT_RULES: - if re.match(reject_re, t): - return None + if re.match(REJECT_RULES, t): + return None # matches on '1800s'. Needs to happen before is_decade. could_be_century: list = re.findall(MIGHT_BE_CENTURY, t) @@ -185,7 +183,6 @@ def text_to_edtf_date(text) -> Optional[str]: else: # try dateutil.parse - try: # parse twice, using different defaults to see what was # parsed and what was guessed. From c9cb56fe7dfcfe3f55ee981106bce7e73e7b7554 Mon Sep 17 00:00:00 2001 From: Andrew Hankinson Date: Mon, 12 Aug 2024 14:27:41 +0200 Subject: [PATCH 09/14] Update gitignore --- .gitignore | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.gitignore b/.gitignore index ba74660..4d58675 100644 --- a/.gitignore +++ b/.gitignore @@ -55,3 +55,5 @@ docs/_build/ # PyBuilder target/ +.idea +.DS_Store From 9e51373eea989f4ea306408138b31ce53bdef1ab Mon Sep 17 00:00:00 2001 From: Andrew Hankinson Date: Tue, 13 Aug 2024 15:01:47 +0200 Subject: [PATCH 10/14] Black formatting, updates --- edtf/natlang/en.py | 101 +++++++++++++++++++++++++-------------------- 1 file changed, 57 insertions(+), 44 deletions(-) diff --git a/edtf/natlang/en.py b/edtf/natlang/en.py index d7d7b8d..191199e 100644 --- a/edtf/natlang/en.py +++ b/edtf/natlang/en.py @@ -14,36 +14,42 @@ DEFAULT_DATE_1 = datetime(1234, 1, 1, 0, 0) DEFAULT_DATE_2 = datetime(5678, 10, 10, 0, 0) -SHORT_YEAR_RE = re.compile(r'(-?)([\du])([\dxu])([\dxu])([\dxu])') -LONG_YEAR_RE = re.compile(r'y(-?)([1-9]\d\d\d\d+)') -CENTURY_RE = re.compile(r'(\d{1,2})(c\.?|(st|nd|rd|th) century)\s?(ad|ce|bc|bce)?') -CENTURY_RANGE = re.compile(r'\b(\d\d)(th|st|nd|rd|)-(\d\d)(th|st|nd|rd) [cC]') -CE_RE = re.compile(r'(\d{1,4}) (ad|ce|bc|bce)') -ONE_DIGIT_PARTIAL_FIRST = re.compile(r'\d\D\b') -TWO_DIGIT_PARTIAL_FIRST = re.compile(r'\d\d\b') -PARTIAL_CHECK = re.compile(r'\b\d\d\d\d$') +SHORT_YEAR_RE = re.compile(r"(-?)([\du])([\dxu])([\dxu])([\dxu])") +LONG_YEAR_RE = re.compile(r"y(-?)([1-9]\d\d\d\d+)") +CENTURY_RE = re.compile(r"(\d{1,2})(c\.?|(st|nd|rd|th) century)\s?(ad|ce|bc|bce)?") +CENTURY_RANGE = re.compile(r"\b(\d\d)(th|st|nd|rd|)-(\d\d)(th|st|nd|rd) [cC]") +CE_RE = re.compile(r"(\d{1,4}) (ad|ce|bc|bce)") +ONE_DIGIT_PARTIAL_FIRST = re.compile(r"\d\D\b") +TWO_DIGIT_PARTIAL_FIRST = re.compile(r"\d\d\b") +PARTIAL_CHECK = re.compile(r"\b\d\d\d\d$") SLASH_YEAR = re.compile(r"(\d\d\d\d)/(\d\d\d\d)") BEFORE_CHECK = re.compile(r"\b(?:before|earlier|avant)\b") AFTER_CHECK = re.compile(r"\b(after|since|later|aprés|apres)\b") -APPROX_CHECK = re.compile(r'\b(?:ca?\.? ?\d{4}|circa|approx|approximately|around|about|~\d{3,4})|(?:^~)') +APPROX_CHECK = re.compile( + r"\b(?:ca?\.? ?\d{4}|circa|approx|approximately|around|about|~\d{3,4})|(?:^~)" +) UNCERTAIN_CHECK = re.compile(r"\b(?:uncertain|possibly|maybe|guess|\d{3,4}\?)") -UNCERTAIN_REPL = re.compile(r'(\d{4})\?') -MIGHT_BE_CENTURY = re.compile(r'(\d{2}00)s') -MIGHT_BE_DECADE = re.compile(r'(\d{3}0)s') +UNCERTAIN_REPL = re.compile(r"(\d{4})\?") +MIGHT_BE_CENTURY = re.compile(r"(\d{2}00)s") +MIGHT_BE_DECADE = re.compile(r"(\d{3}0)s") -APPROX_CENTURY_RE = re.compile(r'\b(ca?\.?) ?(\d{1,2})(c\.?|(st|nd|rd|th) century)\s?(ad|ce|bc|bce)?') -UNCERTAIN_CENTURY_RE = re.compile(r'(\d{1,2})(c\.?|(st|nd|rd|th) century)\s?(ad|ce|bc|bce)?\?') +APPROX_CENTURY_RE = re.compile( + r"\b(ca?\.?) ?(\d{1,2})(c\.?|(st|nd|rd|th) century)\s?(ad|ce|bc|bce)?" +) +UNCERTAIN_CENTURY_RE = re.compile( + r"(\d{1,2})(c\.?|(st|nd|rd|th) century)\s?(ad|ce|bc|bce)?\?" +) -APPROX_CE_RE = re.compile(r'\b(ca?\.?) ?(\d{1,4}) (ad|ce|bc|bce)') -UNCERTAIN_CE_RE = re.compile(r'(\d{1,4}) (ad|ce|bc|bce)\?') +APPROX_CE_RE = re.compile(r"\b(ca?\.?) ?(\d{1,4}) (ad|ce|bc|bce)") +UNCERTAIN_CE_RE = re.compile(r"(\d{1,4}) (ad|ce|bc|bce)\?") -MENTIONS_YEAR = re.compile(r'\byear\b.+(in|during)\b') -MENTIONS_MONTH = re.compile(r'\bmonth\b.+(in|during)\b') -MENTIONS_DAY = re.compile(r'\bday\b.+(in|during)\b') +MENTIONS_YEAR = re.compile(r"\byear\b.+(in|during)\b") +MENTIONS_MONTH = re.compile(r"\bmonth\b.+(in|during)\b") +MENTIONS_DAY = re.compile(r"\bday\b.+(in|during)\b") # Set of RE rules that will cause us to abort text processing, since we know # the results will be wrong. -REJECT_RULES = re.compile(r'.*dynasty.*') # Don't parse '23rd Dynasty' to 'uuuu-uu-23' +REJECT_RULES = re.compile(r".*dynasty.*") # Don't parse '23rd Dynasty' to 'uuuu-uu-23' @functools.lru_cache() @@ -57,16 +63,16 @@ def text_to_edtf(text: str) -> Optional[str]: t = text.lower() # try parsing the whole thing - result = text_to_edtf_date(t) + result: Optional[str] = text_to_edtf_date(t) if not result: # split by list delims and move fwd with the first thing that returns a non-empty string. # TODO: assemble multiple dates into a {} or [] structure. for split in [",", ";", "or"]: for list_item in t.split(split): - # try parsing as an interval - split by '-' - toks = list_item.split("-") + toks: list[str] = list_item.split("-") + if len(toks) == 2: d1 = toks[0].strip() d2 = toks[1].strip() @@ -74,10 +80,16 @@ def text_to_edtf(text: str) -> Optional[str]: # match looks from the beginning of the string, search # looks anywhere. - if re.match(ONE_DIGIT_PARTIAL_FIRST, d2): # 1-digit year partial e.g. 1868-9 - if re.search(PARTIAL_CHECK, d1): # TODO: evaluate it and see if it's a year + if re.match( + ONE_DIGIT_PARTIAL_FIRST, d2 + ): # 1-digit year partial e.g. 1868-9 + if re.search( + PARTIAL_CHECK, d1 + ): # TODO: evaluate it and see if it's a year d2 = d1[-4:-1] + d2 - elif re.match(TWO_DIGIT_PARTIAL_FIRST, d2): # 2-digit year partial e.g. 1809-10 + elif re.match( + TWO_DIGIT_PARTIAL_FIRST, d2 + ): # 2-digit year partial e.g. 1809-10 if re.search(PARTIAL_CHECK, d1): d2 = d1[-4:-2] + d2 else: @@ -134,7 +146,7 @@ def text_to_edtf_date(text: str) -> Optional[str]: return None t = text.lower() - result = '' + result: str = "" if re.match(REJECT_RULES, t): return None @@ -143,7 +155,7 @@ def text_to_edtf_date(text: str) -> Optional[str]: could_be_century: list = re.findall(MIGHT_BE_CENTURY, t) # matches on '1800s' and '1910s'. Removes the 's'. # Needs to happen before is_uncertain because e.g. "1860s?" - t, is_decade = re.subn(MIGHT_BE_DECADE, r'\1', t) + t, is_decade = re.subn(MIGHT_BE_DECADE, r"\1", t) # detect approximation signifiers # a few 'circa' abbreviations just before the year @@ -151,7 +163,7 @@ def text_to_edtf_date(text: str) -> Optional[str]: # the word 'circa' anywhere # detect uncertainty signifiers - t, is_uncertain = re.subn(UNCERTAIN_REPL, r'\1', t) + t, is_uncertain = re.subn(UNCERTAIN_REPL, r"\1", t) is_uncertain = is_uncertain or re.findall(UNCERTAIN_CHECK, t) # detect century forms @@ -191,7 +203,7 @@ def text_to_edtf_date(text: str) -> Optional[str]: dayfirst=appsettings.DAY_FIRST, yearfirst=False, fuzzy=True, # force a match, even if it's default date - default=DEFAULT_DATE_1 + default=DEFAULT_DATE_1, ) dt2 = parse( @@ -199,14 +211,13 @@ def text_to_edtf_date(text: str) -> Optional[str]: dayfirst=appsettings.DAY_FIRST, yearfirst=False, fuzzy=True, # force a match, even if it's default date - default=DEFAULT_DATE_2 + default=DEFAULT_DATE_2, ) except ValueError: return None - if dt1.date() == DEFAULT_DATE_1.date() and \ - dt2.date() == DEFAULT_DATE_2.date(): + if dt1.date() == DEFAULT_DATE_1.date() and dt2.date() == DEFAULT_DATE_2.date(): # couldn't parse anything - defaults are untouched. return None @@ -223,12 +234,12 @@ def text_to_edtf_date(text: str) -> Optional[str]: # approximate/uncertain markers to decide whether we treat it as # a century or a decade. if i == 2 and could_be_century and not (is_approximate or is_uncertain): - result += 'x' + result += "x" elif i == 3 and is_decade: if mentions_year: - result += 'u' # year precision + result += "X" # year precision else: - result += 'x' # decade precision + result += "x" # decade precision elif date1[i] == date2[i]: # since both attempts at parsing produced the same result # it must be parsed value, not a default @@ -236,12 +247,12 @@ def text_to_edtf_date(text: str) -> Optional[str]: else: # different values were produced, meaning that it's likely # a default. Use 'unspecified' - result += "u" + result += "X" # strip off unknown chars from end of string - except the first 4 for i in reversed(range(len(result))): - if result[i] not in ('u', 'x', '-'): + if result[i] not in ("X", "-"): smallest_length = 4 if mentions_month: @@ -265,14 +276,16 @@ def text_to_edtf_date(text: str) -> Optional[str]: # end dateutil post-parsing - if is_uncertain: - result += "?" - - if is_approximate: - result += "~" + if is_uncertain and is_approximate: + result += "%" + else: + if is_uncertain: + result += "?" + if is_approximate: + result += "~" # weed out bad parses - if result.startswith("uu-uu"): + if result.startswith("XX-XX"): return None return result From 1aa53cfb2d4e0a2a3c284ec20db60f841b88a7f9 Mon Sep 17 00:00:00 2001 From: Andrew Hankinson Date: Tue, 13 Aug 2024 15:03:16 +0200 Subject: [PATCH 11/14] Update imports --- edtf/natlang/en.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/edtf/natlang/en.py b/edtf/natlang/en.py index 191199e..ba192e8 100644 --- a/edtf/natlang/en.py +++ b/edtf/natlang/en.py @@ -1,12 +1,12 @@ """Utilities to derive an EDTF string from an (English) natural language string.""" import functools +import re from datetime import datetime from typing import Optional from dateutil.parser import parse -import re -from edtf import appsettings +from edtf import appsettings # two dates where every digit of an ISO date representation is different, # and one is in the past and one is in the future. From 8c4f9685bc31224bcd0efcf811485f2e3f34e292 Mon Sep 17 00:00:00 2001 From: Andrew Hankinson Date: Tue, 13 Aug 2024 16:48:01 +0200 Subject: [PATCH 12/14] Merge fixes --- edtf/natlang/en.py | 18 ++++++++++-------- edtf/parser/parser_classes.py | 1 + 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/edtf/natlang/en.py b/edtf/natlang/en.py index ba192e8..49b04f3 100644 --- a/edtf/natlang/en.py +++ b/edtf/natlang/en.py @@ -4,7 +4,7 @@ from datetime import datetime from typing import Optional -from dateutil.parser import parse +from dateutil.parser import ParserError, parse from edtf import appsettings @@ -126,9 +126,9 @@ def text_to_edtf(text: str) -> Optional[str]: is_after = re.findall(AFTER_CHECK, t) if is_before: - result = f"unknown/{result}" + result = f"/{result}" elif is_after: - result = f"{result}/unknown" + result = f"{result}/" return result @@ -172,7 +172,7 @@ def text_to_edtf_date(text: str) -> Optional[str]: # detect CE/BCE year form is_ce = re.findall(CE_RE, t) if is_century: - result = "%02dxx" % (int(is_century[0][0]) - 1,) + result = "%02dXX" % (int(is_century[0][0]) - 1,) is_approximate = is_approximate or re.findall(APPROX_CENTURY_RE, t) is_uncertain = is_uncertain or re.findall(UNCERTAIN_CENTURY_RE, t) @@ -214,8 +214,10 @@ def text_to_edtf_date(text: str) -> Optional[str]: default=DEFAULT_DATE_2, ) - except ValueError: - return None + except ParserError: + return + except Exception: + return if dt1.date() == DEFAULT_DATE_1.date() and dt2.date() == DEFAULT_DATE_2.date(): # couldn't parse anything - defaults are untouched. @@ -234,12 +236,12 @@ def text_to_edtf_date(text: str) -> Optional[str]: # approximate/uncertain markers to decide whether we treat it as # a century or a decade. if i == 2 and could_be_century and not (is_approximate or is_uncertain): - result += "x" + result += "X" elif i == 3 and is_decade: if mentions_year: result += "X" # year precision else: - result += "x" # decade precision + result += "X" # decade precision elif date1[i] == date2[i]: # since both attempts at parsing produced the same result # it must be parsed value, not a default diff --git a/edtf/parser/parser_classes.py b/edtf/parser/parser_classes.py index d103660..eada1f9 100644 --- a/edtf/parser/parser_classes.py +++ b/edtf/parser/parser_classes.py @@ -4,6 +4,7 @@ from datetime import date, datetime from operator import add, sub from time import struct_time +from typing import Optional from dateutil.relativedelta import relativedelta From 6f08bce95cb583f2825353cbe8ae6a1de1c47df7 Mon Sep 17 00:00:00 2001 From: Andrew Hankinson Date: Tue, 13 Aug 2024 16:55:59 +0200 Subject: [PATCH 13/14] ruff formatting --- edtf/natlang/en.py | 5 +++-- edtf/parser/parser_classes.py | 9 ++++----- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/edtf/natlang/en.py b/edtf/natlang/en.py index 49b04f3..97230db 100644 --- a/edtf/natlang/en.py +++ b/edtf/natlang/en.py @@ -1,4 +1,5 @@ """Utilities to derive an EDTF string from an (English) natural language string.""" + import functools import re from datetime import datetime @@ -52,7 +53,7 @@ REJECT_RULES = re.compile(r".*dynasty.*") # Don't parse '23rd Dynasty' to 'uuuu-uu-23' -@functools.lru_cache() +@functools.lru_cache def text_to_edtf(text: str) -> Optional[str]: """ Generate EDTF string equivalent of a given natural language date string. @@ -133,7 +134,7 @@ def text_to_edtf(text: str) -> Optional[str]: return result -@functools.lru_cache() +@functools.lru_cache def text_to_edtf_date(text: str) -> Optional[str]: """ Return EDTF string equivalent of a given natural language date string. diff --git a/edtf/parser/parser_classes.py b/edtf/parser/parser_classes.py index eada1f9..ad690fb 100644 --- a/edtf/parser/parser_classes.py +++ b/edtf/parser/parser_classes.py @@ -98,10 +98,6 @@ class EDTFObject: parser = None - def __init__(self, *args, **kwargs): - errmsg: str = f"{type(self).__name__}.__init__(*{args}, **{kwargs})" - raise NotImplementedError(f"{errmsg} is not implemented.") - @classmethod def set_parser(cls, p): cls.parser = p @@ -288,6 +284,7 @@ def set_year(self, y: int): def get_year(self) -> int: return self._year + year = property(get_year, set_year) def set_month(self, m: Optional[int]): @@ -297,6 +294,7 @@ def set_month(self, m: Optional[int]): def get_month(self) -> Optional[int]: return self._month + month = property(get_month, set_month) def __str__(self): @@ -932,8 +930,9 @@ def __str__(self): return result - def set_year(self, y): # Year can be None. + def set_year(self, y): # Year can be None. self._year = y + year = property(Date.get_year, set_year) def _precise_year(self, lean: str): From 51255e3e0d82ed374b91cd10a96bf1afede056d2 Mon Sep 17 00:00:00 2001 From: Andrew Hankinson Date: Tue, 13 Aug 2024 17:03:31 +0200 Subject: [PATCH 14/14] Run benchmarks --- .github/workflows/ci.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 4645d13..7645ec9 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -100,7 +100,7 @@ jobs: - name: Publish benchmark results uses: benchmark-action/github-action-benchmark@v1 - if: github.event_name == 'pull_request' && github.repository == 'ixc/python-edtf' + if: github.event_name == 'pull_request' && github.repository == 'rism-digital/python-edtf' with: tool: 'pytest' auto-push: true @@ -112,7 +112,7 @@ jobs: summary-always: true - name: Comment on benchmark results without publishing - if: github.event_name != 'pull_request' || github.repository != 'ixc/python-edtf' + if: github.event_name != 'pull_request' || github.repository != 'rism-digital/python-edtf' uses: benchmark-action/github-action-benchmark@v1 with: tool: 'pytest'